Vitaly Buka | cbed206 | 2015-08-17 12:54:05 -0700 | [diff] [blame] | 1 | // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
| 2 | // Use of this source code is governed by a BSD-style license that can be |
| 3 | // found in the LICENSE file. |
| 4 | |
| 5 | #ifndef BASE_JSON_JSON_PARSER_H_ |
| 6 | #define BASE_JSON_JSON_PARSER_H_ |
| 7 | |
| 8 | #include <string> |
| 9 | |
| 10 | #include "base/base_export.h" |
| 11 | #include "base/basictypes.h" |
| 12 | #include "base/compiler_specific.h" |
Vitaly Buka | 8750b27 | 2015-08-18 18:39:08 -0700 | [diff] [blame] | 13 | #include "base/gtest_prod_util.h" |
Vitaly Buka | cbed206 | 2015-08-17 12:54:05 -0700 | [diff] [blame] | 14 | #include "base/json/json_reader.h" |
| 15 | #include "base/strings/string_piece.h" |
| 16 | |
Vitaly Buka | cbed206 | 2015-08-17 12:54:05 -0700 | [diff] [blame] | 17 | namespace base { |
| 18 | class Value; |
| 19 | } |
| 20 | |
Vitaly Buka | cbed206 | 2015-08-17 12:54:05 -0700 | [diff] [blame] | 21 | namespace base { |
| 22 | namespace internal { |
| 23 | |
| 24 | class JSONParserTest; |
| 25 | |
| 26 | // The implementation behind the JSONReader interface. This class is not meant |
| 27 | // to be used directly; it encapsulates logic that need not be exposed publicly. |
| 28 | // |
| 29 | // This parser guarantees O(n) time through the input string. It also optimizes |
| 30 | // base::StringValue by using StringPiece where possible when returning Value |
| 31 | // objects by using "hidden roots," discussed in the implementation. |
| 32 | // |
| 33 | // Iteration happens on the byte level, with the functions CanConsume and |
| 34 | // NextChar. The conversion from byte to JSON token happens in |
| 35 | // GetNextToken/ParseToken without advancing the parser; that is, |
| 36 | // tokenization operates on the current parser position. |
| 37 | // |
| 38 | // Built on top of these are a family of Consume functions that iterate |
| 39 | // internally. Invariant: on entry of a Consume function, the parser is wound |
| 40 | // to the first byte of a valid JSON token. On exit, it is on the last byte |
| 41 | // of a token, such that the next iteration of the parser will be at the byte |
| 42 | // immediately following the token, which would likely be the first byte of the |
| 43 | // next token. |
| 44 | class BASE_EXPORT_PRIVATE JSONParser { |
| 45 | public: |
| 46 | explicit JSONParser(int options); |
| 47 | ~JSONParser(); |
| 48 | |
| 49 | // Parses the input string according to the set options and returns the |
| 50 | // result as a Value owned by the caller. |
| 51 | Value* Parse(const StringPiece& input); |
| 52 | |
| 53 | // Returns the error code. |
| 54 | JSONReader::JsonParseError error_code() const; |
| 55 | |
| 56 | // Returns the human-friendly error message. |
| 57 | std::string GetErrorMessage() const; |
| 58 | |
| 59 | private: |
| 60 | enum Token { |
| 61 | T_OBJECT_BEGIN, // { |
| 62 | T_OBJECT_END, // } |
| 63 | T_ARRAY_BEGIN, // [ |
| 64 | T_ARRAY_END, // ] |
| 65 | T_STRING, |
| 66 | T_NUMBER, |
| 67 | T_BOOL_TRUE, // true |
| 68 | T_BOOL_FALSE, // false |
| 69 | T_NULL, // null |
| 70 | T_LIST_SEPARATOR, // , |
| 71 | T_OBJECT_PAIR_SEPARATOR, // : |
| 72 | T_END_OF_INPUT, |
| 73 | T_INVALID_TOKEN, |
| 74 | }; |
| 75 | |
| 76 | // A helper class used for parsing strings. One optimization performed is to |
| 77 | // create base::Value with a StringPiece to avoid unnecessary std::string |
| 78 | // copies. This is not possible if the input string needs to be decoded from |
| 79 | // UTF-16 to UTF-8, or if an escape sequence causes characters to be skipped. |
| 80 | // This class centralizes that logic. |
| 81 | class StringBuilder { |
| 82 | public: |
| 83 | // Empty constructor. Used for creating a builder with which to Swap(). |
| 84 | StringBuilder(); |
| 85 | |
| 86 | // |pos| is the beginning of an input string, excluding the |"|. |
| 87 | explicit StringBuilder(const char* pos); |
| 88 | |
| 89 | ~StringBuilder(); |
| 90 | |
| 91 | // Swaps the contents of |other| with this. |
| 92 | void Swap(StringBuilder* other); |
| 93 | |
| 94 | // Either increases the |length_| of the string or copies the character if |
| 95 | // the StringBuilder has been converted. |c| must be in the basic ASCII |
| 96 | // plane; all other characters need to be in UTF-8 units, appended with |
| 97 | // AppendString below. |
| 98 | void Append(const char& c); |
| 99 | |
| 100 | // Appends |str| to the std::string. The builder must be Convert()ed first. |
| 101 | void AppendString(const std::string& str); |
| 102 | |
| 103 | // Converts the builder from its default StringPiece to a full std::string, |
| 104 | // performing a copy. Once a builder is converted, it cannot be made a |
| 105 | // StringPiece again. |
| 106 | void Convert(); |
| 107 | |
| 108 | // Returns whether the builder can be converted to a StringPiece. |
| 109 | bool CanBeStringPiece() const; |
| 110 | |
| 111 | // Returns the StringPiece representation. Returns an empty piece if it |
| 112 | // cannot be converted. |
| 113 | StringPiece AsStringPiece(); |
| 114 | |
| 115 | // Returns the builder as a std::string. |
| 116 | const std::string& AsString(); |
| 117 | |
| 118 | private: |
| 119 | // The beginning of the input string. |
| 120 | const char* pos_; |
| 121 | |
| 122 | // Number of bytes in |pos_| that make up the string being built. |
| 123 | size_t length_; |
| 124 | |
| 125 | // The copied string representation. NULL until Convert() is called. |
| 126 | // Strong (owning) pointer. scoped_ptr<T> has too much of an overhead here. |
| 127 | std::string* string_; |
| 128 | }; |
| 129 | |
| 130 | // Quick check that the stream has capacity to consume |length| more bytes. |
| 131 | bool CanConsume(int length); |
| 132 | |
| 133 | // The basic way to consume a single character in the stream. Consumes one |
| 134 | // byte of the input stream and returns a pointer to the rest of it. |
| 135 | const char* NextChar(); |
| 136 | |
| 137 | // Performs the equivalent of NextChar() |n| times. |
| 138 | void NextNChars(int n); |
| 139 | |
| 140 | // Skips over whitespace and comments to find the next token in the stream. |
| 141 | // This does not advance the parser past non-whitespace, non-comment chars. |
| 142 | Token GetNextToken(); |
| 143 | |
| 144 | // Consumes whitespace characters and comments until a character that is |
| 145 | // neither is encountered. |
| 146 | void EatWhitespaceAndComments(); |
| 147 | // Helper function that consumes a comment, assuming that the parser is |
| 148 | // currently wound to a '/'. |
| 149 | bool EatComment(); |
| 150 | |
| 151 | // Calls GetNextToken() and then ParseToken(). Caller owns the result. |
| 152 | Value* ParseNextToken(); |
| 153 | |
| 154 | // Takes a token that represents the start of a Value ("a structural token" |
| 155 | // in RFC terms) and consumes it, returning the result as an object the |
| 156 | // caller owns. |
| 157 | Value* ParseToken(Token token); |
| 158 | |
| 159 | // Assuming that the parser is currently wound to '{', this parses a JSON |
| 160 | // object into a DictionaryValue. |
| 161 | Value* ConsumeDictionary(); |
| 162 | |
| 163 | // Assuming that the parser is wound to '[', this parses a JSON list into a |
| 164 | // ListValue. |
| 165 | Value* ConsumeList(); |
| 166 | |
| 167 | // Calls through ConsumeStringRaw and wraps it in a value. |
| 168 | Value* ConsumeString(); |
| 169 | |
| 170 | // Assuming that the parser is wound to a double quote, this parses a string, |
| 171 | // decoding any escape sequences and converts UTF-16 to UTF-8. Returns true on |
| 172 | // success and Swap()s the result into |out|. Returns false on failure with |
| 173 | // error information set. |
| 174 | bool ConsumeStringRaw(StringBuilder* out); |
| 175 | // Helper function for ConsumeStringRaw() that consumes the next four or 10 |
| 176 | // bytes (parser is wound to the first character of a HEX sequence, with the |
| 177 | // potential for consuming another \uXXXX for a surrogate). Returns true on |
| 178 | // success and places the UTF8 code units in |dest_string|, and false on |
| 179 | // failure. |
| 180 | bool DecodeUTF16(std::string* dest_string); |
| 181 | // Helper function for ConsumeStringRaw() that takes a single code point, |
| 182 | // decodes it into UTF-8 units, and appends it to the given builder. The |
| 183 | // point must be valid. |
| 184 | void DecodeUTF8(const int32& point, StringBuilder* dest); |
| 185 | |
| 186 | // Assuming that the parser is wound to the start of a valid JSON number, |
| 187 | // this parses and converts it to either an int or double value. |
| 188 | Value* ConsumeNumber(); |
| 189 | // Helper that reads characters that are ints. Returns true if a number was |
| 190 | // read and false on error. |
| 191 | bool ReadInt(bool allow_leading_zeros); |
| 192 | |
| 193 | // Consumes the literal values of |true|, |false|, and |null|, assuming the |
| 194 | // parser is wound to the first character of any of those. |
| 195 | Value* ConsumeLiteral(); |
| 196 | |
| 197 | // Compares two string buffers of a given length. |
| 198 | static bool StringsAreEqual(const char* left, const char* right, size_t len); |
| 199 | |
| 200 | // Sets the error information to |code| at the current column, based on |
| 201 | // |index_| and |index_last_line_|, with an optional positive/negative |
| 202 | // adjustment by |column_adjust|. |
| 203 | void ReportError(JSONReader::JsonParseError code, int column_adjust); |
| 204 | |
| 205 | // Given the line and column number of an error, formats one of the error |
| 206 | // message constants from json_reader.h for human display. |
| 207 | static std::string FormatErrorMessage(int line, int column, |
| 208 | const std::string& description); |
| 209 | |
| 210 | // base::JSONParserOptions that control parsing. |
| 211 | int options_; |
| 212 | |
| 213 | // Pointer to the start of the input data. |
| 214 | const char* start_pos_; |
| 215 | |
| 216 | // Pointer to the current position in the input data. Equivalent to |
| 217 | // |start_pos_ + index_|. |
| 218 | const char* pos_; |
| 219 | |
| 220 | // Pointer to the last character of the input data. |
| 221 | const char* end_pos_; |
| 222 | |
| 223 | // The index in the input stream to which the parser is wound. |
| 224 | int index_; |
| 225 | |
| 226 | // The number of times the parser has recursed (current stack depth). |
| 227 | int stack_depth_; |
| 228 | |
| 229 | // The line number that the parser is at currently. |
| 230 | int line_number_; |
| 231 | |
| 232 | // The last value of |index_| on the previous line. |
| 233 | int index_last_line_; |
| 234 | |
| 235 | // Error information. |
| 236 | JSONReader::JsonParseError error_code_; |
| 237 | int error_line_; |
| 238 | int error_column_; |
| 239 | |
| 240 | friend class JSONParserTest; |
| 241 | FRIEND_TEST_ALL_PREFIXES(JSONParserTest, NextChar); |
| 242 | FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeDictionary); |
| 243 | FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeList); |
| 244 | FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeString); |
| 245 | FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeLiterals); |
| 246 | FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeNumbers); |
| 247 | FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ErrorMessages); |
| 248 | |
| 249 | DISALLOW_COPY_AND_ASSIGN(JSONParser); |
| 250 | }; |
| 251 | |
| 252 | } // namespace internal |
| 253 | } // namespace base |
| 254 | |
| 255 | #endif // BASE_JSON_JSON_PARSER_H_ |