Vitaly Buka | cbed206 | 2015-08-17 12:54:05 -0700 | [diff] [blame] | 1 | // Copyright 2013 The Chromium Authors. All rights reserved. |
| 2 | // Use of this source code is governed by a BSD-style license that can be |
| 3 | // found in the LICENSE file. |
| 4 | |
| 5 | #include "base/strings/string_util.h" |
| 6 | |
Alex Vakulenko | 674f0eb | 2016-01-20 08:10:48 -0800 | [diff] [blame] | 7 | #include <stdint.h> |
| 8 | #include <limits> |
| 9 | #include "base/macros.h" |
Vitaly Buka | cbed206 | 2015-08-17 12:54:05 -0700 | [diff] [blame] | 10 | #include "base/strings/utf_string_conversion_utils.h" |
Vitaly Buka | cbed206 | 2015-08-17 12:54:05 -0700 | [diff] [blame] | 11 | #include "base/third_party/icu/icu_utf.h" |
Vitaly Buka | cbed206 | 2015-08-17 12:54:05 -0700 | [diff] [blame] | 12 | |
Alex Vakulenko | 674f0eb | 2016-01-20 08:10:48 -0800 | [diff] [blame] | 13 | namespace base { |
| 14 | |
namespace {

// Unsigned integer with the width of a pointer. DoIsStringASCII() reads its
// input in units of this type.
typedef uintptr_t MachineWord;

// Bit mask covering the low-order address bits below sizeof(MachineWord).
// The helpers below treat an address as word-aligned when all of these bits
// are zero (this presumes sizeof(MachineWord) is a power of two).
const uintptr_t kMachineWordAlignmentMask = sizeof(MachineWord) - 1;

// Returns true when none of the low-order alignment bits are set in the
// address held by |pointer|.
inline bool IsAlignedToMachineWord(const void* pointer) {
  const MachineWord address = reinterpret_cast<MachineWord>(pointer);
  return (address & kMachineWordAlignmentMask) == 0;
}

// Returns |pointer| with its low-order alignment bits cleared, i.e. the
// address rounded down to a MachineWord boundary. The pointer is only
// recomputed here, never dereferenced.
template <typename T>
inline T* AlignToMachineWord(T* pointer) {
  const MachineWord address = reinterpret_cast<MachineWord>(pointer);
  return reinterpret_cast<T*>(address & ~kMachineWordAlignmentMask);
}

// Supplies, per (word size in bytes, character type) pair, a constant with
// the top bit of every byte set. Only the specializations below exist; any
// other combination is left undefined and fails to compile when used.
template <size_t size, typename CharacterType>
struct NonASCIIMask;

// 4-byte word of |char|: bit 7 of each of the four bytes.
template <>
struct NonASCIIMask<4, char> {
  static inline uint32_t value() { return 0x80808080U; }
};

// 8-byte word of |char|: bit 7 of each of the eight bytes.
template <>
struct NonASCIIMask<8, char> {
  static inline uint64_t value() { return 0x8080808080808080ULL; }
};

}  // namespace
Alex Vakulenko | 674f0eb | 2016-01-20 08:10:48 -0800 | [diff] [blame] | 38 | namespace { |
Vitaly Buka | cbed206 | 2015-08-17 12:54:05 -0700 | [diff] [blame] | 39 | |
Alex Vakulenko | 674f0eb | 2016-01-20 08:10:48 -0800 | [diff] [blame] | 40 | template<typename StringType> |
| 41 | StringType ToLowerASCIIImpl(BasicStringPiece<StringType> str) { |
| 42 | StringType ret; |
| 43 | ret.reserve(str.size()); |
| 44 | for (size_t i = 0; i < str.size(); i++) |
| 45 | ret.push_back(ToLowerASCII(str[i])); |
| 46 | return ret; |
| 47 | } |
| 48 | |
| 49 | template<typename StringType> |
| 50 | StringType ToUpperASCIIImpl(BasicStringPiece<StringType> str) { |
| 51 | StringType ret; |
| 52 | ret.reserve(str.size()); |
| 53 | for (size_t i = 0; i < str.size(); i++) |
| 54 | ret.push_back(ToUpperASCII(str[i])); |
| 55 | return ret; |
| 56 | } |
| 57 | |
| 58 | } // namespace |
| 59 | |
| 60 | std::string ToLowerASCII(StringPiece str) { |
| 61 | return ToLowerASCIIImpl<std::string>(str); |
| 62 | } |
| 63 | |
| 64 | std::string ToUpperASCII(StringPiece str) { |
| 65 | return ToUpperASCIIImpl<std::string>(str); |
| 66 | } |
| 67 | |
| 68 | template<class StringType> |
| 69 | int CompareCaseInsensitiveASCIIT(BasicStringPiece<StringType> a, |
| 70 | BasicStringPiece<StringType> b) { |
| 71 | // Find the first characters that aren't equal and compare them. If the end |
| 72 | // of one of the strings is found before a nonequal character, the lengths |
| 73 | // of the strings are compared. |
| 74 | size_t i = 0; |
| 75 | while (i < a.length() && i < b.length()) { |
| 76 | typename StringType::value_type lower_a = ToLowerASCII(a[i]); |
| 77 | typename StringType::value_type lower_b = ToLowerASCII(b[i]); |
| 78 | if (lower_a < lower_b) |
| 79 | return -1; |
| 80 | if (lower_a > lower_b) |
| 81 | return 1; |
| 82 | i++; |
| 83 | } |
| 84 | |
| 85 | // End of one string hit before finding a different character. Expect the |
| 86 | // common case to be "strings equal" at this point so check that first. |
| 87 | if (a.length() == b.length()) |
| 88 | return 0; |
| 89 | |
| 90 | if (a.length() < b.length()) |
| 91 | return -1; |
| 92 | return 1; |
| 93 | } |
| 94 | |
| 95 | int CompareCaseInsensitiveASCII(StringPiece a, StringPiece b) { |
| 96 | return CompareCaseInsensitiveASCIIT<std::string>(a, b); |
| 97 | } |
| 98 | |
| 99 | bool EqualsCaseInsensitiveASCII(StringPiece a, StringPiece b) { |
| 100 | if (a.length() != b.length()) |
| 101 | return false; |
| 102 | return CompareCaseInsensitiveASCIIT<std::string>(a, b) == 0; |
| 103 | } |
Vitaly Buka | cbed206 | 2015-08-17 12:54:05 -0700 | [diff] [blame] | 104 | |
// Copies |input| into |*output|, then replaces every character of the copy
// that appears in |replace_chars| with the whole of |replace_with|.
//
// Returns true if at least one replacement was made. Inserted text is never
// re-scanned, so |replace_with| may itself contain characters from
// |replace_chars|. An empty |replace_with| deletes the matched characters.
template <typename STR>
bool ReplaceCharsT(const STR& input,
                   const STR& replace_chars,
                   const STR& replace_with,
                   STR* output) {
  // Distance to skip after each substitution so the search resumes just past
  // the text that was inserted.
  const size_t resume_offset = replace_with.length();

  *output = input;

  bool any_replaced = false;
  for (size_t hit = output->find_first_of(replace_chars); hit != STR::npos;
       hit = output->find_first_of(replace_chars, hit + resume_offset)) {
    any_replaced = true;
    output->replace(hit, 1, replace_with);
  }
  return any_replaced;
}
| 124 | |
Vitaly Buka | cbed206 | 2015-08-17 12:54:05 -0700 | [diff] [blame] | 125 | bool ReplaceChars(const std::string& input, |
Alex Vakulenko | 674f0eb | 2016-01-20 08:10:48 -0800 | [diff] [blame] | 126 | const StringPiece& replace_chars, |
Vitaly Buka | cbed206 | 2015-08-17 12:54:05 -0700 | [diff] [blame] | 127 | const std::string& replace_with, |
| 128 | std::string* output) { |
| 129 | return ReplaceCharsT(input, replace_chars.as_string(), replace_with, output); |
| 130 | } |
| 131 | |
Vitaly Buka | cbed206 | 2015-08-17 12:54:05 -0700 | [diff] [blame] | 132 | template<typename Str> |
| 133 | TrimPositions TrimStringT(const Str& input, |
| 134 | BasicStringPiece<Str> trim_chars, |
| 135 | TrimPositions positions, |
| 136 | Str* output) { |
| 137 | // Find the edges of leading/trailing whitespace as desired. Need to use |
| 138 | // a StringPiece version of input to be able to call find* on it with the |
| 139 | // StringPiece version of trim_chars (normally the trim_chars will be a |
| 140 | // constant so avoid making a copy). |
| 141 | BasicStringPiece<Str> input_piece(input); |
| 142 | const size_t last_char = input.length() - 1; |
| 143 | const size_t first_good_char = (positions & TRIM_LEADING) ? |
| 144 | input_piece.find_first_not_of(trim_chars) : 0; |
| 145 | const size_t last_good_char = (positions & TRIM_TRAILING) ? |
| 146 | input_piece.find_last_not_of(trim_chars) : last_char; |
| 147 | |
| 148 | // When the string was all trimmed, report that we stripped off characters |
| 149 | // from whichever position the caller was interested in. For empty input, we |
| 150 | // stripped no characters, but we still need to clear |output|. |
| 151 | if (input.empty() || |
| 152 | (first_good_char == Str::npos) || (last_good_char == Str::npos)) { |
| 153 | bool input_was_empty = input.empty(); // in case output == &input |
| 154 | output->clear(); |
| 155 | return input_was_empty ? TRIM_NONE : positions; |
| 156 | } |
| 157 | |
| 158 | // Trim. |
| 159 | *output = |
| 160 | input.substr(first_good_char, last_good_char - first_good_char + 1); |
| 161 | |
| 162 | // Return where we trimmed from. |
| 163 | return static_cast<TrimPositions>( |
| 164 | ((first_good_char == 0) ? TRIM_NONE : TRIM_LEADING) | |
| 165 | ((last_good_char == last_char) ? TRIM_NONE : TRIM_TRAILING)); |
| 166 | } |
| 167 | |
Vitaly Buka | cbed206 | 2015-08-17 12:54:05 -0700 | [diff] [blame] | 168 | bool TrimString(const std::string& input, |
Alex Vakulenko | 674f0eb | 2016-01-20 08:10:48 -0800 | [diff] [blame] | 169 | StringPiece trim_chars, |
Vitaly Buka | cbed206 | 2015-08-17 12:54:05 -0700 | [diff] [blame] | 170 | std::string* output) { |
| 171 | return TrimStringT(input, trim_chars, TRIM_ALL, output) != TRIM_NONE; |
| 172 | } |
| 173 | |
| 174 | template<typename Str> |
| 175 | BasicStringPiece<Str> TrimStringPieceT(BasicStringPiece<Str> input, |
| 176 | BasicStringPiece<Str> trim_chars, |
| 177 | TrimPositions positions) { |
| 178 | size_t begin = (positions & TRIM_LEADING) ? |
| 179 | input.find_first_not_of(trim_chars) : 0; |
| 180 | size_t end = (positions & TRIM_TRAILING) ? |
| 181 | input.find_last_not_of(trim_chars) + 1 : input.size(); |
| 182 | return input.substr(begin, end - begin); |
| 183 | } |
| 184 | |
Vitaly Buka | cbed206 | 2015-08-17 12:54:05 -0700 | [diff] [blame] | 185 | StringPiece TrimString(StringPiece input, |
Alex Vakulenko | 674f0eb | 2016-01-20 08:10:48 -0800 | [diff] [blame] | 186 | const StringPiece& trim_chars, |
Vitaly Buka | cbed206 | 2015-08-17 12:54:05 -0700 | [diff] [blame] | 187 | TrimPositions positions) { |
| 188 | return TrimStringPieceT(input, trim_chars, positions); |
| 189 | } |
| 190 | |
Vitaly Buka | cbed206 | 2015-08-17 12:54:05 -0700 | [diff] [blame] | 191 | TrimPositions TrimWhitespaceASCII(const std::string& input, |
| 192 | TrimPositions positions, |
| 193 | std::string* output) { |
| 194 | return TrimStringT(input, StringPiece(kWhitespaceASCII), positions, output); |
| 195 | } |
| 196 | |
Vitaly Buka | cbed206 | 2015-08-17 12:54:05 -0700 | [diff] [blame] | 197 | template <class Char> |
| 198 | inline bool DoIsStringASCII(const Char* characters, size_t length) { |
| 199 | MachineWord all_char_bits = 0; |
| 200 | const Char* end = characters + length; |
| 201 | |
| 202 | // Prologue: align the input. |
| 203 | while (!IsAlignedToMachineWord(characters) && characters != end) { |
| 204 | all_char_bits |= *characters; |
| 205 | ++characters; |
| 206 | } |
| 207 | |
| 208 | // Compare the values of CPU word size. |
| 209 | const Char* word_end = AlignToMachineWord(end); |
| 210 | const size_t loop_increment = sizeof(MachineWord) / sizeof(Char); |
| 211 | while (characters < word_end) { |
| 212 | all_char_bits |= *(reinterpret_cast<const MachineWord*>(characters)); |
| 213 | characters += loop_increment; |
| 214 | } |
| 215 | |
| 216 | // Process the remaining bytes. |
| 217 | while (characters != end) { |
| 218 | all_char_bits |= *characters; |
| 219 | ++characters; |
| 220 | } |
| 221 | |
| 222 | MachineWord non_ascii_bit_mask = |
| 223 | NonASCIIMask<sizeof(MachineWord), Char>::value(); |
| 224 | return !(all_char_bits & non_ascii_bit_mask); |
| 225 | } |
| 226 | |
| 227 | bool IsStringASCII(const StringPiece& str) { |
| 228 | return DoIsStringASCII(str.data(), str.length()); |
| 229 | } |
| 230 | |
Vitaly Buka | cbed206 | 2015-08-17 12:54:05 -0700 | [diff] [blame] | 231 | bool IsStringUTF8(const StringPiece& str) { |
| 232 | const char *src = str.data(); |
Alex Vakulenko | 674f0eb | 2016-01-20 08:10:48 -0800 | [diff] [blame] | 233 | int32_t src_len = static_cast<int32_t>(str.length()); |
| 234 | int32_t char_index = 0; |
Vitaly Buka | cbed206 | 2015-08-17 12:54:05 -0700 | [diff] [blame] | 235 | |
| 236 | while (char_index < src_len) { |
Alex Vakulenko | 674f0eb | 2016-01-20 08:10:48 -0800 | [diff] [blame] | 237 | int32_t code_point; |
Vitaly Buka | cbed206 | 2015-08-17 12:54:05 -0700 | [diff] [blame] | 238 | CBU8_NEXT(src, char_index, src_len, code_point); |
| 239 | if (!IsValidCharacter(code_point)) |
| 240 | return false; |
| 241 | } |
| 242 | return true; |
| 243 | } |
| 244 | |
Alex Vakulenko | bf79a9e | 2016-03-28 15:11:43 -0700 | [diff] [blame] | 245 | |
| 246 | template <class string_type> |
| 247 | inline typename string_type::value_type* WriteIntoT(string_type* str, |
| 248 | size_t length_with_null) { |
| 249 | DCHECK_GT(length_with_null, 1u); |
| 250 | str->reserve(length_with_null); |
| 251 | str->resize(length_with_null - 1); |
| 252 | return &((*str)[0]); |
| 253 | } |
| 254 | |
| 255 | char* WriteInto(std::string* str, size_t length_with_null) { |
| 256 | return WriteIntoT(str, length_with_null); |
| 257 | } |
Vitaly Buka | cbed206 | 2015-08-17 12:54:05 -0700 | [diff] [blame] | 258 | } // namespace base |