Vitaly Buka | cbed206 | 2015-08-17 12:54:05 -0700 | [diff] [blame] | 1 | // Copyright 2013 The Chromium Authors. All rights reserved. |
| 2 | // Use of this source code is governed by a BSD-style license that can be |
| 3 | // found in the LICENSE file. |
| 4 | // |
| 5 | // This file defines utility functions for working with strings. |
| 6 | |
| 7 | #ifndef BASE_STRINGS_STRING_UTIL_H_ |
| 8 | #define BASE_STRINGS_STRING_UTIL_H_ |
| 9 | |
| 10 | #include <ctype.h> |
| 11 | #include <stdarg.h> // va_list |
Alex Vakulenko | 674f0eb | 2016-01-20 08:10:48 -0800 | [diff] [blame] | 12 | #include <stddef.h> |
| 13 | #include <stdint.h> |
Vitaly Buka | cbed206 | 2015-08-17 12:54:05 -0700 | [diff] [blame] | 14 | |
| 15 | #include <string> |
| 16 | #include <vector> |
| 17 | |
Vitaly Buka | cbed206 | 2015-08-17 12:54:05 -0700 | [diff] [blame] | 18 | #include "base/compiler_specific.h" |
Vitaly Buka | cbed206 | 2015-08-17 12:54:05 -0700 | [diff] [blame] | 19 | #include "base/strings/string_piece.h" // For implicit conversions. |
Alex Vakulenko | 674f0eb | 2016-01-20 08:10:48 -0800 | [diff] [blame] | 20 | #include "build/build_config.h" |
Vitaly Buka | cbed206 | 2015-08-17 12:54:05 -0700 | [diff] [blame] | 21 | |
| 22 | // On Android, bionic's stdio.h defines an snprintf macro when being built with |
| 23 | // clang. Undefine it here so it won't collide with base::snprintf(). |
| 24 | #undef snprintf |
| 25 | |
| 26 | namespace base { |
| 27 | |
Alex Vakulenko | 674f0eb | 2016-01-20 08:10:48 -0800 | [diff] [blame] | 28 | // C standard-library functions that aren't cross-platform are provided as |
| 29 | // "base::...", and their prototypes are listed below. These functions are |
| 30 | // then implemented as inline calls to the platform-specific equivalents in the |
| 31 | // platform-specific headers. |
Vitaly Buka | cbed206 | 2015-08-17 12:54:05 -0700 | [diff] [blame] | 32 | |
// Wrapper for vsnprintf that always null-terminates and always returns the
// number of characters that would be in an untruncated formatted
// string, even when truncation occurs.
//
// Because the untruncated length is what comes back, a return value that is
// >= |size| tells the caller that the text stored in |buffer| was cut short.
//
// Only the prototype lives here. PRINTF_FORMAT(3, 0) comes from
// base/compiler_specific.h; NOTE(review): presumably it marks |format|
// (argument 3) as a printf-style format string with no variadic arguments to
// check -- confirm against that header.
int vsnprintf(char* buffer, size_t size, const char* format, va_list arguments)
    PRINTF_FORMAT(3, 0);
| 38 | |
| 39 | // Some of these implementations need to be inlined. |
| 40 | |
| 41 | // We separate the declaration from the implementation of this inline |
| 42 | // function just so the PRINTF_FORMAT works. |
Alex Vakulenko | 674f0eb | 2016-01-20 08:10:48 -0800 | [diff] [blame] | 43 | inline int snprintf(char* buffer, |
| 44 | size_t size, |
| 45 | _Printf_format_string_ const char* format, |
| 46 | ...) PRINTF_FORMAT(3, 4); |
| 47 | inline int snprintf(char* buffer, |
| 48 | size_t size, |
| 49 | _Printf_format_string_ const char* format, |
| 50 | ...) { |
Vitaly Buka | cbed206 | 2015-08-17 12:54:05 -0700 | [diff] [blame] | 51 | va_list arguments; |
| 52 | va_start(arguments, format); |
| 53 | int result = vsnprintf(buffer, size, format, arguments); |
| 54 | va_end(arguments); |
| 55 | return result; |
| 56 | } |
| 57 | |
// BSD-style safe and consistent string copy function.
// Copies |src| to |dst|, where |dst_size| is the total allocated size of |dst|.
// Copies at most |dst_size|-1 characters, and always NULL terminates |dst|, as
// long as |dst_size| is not 0. Returns the length of |src| in characters.
// If the return value is >= dst_size, then the output was truncated.
// NOTE: All sizes are in number of characters, NOT in bytes.
//
// Example (following the contract above): with a 3-character |dst|, copying
// "hello" stores "he" plus the terminator and returns 5, which is >= 3 and
// therefore signals truncation.
size_t strlcpy(char* dst, const char* src, size_t dst_size);
| 65 | |
// Locale-independent replacement for tolower(). Only the 26 ASCII capitals
// 'A'..'Z' are mapped to 'a'..'z'; every other value of |c| is handed back
// untouched. The standard library's tolower consults the current locale, which
// is exactly what this function is meant to avoid.
inline char ToLowerASCII(char c) {
  if (c < 'A' || c > 'Z')
    return c;  // Not an ASCII capital: nothing to do.
  return static_cast<char>(c + ('a' - 'A'));
}
| 71 | |
// Locale-independent replacement for toupper(). Only the 26 ASCII small
// letters 'a'..'z' are mapped to 'A'..'Z'; every other value of |c| is handed
// back untouched. The standard library's toupper consults the current locale,
// which is exactly what this function is meant to avoid.
inline char ToUpperASCII(char c) {
  if (c < 'a' || c > 'z')
    return c;  // Not an ASCII small letter: nothing to do.
  return static_cast<char>(c + ('A' - 'a'));
}
// Converts the given string to its ASCII-lowercase equivalent. The result is
// returned as a new std::string.
std::string ToLowerASCII(StringPiece str);
// Converts the given string to its ASCII-uppercase equivalent. The result is
// returned as a new std::string.
std::string ToUpperASCII(StringPiece str);
| 81 | |
// Functor for case-insensitive ASCII comparisons for STL algorithms like
// std::search.
//
// Two characters compare equal when they are identical once the ASCII
// capitals 'A'..'Z' have been folded to 'a'..'z'. The folding is carried out
// in the functor's own |Char| type, so a wide character is never narrowed to
// char first; distinct non-ASCII code units (e.g. U+0141 and 'A') therefore
// never compare equal by accident.
//
// Note that a full Unicode version of this functor is not possible to write
// because case mappings might change the number of characters, depend on
// context (combining accents), and require handling UTF-16. If you need
// proper Unicode support, use base::i18n::ToLower/FoldCase and then just
// use a normal operator== on the result.
template<typename Char> struct CaseInsensitiveCompareASCII {
 public:
  // Returns true when |x| and |y| are the same character, ignoring ASCII case.
  bool operator()(Char x, Char y) const {
    return FoldASCII(x) == FoldASCII(y);
  }

 private:
  // ASCII-only lower-casing performed in |Char| itself. Values outside
  // 'A'..'Z' (including everything above 0x7F) are returned unchanged.
  static Char FoldASCII(Char c) {
    return (c >= 'A' && c <= 'Z') ? static_cast<Char>(c + ('a' - 'A')) : c;
  }
};
| 96 | |
// Like strcasecmp for case-insensitive ASCII characters only. Returns:
//   -1  (a < b)
//    0  (a == b)
//    1  (a > b)
// (unlike strcasecmp, which may return any negative or positive value rather
// than exactly -1/1). For full Unicode support, use base::i18n::ToLower or
// base::i18n::FoldCase and then just call the normal string operators on the
// result.
int CompareCaseInsensitiveASCII(StringPiece a, StringPiece b);
| 105 | |
// Equality for ASCII case-insensitive comparisons. For full Unicode support,
// use base::i18n::ToLower or base::i18n::FoldCase and then compare with either
// == or !=.
bool EqualsCaseInsensitiveASCII(StringPiece a, StringPiece b);
| 110 | |
// Contains the set of characters representing whitespace in the corresponding
// encoding. Null-terminated. The ASCII versions are the whitespaces as defined
// by HTML5, and don't include control characters.
//
// This is an extern declaration only; the array's contents are defined outside
// this header.
extern const char kWhitespaceASCII[];
Vitaly Buka | cbed206 | 2015-08-17 12:54:05 -0700 | [diff] [blame] | 115 | |
// Replaces characters in |replace_chars| from anywhere in |input| with
// |replace_with|. Each character in |replace_chars| will be replaced with
// the |replace_with| string. Returns true if any characters were replaced.
// The result is written to |*output|.
// |replace_chars| must be null-terminated.
// NOTE(review): |replace_chars| is passed as a StringPiece, which presumably
// carries its own length, so the null-termination requirement above may be
// stale -- confirm against the implementation before relying on either
// reading.
// NOTE: Safe to use the same variable for both |input| and |output|.
bool ReplaceChars(const std::string& input,
                  const StringPiece& replace_chars,
                  const std::string& replace_with,
                  std::string* output);
Vitaly Buka | cbed206 | 2015-08-17 12:54:05 -0700 | [diff] [blame] | 125 | |
// Bit flags naming the ends of a string. Each end has its own bit, so the
// values can be OR-ed together; TRIM_ALL is simply both bits set.
enum TrimPositions {
  TRIM_NONE = 0x0,      // Neither end.
  TRIM_LEADING = 0x1,   // The beginning of the string (bit 0).
  TRIM_TRAILING = 0x2,  // The end of the string (bit 1).
  TRIM_ALL = TRIM_LEADING | TRIM_TRAILING,  // Both ends.
};
| 132 | |
// Removes characters in |trim_chars| from the beginning and end of |input|,
// storing the result in |*output|.
// The 8-bit version only works on 8-bit characters, not UTF-8.
//
// It is safe to use the same variable for both |input| and |output| (this is
// the normal usage to trim in-place).
//
// NOTE(review): the meaning of the bool result is not stated in this header;
// presumably it is true when at least one character was removed -- confirm
// against the implementation.
bool TrimString(const std::string& input,
                StringPiece trim_chars,
                std::string* output);
Vitaly Buka | cbed206 | 2015-08-17 12:54:05 -0700 | [diff] [blame] | 141 | |
// StringPiece version of TrimString. Nothing is copied: the returned piece
// refers to the original buffer behind |input|, so that buffer must outlive
// the result.
// NOTE(review): |positions| presumably selects which end(s) of |input| are
// trimmed (see TrimPositions) -- confirm against the implementation.
StringPiece TrimString(StringPiece input,
                       const StringPiece& trim_chars,
                       TrimPositions positions);
Vitaly Buka | cbed206 | 2015-08-17 12:54:05 -0700 | [diff] [blame] | 147 | |
// Trims any whitespace from either end of the input string.
//
// The StringPiece versions return a substring referencing the input buffer.
// The ASCII versions look only for ASCII whitespace.
//
// The std::string versions return where whitespace was found, expressed as a
// TrimPositions value, and write the trimmed text to |*output|.
// NOTE(review): |positions| presumably limits trimming to the requested
// end(s) -- confirm against the implementation.
// NOTE: Safe to use the same variable for both input and output.
TrimPositions TrimWhitespaceASCII(const std::string& input,
                                  TrimPositions positions,
                                  std::string* output);
Vitaly Buka | cbed206 | 2015-08-17 12:54:05 -0700 | [diff] [blame] | 158 | |
// Returns true if the specified string matches the criteria. How can a wide
// string be 8-bit or UTF8? It contains only characters that are < 256 (in the
// first case) or characters that use only 8-bits and whose 8-bit
// representation looks like a UTF-8 string (the second case).
// NOTE(review): only narrow (StringPiece) overloads are declared in this
// header, so the wide-string discussion above appears to be left over from
// overloads that are not present here.
//
// Note that IsStringUTF8 checks not only if the input is structurally
// valid but also if it doesn't contain any non-character codepoint
// (e.g. U+FFFE). It's done on purpose because all the existing callers want
// to have the maximum 'discriminating' power from other encodings. If
// there's a use case for just checking the structural validity, we have to
// add a new function for that.
//
// IsStringASCII assumes the input is likely all ASCII, and does not leave early
// if it is not the case.
bool IsStringUTF8(const StringPiece& str);
bool IsStringASCII(const StringPiece& str);
Vitaly Buka | cbed206 | 2015-08-17 12:54:05 -0700 | [diff] [blame] | 175 | |
// Reserves enough memory in |str| to accommodate |length_with_null| characters,
// sets the size of |str| to |length_with_null - 1| characters, and returns a
// pointer to the underlying contiguous array of characters. This is typically
// used when calling a function that writes results into a character array, but
// the caller wants the data to be managed by a string-like object. It is
// convenient in that it can be used inline in the call, and fast in that it
// avoids copying the results of the call from a char* into a string.
//
// |length_with_null| must be at least 2, since otherwise the underlying string
// would have size 0, and trying to access &((*str)[0]) in that case can result
// in a number of problems.
//
// Internally, this takes linear time because the resize() call 0-fills the
// underlying array for potentially all
// (|length_with_null - 1| * sizeof(string_type::value_type)) bytes. Ideally we
// could avoid this aspect of the resize() call, as we expect the caller to
// immediately write over this memory, but there is no other way to set the size
// of the string, and not doing that will mean people who access |str| rather
// than str.c_str() will get back a string of whatever size |str| had on entry
// to this function (probably 0).
char* WriteInto(std::string* str, size_t length_with_null);
| 197 | |
Vitaly Buka | cbed206 | 2015-08-17 12:54:05 -0700 | [diff] [blame] | 198 | } // namespace base |
| 199 | |
| 200 | #if defined(OS_WIN) |
| 201 | #include "base/strings/string_util_win.h" |
| 202 | #elif defined(OS_POSIX) |
| 203 | #include "base/strings/string_util_posix.h" |
| 204 | #else |
| 205 | #error Define string operations appropriately for your platform |
| 206 | #endif |
| 207 | |
Vitaly Buka | cbed206 | 2015-08-17 12:54:05 -0700 | [diff] [blame] | 208 | #endif // BASE_STRINGS_STRING_UTIL_H_ |