Vitaly Buka | cbed206 | 2015-08-17 12:54:05 -0700 | [diff] [blame] | 1 | // Copyright 2013 The Chromium Authors. All rights reserved. |
| 2 | // Use of this source code is governed by a BSD-style license that can be |
| 3 | // found in the LICENSE file. |
| 4 | |
| 5 | #include "base/strings/string_util.h" |
| 6 | |
| 7 | #include <math.h> |
| 8 | #include <stdarg.h> |
Alex Vakulenko | 674f0eb | 2016-01-20 08:10:48 -0800 | [diff] [blame] | 9 | #include <stddef.h> |
| 10 | #include <stdint.h> |
Vitaly Buka | cbed206 | 2015-08-17 12:54:05 -0700 | [diff] [blame] | 11 | |
| 12 | #include <algorithm> |
| 13 | |
Vitaly Buka | 8750b27 | 2015-08-18 18:39:08 -0700 | [diff] [blame] | 14 | #include <gmock/gmock.h> |
| 15 | #include <gtest/gtest.h> |
| 16 | |
Alex Vakulenko | 674f0eb | 2016-01-20 08:10:48 -0800 | [diff] [blame] | 17 | #include "base/macros.h" |
Vitaly Buka | 8750b27 | 2015-08-18 18:39:08 -0700 | [diff] [blame] | 18 | #include "base/strings/utf_string_conversion_utils.h" |
Vitaly Buka | cbed206 | 2015-08-17 12:54:05 -0700 | [diff] [blame] | 19 | |
| 20 | using ::testing::ElementsAre; |
| 21 | |
| 22 | namespace base { |
| 23 | |
Vitaly Buka | cbed206 | 2015-08-17 12:54:05 -0700 | [diff] [blame] | 24 | TEST(StringUtilTest, IsStringUTF8) { |
| 25 | EXPECT_TRUE(IsStringUTF8("abc")); |
| 26 | EXPECT_TRUE(IsStringUTF8("\xc2\x81")); |
| 27 | EXPECT_TRUE(IsStringUTF8("\xe1\x80\xbf")); |
| 28 | EXPECT_TRUE(IsStringUTF8("\xf1\x80\xa0\xbf")); |
| 29 | EXPECT_TRUE(IsStringUTF8("a\xc2\x81\xe1\x80\xbf\xf1\x80\xa0\xbf")); |
| 30 | EXPECT_TRUE(IsStringUTF8("\xef\xbb\xbf" "abc")); // UTF-8 BOM |
| 31 | |
| 32 | // surrogate code points |
| 33 | EXPECT_FALSE(IsStringUTF8("\xed\xa0\x80\xed\xbf\xbf")); |
| 34 | EXPECT_FALSE(IsStringUTF8("\xed\xa0\x8f")); |
| 35 | EXPECT_FALSE(IsStringUTF8("\xed\xbf\xbf")); |
| 36 | |
| 37 | // overlong sequences |
| 38 | EXPECT_FALSE(IsStringUTF8("\xc0\x80")); // U+0000 |
| 39 | EXPECT_FALSE(IsStringUTF8("\xc1\x80\xc1\x81")); // "AB" |
| 40 | EXPECT_FALSE(IsStringUTF8("\xe0\x80\x80")); // U+0000 |
| 41 | EXPECT_FALSE(IsStringUTF8("\xe0\x82\x80")); // U+0080 |
| 42 | EXPECT_FALSE(IsStringUTF8("\xe0\x9f\xbf")); // U+07ff |
| 43 | EXPECT_FALSE(IsStringUTF8("\xf0\x80\x80\x8D")); // U+000D |
| 44 | EXPECT_FALSE(IsStringUTF8("\xf0\x80\x82\x91")); // U+0091 |
| 45 | EXPECT_FALSE(IsStringUTF8("\xf0\x80\xa0\x80")); // U+0800 |
| 46 | EXPECT_FALSE(IsStringUTF8("\xf0\x8f\xbb\xbf")); // U+FEFF (BOM) |
| 47 | EXPECT_FALSE(IsStringUTF8("\xf8\x80\x80\x80\xbf")); // U+003F |
| 48 | EXPECT_FALSE(IsStringUTF8("\xfc\x80\x80\x80\xa0\xa5")); // U+00A5 |
| 49 | |
| 50 | // Beyond U+10FFFF (the upper limit of Unicode codespace) |
| 51 | EXPECT_FALSE(IsStringUTF8("\xf4\x90\x80\x80")); // U+110000 |
| 52 | EXPECT_FALSE(IsStringUTF8("\xf8\xa0\xbf\x80\xbf")); // 5 bytes |
| 53 | EXPECT_FALSE(IsStringUTF8("\xfc\x9c\xbf\x80\xbf\x80")); // 6 bytes |
| 54 | |
| 55 | // BOMs in UTF-16(BE|LE) and UTF-32(BE|LE) |
| 56 | EXPECT_FALSE(IsStringUTF8("\xfe\xff")); |
| 57 | EXPECT_FALSE(IsStringUTF8("\xff\xfe")); |
| 58 | EXPECT_FALSE(IsStringUTF8(std::string("\x00\x00\xfe\xff", 4))); |
| 59 | EXPECT_FALSE(IsStringUTF8("\xff\xfe\x00\x00")); |
| 60 | |
| 61 | // Non-characters : U+xxFFF[EF] where xx is 0x00 through 0x10 and <FDD0,FDEF> |
| 62 | EXPECT_FALSE(IsStringUTF8("\xef\xbf\xbe")); // U+FFFE) |
| 63 | EXPECT_FALSE(IsStringUTF8("\xf0\x8f\xbf\xbe")); // U+1FFFE |
| 64 | EXPECT_FALSE(IsStringUTF8("\xf3\xbf\xbf\xbf")); // U+10FFFF |
| 65 | EXPECT_FALSE(IsStringUTF8("\xef\xb7\x90")); // U+FDD0 |
| 66 | EXPECT_FALSE(IsStringUTF8("\xef\xb7\xaf")); // U+FDEF |
| 67 | // Strings in legacy encodings. We can certainly make up strings |
| 68 | // in a legacy encoding that are valid in UTF-8, but in real data, |
| 69 | // most of them are invalid as UTF-8. |
| 70 | EXPECT_FALSE(IsStringUTF8("caf\xe9")); // cafe with U+00E9 in ISO-8859-1 |
| 71 | EXPECT_FALSE(IsStringUTF8("\xb0\xa1\xb0\xa2")); // U+AC00, U+AC001 in EUC-KR |
| 72 | EXPECT_FALSE(IsStringUTF8("\xa7\x41\xa6\x6e")); // U+4F60 U+597D in Big5 |
| 73 | // "abc" with U+201[CD] in windows-125[0-8] |
| 74 | EXPECT_FALSE(IsStringUTF8("\x93" "abc\x94")); |
| 75 | // U+0639 U+064E U+0644 U+064E in ISO-8859-6 |
| 76 | EXPECT_FALSE(IsStringUTF8("\xd9\xee\xe4\xee")); |
| 77 | // U+03B3 U+03B5 U+03B9 U+03AC in ISO-8859-7 |
| 78 | EXPECT_FALSE(IsStringUTF8("\xe3\xe5\xe9\xdC")); |
| 79 | |
| 80 | // Check that we support Embedded Nulls. The first uses the canonical UTF-8 |
| 81 | // representation, and the second uses a 2-byte sequence. The second version |
| 82 | // is invalid UTF-8 since UTF-8 states that the shortest encoding for a |
| 83 | // given codepoint must be used. |
| 84 | static const char kEmbeddedNull[] = "embedded\0null"; |
| 85 | EXPECT_TRUE(IsStringUTF8( |
| 86 | std::string(kEmbeddedNull, sizeof(kEmbeddedNull)))); |
| 87 | EXPECT_FALSE(IsStringUTF8("embedded\xc0\x80U+0000")); |
| 88 | } |
| 89 | |
| 90 | TEST(StringUtilTest, IsStringASCII) { |
| 91 | static char char_ascii[] = |
| 92 | "0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF"; |
Vitaly Buka | cbed206 | 2015-08-17 12:54:05 -0700 | [diff] [blame] | 93 | static std::wstring wchar_ascii( |
| 94 | L"0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF"); |
| 95 | |
| 96 | // Test a variety of the fragment start positions and lengths in order to make |
| 97 | // sure that bit masking in IsStringASCII works correctly. |
| 98 | // Also, test that a non-ASCII character will be detected regardless of its |
| 99 | // position inside the string. |
| 100 | { |
| 101 | const size_t string_length = arraysize(char_ascii) - 1; |
| 102 | for (size_t offset = 0; offset < 8; ++offset) { |
| 103 | for (size_t len = 0, max_len = string_length - offset; len < max_len; |
| 104 | ++len) { |
| 105 | EXPECT_TRUE(IsStringASCII(StringPiece(char_ascii + offset, len))); |
| 106 | for (size_t char_pos = offset; char_pos < len; ++char_pos) { |
| 107 | char_ascii[char_pos] |= '\x80'; |
| 108 | EXPECT_FALSE(IsStringASCII(StringPiece(char_ascii + offset, len))); |
| 109 | char_ascii[char_pos] &= ~'\x80'; |
| 110 | } |
| 111 | } |
| 112 | } |
| 113 | } |
Vitaly Buka | cbed206 | 2015-08-17 12:54:05 -0700 | [diff] [blame] | 114 | } |
| 115 | |
| 116 | TEST(StringUtilTest, ReplaceChars) { |
| 117 | struct TestData { |
| 118 | const char* input; |
| 119 | const char* replace_chars; |
| 120 | const char* replace_with; |
| 121 | const char* output; |
| 122 | bool result; |
| 123 | } cases[] = { |
| 124 | { "", "", "", "", false }, |
| 125 | { "test", "", "", "test", false }, |
| 126 | { "test", "", "!", "test", false }, |
| 127 | { "test", "z", "!", "test", false }, |
| 128 | { "test", "e", "!", "t!st", true }, |
| 129 | { "test", "e", "!?", "t!?st", true }, |
| 130 | { "test", "ez", "!", "t!st", true }, |
| 131 | { "test", "zed", "!?", "t!?st", true }, |
| 132 | { "test", "t", "!?", "!?es!?", true }, |
| 133 | { "test", "et", "!>", "!>!>s!>", true }, |
| 134 | { "test", "zest", "!", "!!!!", true }, |
| 135 | { "test", "szt", "!", "!e!!", true }, |
| 136 | { "test", "t", "test", "testestest", true }, |
| 137 | }; |
| 138 | |
| 139 | for (size_t i = 0; i < arraysize(cases); ++i) { |
| 140 | std::string output; |
| 141 | bool result = ReplaceChars(cases[i].input, |
| 142 | cases[i].replace_chars, |
| 143 | cases[i].replace_with, |
| 144 | &output); |
| 145 | EXPECT_EQ(cases[i].result, result); |
| 146 | EXPECT_EQ(cases[i].output, output); |
| 147 | } |
| 148 | } |
| 149 | |
Vitaly Buka | cbed206 | 2015-08-17 12:54:05 -0700 | [diff] [blame] | 150 | } // namespace base |