third_party/chromium/base/strings/string_util_unittest.cc - weave/libweave - Git at Google

 // Copyright 2013 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 #include "base/strings/string_util.h"

 #include <math.h>
 #include <stdarg.h>

 #include <algorithm>

 #include <gmock/gmock.h>
 #include <gtest/gtest.h>

 #include "base/basictypes.h"
 #include "base/strings/utf_string_conversion_utils.h"

 using ::testing::ElementsAre;

 namespace base {

 // Checks IsStringUTF8() against well-formed UTF-8 (expected true) and against
 // surrogates, overlong encodings, out-of-range values, foreign BOMs,
 // non-characters and legacy-encoded text (all expected false).
 TEST(StringUtilTest, IsStringUTF8) {
   EXPECT_TRUE(IsStringUTF8("abc"));
   EXPECT_TRUE(IsStringUTF8("\xc2\x81"));
   EXPECT_TRUE(IsStringUTF8("\xe1\x80\xbf"));
   EXPECT_TRUE(IsStringUTF8("\xf1\x80\xa0\xbf"));
   EXPECT_TRUE(IsStringUTF8("a\xc2\x81\xe1\x80\xbf\xf1\x80\xa0\xbf"));
   // The literal is split so that 'a' is not parsed as part of the hex escape.
   EXPECT_TRUE(IsStringUTF8("\xef\xbb\xbf" "abc"));  // UTF-8 BOM

   // surrogate code points
   EXPECT_FALSE(IsStringUTF8("\xed\xa0\x80\xed\xbf\xbf"));
   EXPECT_FALSE(IsStringUTF8("\xed\xa0\x8f"));
   EXPECT_FALSE(IsStringUTF8("\xed\xbf\xbf"));

   // overlong sequences
   EXPECT_FALSE(IsStringUTF8("\xc0\x80"));  // U+0000
   EXPECT_FALSE(IsStringUTF8("\xc1\x80\xc1\x81"));  // "@A"
   EXPECT_FALSE(IsStringUTF8("\xe0\x80\x80"));  // U+0000
   EXPECT_FALSE(IsStringUTF8("\xe0\x82\x80"));  // U+0080
   EXPECT_FALSE(IsStringUTF8("\xe0\x9f\xbf"));  // U+07ff
   EXPECT_FALSE(IsStringUTF8("\xf0\x80\x80\x8D"));  // U+000D
   EXPECT_FALSE(IsStringUTF8("\xf0\x80\x82\x91"));  // U+0091
   EXPECT_FALSE(IsStringUTF8("\xf0\x80\xa0\x80"));  // U+0800
   EXPECT_FALSE(IsStringUTF8("\xf0\x8f\xbb\xbf"));  // U+FEFF (BOM)
   EXPECT_FALSE(IsStringUTF8("\xf8\x80\x80\x80\xbf"));  // U+003F
   EXPECT_FALSE(IsStringUTF8("\xfc\x80\x80\x80\xa0\xa5"));  // U+0825

   // Beyond U+10FFFF (the upper limit of Unicode codespace)
   EXPECT_FALSE(IsStringUTF8("\xf4\x90\x80\x80"));  // U+110000
   EXPECT_FALSE(IsStringUTF8("\xf8\xa0\xbf\x80\xbf"));  // 5 bytes
   EXPECT_FALSE(IsStringUTF8("\xfc\x9c\xbf\x80\xbf\x80"));  // 6 bytes

   // BOMs in UTF-16(BE|LE) and UTF-32(BE|LE). The UTF-32 BOMs contain NUL
   // bytes, so they are built with an explicit length; a plain C string would
   // be cut at the first NUL.
   EXPECT_FALSE(IsStringUTF8("\xfe\xff"));
   EXPECT_FALSE(IsStringUTF8("\xff\xfe"));
   EXPECT_FALSE(IsStringUTF8(std::string("\x00\x00\xfe\xff", 4)));
   EXPECT_FALSE(IsStringUTF8(std::string("\xff\xfe\x00\x00", 4)));

   // Non-characters : U+xxFFF[EF] where xx is 0x00 through 0x10 and <FDD0,FDEF>
   EXPECT_FALSE(IsStringUTF8("\xef\xbf\xbe"));  // U+FFFE
   EXPECT_FALSE(IsStringUTF8("\xf0\x8f\xbf\xbe"));  // U+FFFE, overlong 4-byte form
   EXPECT_FALSE(IsStringUTF8("\xf0\x9f\xbf\xbe"));  // U+1FFFE
   EXPECT_FALSE(IsStringUTF8("\xf3\xbf\xbf\xbf"));  // U+FFFFF
   EXPECT_FALSE(IsStringUTF8("\xf4\x8f\xbf\xbf"));  // U+10FFFF
   EXPECT_FALSE(IsStringUTF8("\xef\xb7\x90"));  // U+FDD0
   EXPECT_FALSE(IsStringUTF8("\xef\xb7\xaf"));  // U+FDEF

   // Strings in legacy encodings. We can certainly make up strings
   // in a legacy encoding that are valid in UTF-8, but in real data,
   // most of them are invalid as UTF-8.
   EXPECT_FALSE(IsStringUTF8("caf\xe9"));  // cafe with U+00E9 in ISO-8859-1
   EXPECT_FALSE(IsStringUTF8("\xb0\xa1\xb0\xa2"));  // U+AC00, U+AC01 in EUC-KR
   EXPECT_FALSE(IsStringUTF8("\xa7\x41\xa6\x6e"));  // U+4F60 U+597D in Big5
   // "abc" with U+201[CD] in windows-125[0-8]
   EXPECT_FALSE(IsStringUTF8("\x93" "abc\x94"));
   // U+0639 U+064E U+0644 U+064E in ISO-8859-6
   EXPECT_FALSE(IsStringUTF8("\xd9\xee\xe4\xee"));
   // U+03B3 U+03B5 U+03B9 U+03AC in ISO-8859-7
   EXPECT_FALSE(IsStringUTF8("\xe3\xe5\xe9\xdC"));

   // Check that we support Embedded Nulls. The first uses the canonical UTF-8
   // representation, and the second uses a 2-byte sequence. The second version
   // is invalid UTF-8 since UTF-8 states that the shortest encoding for a
   // given codepoint must be used.
   static const char kEmbeddedNull[] = "embedded\0null";
   // sizeof() counts the terminating NUL as well, so the string under test
   // holds two NUL bytes.
   EXPECT_TRUE(IsStringUTF8(
       std::string(kEmbeddedNull, sizeof(kEmbeddedNull))));
   EXPECT_FALSE(IsStringUTF8("embedded\xc0\x80U+0000"));
 }

 // Checks IsStringASCII() on fragments of an all-ASCII buffer taken at several
 // start offsets and lengths, then again with a single byte of the fragment
 // made non-ASCII.
 TEST(StringUtilTest, IsStringASCII) {
   // Mutable on purpose: individual bytes are temporarily given their high bit.
   static char char_ascii[] =
       "0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF";

   // Test a variety of the fragment start positions and lengths in order to make
   // sure that bit masking in IsStringASCII works correctly.
   // Also, test that a non-ASCII character will be detected regardless of its
   // position inside the string.
   const size_t string_length = arraysize(char_ascii) - 1;
   for (size_t offset = 0; offset < 8; ++offset) {
     for (size_t len = 0, max_len = string_length - offset; len < max_len;
          ++len) {
       EXPECT_TRUE(IsStringASCII(StringPiece(char_ascii + offset, len)));
       // The fragment covers buffer indices [offset, offset + len), so every
       // one of those indices is corrupted in turn. offset + len stays below
       // string_length, so the index never leaves the buffer.
       for (size_t char_pos = offset; char_pos < offset + len; ++char_pos) {
         char_ascii[char_pos] |= '\x80';
         EXPECT_FALSE(IsStringASCII(StringPiece(char_ascii + offset, len)));
         // Restore the byte so later iterations start from pure ASCII again.
         char_ascii[char_pos] &= ~'\x80';
       }
     }
   }
 }

 // Table-driven check of ReplaceChars(): each row supplies the input, the set
 // of characters to replace, the replacement text, the expected output string
 // and the expected boolean return value.
 TEST(StringUtilTest, ReplaceChars) {
   struct Expectation {
     const char* input;
     const char* replace_chars;
     const char* replace_with;
     const char* output;
     bool result;
   };

   const Expectation kExpectations[] = {
     // Rows whose expected output equals the input expect a false result.
     { "", "", "", "", false },
     { "test", "", "", "test", false },
     { "test", "", "!", "test", false },
     { "test", "z", "!", "test", false },
     // Rows where at least one character is substituted expect true.
     { "test", "e", "!", "t!st", true },
     { "test", "e", "!?", "t!?st", true },
     { "test", "ez", "!", "t!st", true },
     { "test", "zed", "!?", "t!?st", true },
     { "test", "t", "!?", "!?es!?", true },
     { "test", "et", "!>", "!>!>s!>", true },
     { "test", "zest", "!", "!!!!", true },
     { "test", "szt", "!", "!e!!", true },
     { "test", "t", "test", "testestest", true },
   };

   for (const Expectation& expectation : kExpectations) {
     std::string actual;
     const bool replaced = ReplaceChars(expectation.input,
                                        expectation.replace_chars,
                                        expectation.replace_with,
                                        &actual);
     EXPECT_EQ(expectation.result, replaced);
     EXPECT_EQ(expectation.output, actual);
   }
 }

 }  // namespace base
	// Copyright 2013 The Chromium Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	#include "base/strings/string_util.h"

	#include <math.h>
	#include <stdarg.h>

	#include <algorithm>

	#include <gmock/gmock.h>
	#include <gtest/gtest.h>

	#include "base/basictypes.h"
	#include "base/strings/utf_string_conversion_utils.h"

	using ::testing::ElementsAre;

	namespace base {

	// Checks IsStringUTF8() against well-formed UTF-8 (expected true) and against
	// surrogates, overlong encodings, out-of-range values, foreign BOMs,
	// non-characters and legacy-encoded text (all expected false).
	TEST(StringUtilTest, IsStringUTF8) {
	  EXPECT_TRUE(IsStringUTF8("abc"));
	  EXPECT_TRUE(IsStringUTF8("\xc2\x81"));
	  EXPECT_TRUE(IsStringUTF8("\xe1\x80\xbf"));
	  EXPECT_TRUE(IsStringUTF8("\xf1\x80\xa0\xbf"));
	  EXPECT_TRUE(IsStringUTF8("a\xc2\x81\xe1\x80\xbf\xf1\x80\xa0\xbf"));
	  // The literal is split so that 'a' is not parsed as part of the hex escape.
	  EXPECT_TRUE(IsStringUTF8("\xef\xbb\xbf" "abc"));  // UTF-8 BOM

	  // surrogate code points
	  EXPECT_FALSE(IsStringUTF8("\xed\xa0\x80\xed\xbf\xbf"));
	  EXPECT_FALSE(IsStringUTF8("\xed\xa0\x8f"));
	  EXPECT_FALSE(IsStringUTF8("\xed\xbf\xbf"));

	  // overlong sequences
	  EXPECT_FALSE(IsStringUTF8("\xc0\x80"));  // U+0000
	  EXPECT_FALSE(IsStringUTF8("\xc1\x80\xc1\x81"));  // "@A"
	  EXPECT_FALSE(IsStringUTF8("\xe0\x80\x80"));  // U+0000
	  EXPECT_FALSE(IsStringUTF8("\xe0\x82\x80"));  // U+0080
	  EXPECT_FALSE(IsStringUTF8("\xe0\x9f\xbf"));  // U+07ff
	  EXPECT_FALSE(IsStringUTF8("\xf0\x80\x80\x8D"));  // U+000D
	  EXPECT_FALSE(IsStringUTF8("\xf0\x80\x82\x91"));  // U+0091
	  EXPECT_FALSE(IsStringUTF8("\xf0\x80\xa0\x80"));  // U+0800
	  EXPECT_FALSE(IsStringUTF8("\xf0\x8f\xbb\xbf"));  // U+FEFF (BOM)
	  EXPECT_FALSE(IsStringUTF8("\xf8\x80\x80\x80\xbf"));  // U+003F
	  EXPECT_FALSE(IsStringUTF8("\xfc\x80\x80\x80\xa0\xa5"));  // U+0825

	  // Beyond U+10FFFF (the upper limit of Unicode codespace)
	  EXPECT_FALSE(IsStringUTF8("\xf4\x90\x80\x80"));  // U+110000
	  EXPECT_FALSE(IsStringUTF8("\xf8\xa0\xbf\x80\xbf"));  // 5 bytes
	  EXPECT_FALSE(IsStringUTF8("\xfc\x9c\xbf\x80\xbf\x80"));  // 6 bytes

	  // BOMs in UTF-16(BE|LE) and UTF-32(BE|LE). The UTF-32 BOMs contain NUL
	  // bytes, so they are built with an explicit length; a plain C string would
	  // be cut at the first NUL.
	  EXPECT_FALSE(IsStringUTF8("\xfe\xff"));
	  EXPECT_FALSE(IsStringUTF8("\xff\xfe"));
	  EXPECT_FALSE(IsStringUTF8(std::string("\x00\x00\xfe\xff", 4)));
	  EXPECT_FALSE(IsStringUTF8(std::string("\xff\xfe\x00\x00", 4)));

	  // Non-characters : U+xxFFF[EF] where xx is 0x00 through 0x10 and <FDD0,FDEF>
	  EXPECT_FALSE(IsStringUTF8("\xef\xbf\xbe"));  // U+FFFE
	  EXPECT_FALSE(IsStringUTF8("\xf0\x8f\xbf\xbe"));  // U+FFFE, overlong 4-byte form
	  EXPECT_FALSE(IsStringUTF8("\xf0\x9f\xbf\xbe"));  // U+1FFFE
	  EXPECT_FALSE(IsStringUTF8("\xf3\xbf\xbf\xbf"));  // U+FFFFF
	  EXPECT_FALSE(IsStringUTF8("\xf4\x8f\xbf\xbf"));  // U+10FFFF
	  EXPECT_FALSE(IsStringUTF8("\xef\xb7\x90"));  // U+FDD0
	  EXPECT_FALSE(IsStringUTF8("\xef\xb7\xaf"));  // U+FDEF

	  // Strings in legacy encodings. We can certainly make up strings
	  // in a legacy encoding that are valid in UTF-8, but in real data,
	  // most of them are invalid as UTF-8.
	  EXPECT_FALSE(IsStringUTF8("caf\xe9"));  // cafe with U+00E9 in ISO-8859-1
	  EXPECT_FALSE(IsStringUTF8("\xb0\xa1\xb0\xa2"));  // U+AC00, U+AC01 in EUC-KR
	  EXPECT_FALSE(IsStringUTF8("\xa7\x41\xa6\x6e"));  // U+4F60 U+597D in Big5
	  // "abc" with U+201[CD] in windows-125[0-8]
	  EXPECT_FALSE(IsStringUTF8("\x93" "abc\x94"));
	  // U+0639 U+064E U+0644 U+064E in ISO-8859-6
	  EXPECT_FALSE(IsStringUTF8("\xd9\xee\xe4\xee"));
	  // U+03B3 U+03B5 U+03B9 U+03AC in ISO-8859-7
	  EXPECT_FALSE(IsStringUTF8("\xe3\xe5\xe9\xdC"));

	  // Check that we support Embedded Nulls. The first uses the canonical UTF-8
	  // representation, and the second uses a 2-byte sequence. The second version
	  // is invalid UTF-8 since UTF-8 states that the shortest encoding for a
	  // given codepoint must be used.
	  static const char kEmbeddedNull[] = "embedded\0null";
	  // sizeof() counts the terminating NUL as well, so the string under test
	  // holds two NUL bytes.
	  EXPECT_TRUE(IsStringUTF8(
	      std::string(kEmbeddedNull, sizeof(kEmbeddedNull))));
	  EXPECT_FALSE(IsStringUTF8("embedded\xc0\x80U+0000"));
	}

	// Checks IsStringASCII() on fragments of an all-ASCII buffer taken at several
	// start offsets and lengths, then again with a single byte of the fragment
	// made non-ASCII.
	TEST(StringUtilTest, IsStringASCII) {
	  // Mutable on purpose: individual bytes are temporarily given their high bit.
	  static char char_ascii[] =
	      "0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF";

	  // Test a variety of the fragment start positions and lengths in order to make
	  // sure that bit masking in IsStringASCII works correctly.
	  // Also, test that a non-ASCII character will be detected regardless of its
	  // position inside the string.
	  const size_t string_length = arraysize(char_ascii) - 1;
	  for (size_t offset = 0; offset < 8; ++offset) {
	    for (size_t len = 0, max_len = string_length - offset; len < max_len;
	         ++len) {
	      EXPECT_TRUE(IsStringASCII(StringPiece(char_ascii + offset, len)));
	      // The fragment covers buffer indices [offset, offset + len), so every
	      // one of those indices is corrupted in turn. offset + len stays below
	      // string_length, so the index never leaves the buffer.
	      for (size_t char_pos = offset; char_pos < offset + len; ++char_pos) {
	        char_ascii[char_pos] |= '\x80';
	        EXPECT_FALSE(IsStringASCII(StringPiece(char_ascii + offset, len)));
	        // Restore the byte so later iterations start from pure ASCII again.
	        char_ascii[char_pos] &= ~'\x80';
	      }
	    }
	  }
	}

	// Table-driven check of ReplaceChars(): each row supplies the input, the set
	// of characters to replace, the replacement text, the expected output string
	// and the expected boolean return value.
	TEST(StringUtilTest, ReplaceChars) {
	  struct Expectation {
	    const char* input;
	    const char* replace_chars;
	    const char* replace_with;
	    const char* output;
	    bool result;
	  };

	  const Expectation kExpectations[] = {
	    // Rows whose expected output equals the input expect a false result.
	    { "", "", "", "", false },
	    { "test", "", "", "test", false },
	    { "test", "", "!", "test", false },
	    { "test", "z", "!", "test", false },
	    // Rows where at least one character is substituted expect true.
	    { "test", "e", "!", "t!st", true },
	    { "test", "e", "!?", "t!?st", true },
	    { "test", "ez", "!", "t!st", true },
	    { "test", "zed", "!?", "t!?st", true },
	    { "test", "t", "!?", "!?es!?", true },
	    { "test", "et", "!>", "!>!>s!>", true },
	    { "test", "zest", "!", "!!!!", true },
	    { "test", "szt", "!", "!e!!", true },
	    { "test", "t", "test", "testestest", true },
	  };

	  for (const Expectation& expectation : kExpectations) {
	    std::string actual;
	    const bool replaced = ReplaceChars(expectation.input,
	                                       expectation.replace_chars,
	                                       expectation.replace_with,
	                                       &actual);
	    EXPECT_EQ(expectation.result, replaced);
	    EXPECT_EQ(expectation.output, actual);
	  }
	}

	} // namespace base