Fixing a couple of bugs and improving code readability of UTF8/UTF16 conversions.

stgates · stgates · commit e47002595dd3 · 2015-05-06T13:00:51.000-07:00
diff --git a/Release/src/utilities/asyncrt_utils.cpp b/Release/src/utilities/asyncrt_utils.cpp
@@ -252,7 +252,20 @@ const std::error_category & __cdecl linux_category()
 
 }
 
-#define LOWER_6BITS 0x3F
+#define LOW_3BITS 0x7
+#define LOW_4BITS 0xF
+#define LOW_5BITS 0x1F
+#define LOW_6BITS 0x3F
+#define BIT4 0x8
+#define BIT5 0x10
+#define BIT6 0x20
+#define BIT7 0x40
+#define BIT8 0x80
+#define L_SURROGATE_START 0xDC00
+#define L_SURROGATE_END 0xDFFF
+#define H_SURROGATE_START 0xD800
+#define H_SURROGATE_END 0xDBFF
+#define SURROGATE_PAIR_START 0x10000
 
 utf16string __cdecl conversions::utf8_to_utf16(const std::string &s)
 {
@@ -265,58 +278,63 @@ utf16string __cdecl conversions::utf8_to_utf16(const std::string &s)
     // of the characters are not just ASCII and collapse.
     dest.reserve(static_cast<size_t>(s.size() * .70));
     
-    const unsigned char *src = reinterpret_cast<const unsigned char *>(s.c_str());
-    auto srcRemainingSize = s.size();
-    while (srcRemainingSize > 0)
+    for (auto src = s.begin(); src != s.end(); ++src)
     {
-        if (*src < 0x7F) // single byte character, 0x0 to 0x7F
+        if ((*src & BIT8) == 0) // single byte character, 0x0 to 0x7F
         {
             dest.push_back(utf16string::value_type(*src));
         }
         else
         {
             unsigned char numContBytes = 0;
-            int32_t codePoint;
-            if (*src <= 0xDF) // 2 byte character, 0x80 to 0x7FF
+            uint32_t codePoint;
+            if ((*src & BIT6) == 0) // 2 byte character, 0x80 to 0x7FF
             {
-                codePoint = *src & 0x1F;
+                if ((*src & BIT8) != 0 && (*src & BIT7) == 0)
+                {
+                    throw std::range_error("UTF-8 string character can never start with 10xxxxxx");
+                }
+                codePoint = *src & LOW_5BITS;
                 numContBytes = 1;
             }
-            else if (*src <= 0xEF) // 3 byte character, 0x800 to 0xFFFF
+            else if ((*src & BIT5) == 0) // 3 byte character, 0x800 to 0xFFFF
             {
-                codePoint = *src & 0xF;
+                codePoint = *src & LOW_4BITS;
                 numContBytes = 2;
             }
-            else if (*src <= 0xF7) // 4 byte character, 0x10000 to 0x10FFFF
+            else if ((*src & BIT4) == 0) // 4 byte character, 0x10000 to 0x10FFFF
             {
-                codePoint = *src & 0x7;
+                codePoint = *src & LOW_3BITS;
                 numContBytes = 3;
             }
             else
             {
                 throw std::range_error("UTF-8 string has invalid Unicode code point");
             }
-            srcRemainingSize -= numContBytes;
-            if (srcRemainingSize <= 0)
-            {
-                throw std::range_error("UTF-8 string is missing bytes in character");
-            }
 
             for (unsigned char i = 0; i < numContBytes; ++i)
             {
+                if (++src == s.end())
+                {
+                    throw std::range_error("UTF-8 string is missing bytes in character");
+                }
+                if ((*src & BIT8) == 0 || (*src & BIT7) != 0)
+                {
+                    throw std::range_error("UTF-8 continuation byte is missing leading byte");
+                }
                 codePoint <<= 6;
-                codePoint |= *++src & LOWER_6BITS;
+                codePoint |= *src & LOW_6BITS;
             }
 
-            if (numContBytes == 3)
+            if (codePoint >= SURROGATE_PAIR_START)
             {
-                // In UTF-16 U+1000 to U+10FFFF are represented as two 16-bit code units, surrogate pairs.
+                // In UTF-16 U+10000 to U+10FFFF are represented as two 16-bit code units, surrogate pairs.
                 //  - 0x10000 is subtracted from the code point
                 //  - high surrogate is 0xD800 added to the top ten bits
                 //  - low surrogate is 0xDC00 added to the low ten bits
-                codePoint -= 0x10000;
-                dest.push_back(utf16string::value_type((codePoint >> 10) + 0xD800));
-                dest.push_back(utf16string::value_type((codePoint & 0x3FF) + 0xDC00));
+                codePoint -= SURROGATE_PAIR_START;
+                dest.push_back(utf16string::value_type((codePoint >> 10) | H_SURROGATE_START));
+                dest.push_back(utf16string::value_type((codePoint & 0x3FF) | L_SURROGATE_START));
             }
             else
             {
@@ -326,9 +344,6 @@ utf16string __cdecl conversions::utf8_to_utf16(const std::string &s)
                 dest.push_back(utf16string::value_type(codePoint));
             }
         }
-
-        --srcRemainingSize;
-        ++src;
     }
     return dest;
 #endif
@@ -342,59 +357,56 @@ std::string __cdecl conversions::utf16_to_utf8(const utf16string &w)
  #else
     std::string dest;
     dest.reserve(w.size());
-    const utf16string::value_type *src = w.c_str();
-    auto srcRemainingSize = w.size();
-    while (srcRemainingSize > 0)
+    for (auto src = w.begin(); src != w.end(); ++src)
     {
         // Check for high surrogate.
-        if (*src >= 0xD800 && *src <= 0xDBFF)
+        if (*src >= H_SURROGATE_START && *src <= H_SURROGATE_END)
         {
-            if (--srcRemainingSize == 0)
+            const auto highSurrogate = *src;
+            if (++src == w.end())
             {
                 throw std::range_error("UTF-16 string is missing low surrogate");
             }
+            const auto lowSurrogate = *src;
+            if (lowSurrogate < L_SURROGATE_START || lowSurrogate > L_SURROGATE_END)
+            {
+                throw std::range_error("UTF-16 string has invalid low surrogate");
+            }
 
             // To get from surrogate pair to Unicode code point:
             // - subract 0xD800 from high surrogate, this forms top ten bits
             // - subract 0xDC00 from low surrogate, this forms low ten bits
             // - add 0x10000
             // Leaves a code point in U+10000 to U+10FFFF range.
-            uint32_t codePoint = *src - 0xD800;
+            uint32_t codePoint = highSurrogate - H_SURROGATE_START;
             codePoint <<= 10;
-            codePoint += *++src - 0xDC00;
-            codePoint += 0x10000;
-            if (*src < 0xDC00 || *src > 0xDFFF)
-            {
-                throw std::range_error("UTF-16 string has invalid low surrogate");
-            }
+            codePoint |= lowSurrogate - L_SURROGATE_START;
+            codePoint |= SURROGATE_PAIR_START;
 
             // 4 bytes need using 21 bits
             dest.push_back(char(codePoint >> 18) | 0xF0);               // leading 3 bits
-            dest.push_back(((codePoint >> 12) & LOWER_6BITS) | 0x80);   // next 6 bits
-            dest.push_back(((codePoint >> 6) & LOWER_6BITS) | 0x80);    // next 6 bits
-            dest.push_back((codePoint & LOWER_6BITS) | 0x80);           // trailing 6 bits
+            dest.push_back(((codePoint >> 12) & LOW_6BITS) | BIT8);   // next 6 bits
+            dest.push_back(((codePoint >> 6) & LOW_6BITS) | BIT8);    // next 6 bits
+            dest.push_back((codePoint & LOW_6BITS) | BIT8);           // trailing 6 bits
         }
-        else if (*src <= 0xFFFF)
+        else
         {
-            if (*src < 0x7F) // single byte character
+            if (*src <= 0x7F) // single byte character
             {
                 dest.push_back(static_cast<char>(*src));
             }
             else if (*src <= 0x7FF) // 2 bytes needed (11 bits used)
             {
                 dest.push_back(char(*src >> 6) | 0xC0);             // leading 5 bits
-                dest.push_back((*src & LOWER_6BITS) | 0x80);        // trailing 6 bits
+                dest.push_back((*src & LOW_6BITS) | BIT8);        // trailing 6 bits
             }
             else // 3 bytes needed (16 bits used)
             {
                 dest.push_back((*src >> 12) | 0xE0);                // leading 4 bits
-                dest.push_back(((*src >> 6) & LOWER_6BITS) | 0x80); // middle 6 bits
-                dest.push_back((*src & LOWER_6BITS) | 0x80);        // trailing 6 bits
+                dest.push_back(((*src >> 6) & LOW_6BITS) | BIT8); // middle 6 bits
+                dest.push_back((*src & LOW_6BITS) | BIT8);        // trailing 6 bits
             }
         }
-
-        --srcRemainingSize;
-        ++src;
     }
 
     return dest;
diff --git a/Release/tests/functional/utils/strings.cpp b/Release/tests/functional/utils/strings.cpp
@@ -63,13 +63,17 @@ TEST(utf16_to_utf8)
 
     // encodes to single byte character
     VERIFY_ARE_EQUAL("ABC987", utility::conversions::utf16_to_utf8(UTF16("ABC987")));
+    utf16string input;
+    input.push_back(0x7F); // last ASCII character
+    auto result = utility::conversions::utf16_to_utf8(input);
+    VERIFY_ARE_EQUAL(0x7F, result[0]);
 
     // encodes to 2 byte character
-    utf16string input;
+    input.clear();
     input.push_back(0x80);
     input.push_back(0x14D);
     input.push_back(0x7FF);
-    auto result = utility::conversions::utf16_to_utf8(input);
+    result = utility::conversions::utf16_to_utf8(input);
 #if defined(__GLIBCXX__)
     VERIFY_ARE_EQUAL(-62, result[0]);
     VERIFY_ARE_EQUAL(-128, result[1]);
@@ -139,53 +143,63 @@ TEST(utf8_to_utf16)
 
     // single byte character
     VERIFY_ARE_EQUAL(UTF16("ABC123"), utility::conversions::utf8_to_utf16("ABC123"));
-
-    // 2 byte character
     std::string input;
-    input.push_back(207u); // 11001111
-    input.push_back(129u); // 10000001
-    input.push_back(198u); // 11000110
-    input.push_back(141u); // 10001101
+    input.push_back(0x7F); // last ASCII character
     auto result = utility::conversions::utf8_to_utf16(input);
+    VERIFY_ARE_EQUAL(0x7F, result[0]);
+
+    // 2 byte character
+    input.clear();
+    // U+400
+    input.push_back(208u); // 11010000
+    input.push_back(128u); // 10000000
+    // U+7FF
+    input.push_back(223u); // 11011111
+    input.push_back(191u); // 10111111
+    result = utility::conversions::utf8_to_utf16(input);
 #if defined(__GLIBCXX__)
-    VERIFY_ARE_EQUAL(961, result[0]);
-    VERIFY_ARE_EQUAL(397, result[1]);
+    VERIFY_ARE_EQUAL(1024, result[0]);
+    VERIFY_ARE_EQUAL(2047, result[1]);
 #else
     VERIFY_ARE_EQUAL(conversion.from_bytes(input), result);
 #endif
 
     // 3 byte character
     input.clear();
-    input.push_back(230u); // 11100110
-    input.push_back(141u); // 10001101
-    input.push_back(157u); // 10011101
-    input.push_back(231u); // 11100111
-    input.push_back(143u); // 10001111
-    input.push_back(156u); // 10011100
+    // U+8000
+    input.push_back(232u); // 11101000
+    input.push_back(128u); // 10000000
+    input.push_back(128u); // 10000000
+    // U+FFFF
+    input.push_back(239u); // 11101111
+    input.push_back(191u); // 10111111
+    input.push_back(191u); // 10111111
     result = utility::conversions::utf8_to_utf16(input);
 #if defined(__GLIBCXX__)
-    VERIFY_ARE_EQUAL(25437, result[0]);
-    VERIFY_ARE_EQUAL(29660, result[1]);
+    VERIFY_ARE_EQUAL(32768, result[0]);
+    VERIFY_ARE_EQUAL(65535, result[1]);
 #else
     VERIFY_ARE_EQUAL(conversion.from_bytes(input), result);
 #endif
 
     // 4 byte character
     input.clear();
-    input.push_back(240u); // 11110000
-    input.push_back(173u); // 10101101
-    input.push_back(157u); // 10011101
+    // U+100000
+    input.push_back(244u); // 11110100
+    input.push_back(128u); // 10000000
+    input.push_back(128u); // 10000000
+    input.push_back(128u); // 10000000
+    // U+10FFFF
+    input.push_back(244u); // 11110100
     input.push_back(143u); // 10001111
-    input.push_back(240u); // 11111000
-    input.push_back(161u); // 10100001
     input.push_back(191u); // 10111111
     input.push_back(191u); // 10111111
     result = utility::conversions::utf8_to_utf16(input);
 #if defined(__GLIBCXX__)
-    VERIFY_ARE_EQUAL(55413, result[0]);
-    VERIFY_ARE_EQUAL(57167, result[1]);
-    VERIFY_ARE_EQUAL(55296, result[2]);
-    VERIFY_ARE_EQUAL(57160, result[3]);
+    VERIFY_ARE_EQUAL(56256, result[0]);
+    VERIFY_ARE_EQUAL(56320, result[1]);
+    VERIFY_ARE_EQUAL(56319, result[2]);
+    VERIFY_ARE_EQUAL(57343, result[3]);
 #else
     VERIFY_ARE_EQUAL(conversion.from_bytes(input), result);
 #endif
@@ -226,6 +240,22 @@ TEST(utf8_to_utf16_errors)
     input.push_back(173u); // 10101101
     input.push_back(157u); // 10011101
     VERIFY_THROWS(utility::conversions::utf8_to_utf16(input), std::range_error);
+
+    // continuation byte missing leading 10xxxxxx
+    input.clear();
+    input.push_back(230u); // 11100110
+    input.push_back(141u); // 10001101 (NOTE: valid continuation byte; 00001101 would be 13u)
+    VERIFY_THROWS(utility::conversions::utf8_to_utf16(input), std::range_error);
+    input.clear();
+    input.push_back(230u); // 11100110
+    input.push_back(141u); // 10001101 (NOTE: valid continuation byte; 11001101 would be 205u)
+    VERIFY_THROWS(utility::conversions::utf8_to_utf16(input), std::range_error);
+
+    // invalid for a first character to start with 10xxxxxx
+    input.clear();
+    input.push_back(128u); // 10000000
+    input.push_back(128u); // 10000000
+    VERIFY_THROWS(utility::conversions::utf8_to_utf16(input), std::range_error);
 }
 
 TEST(latin1_to_utf16)