Fix UTF-32 encoder/decoder.

cheatfate · cheatfate · commit a2ae097d88e1 · 2020-10-14T12:25:33.000+03:00
Add tests for UTF-8 to UTF-32 and UTF-32 to UTF-8 encoders.
diff --git a/stew/conio.nim b/stew/conio.nim
@@ -355,7 +355,7 @@ elif defined(posix):
         if maxChars < len(wbuffer):
           wbuffer.setLen(maxChars)
         # Conversion of wide characters sequence to UTF-8 encoded string.
-        let ures = wbuffer.wcharToUtf8()
+        let ures = wbuffer.utf32toUtf8()
         if ures.isOk():
           ok(ures.get())
         else:
diff --git a/stew/utf8.nim b/stew/utf8.nim
@@ -12,7 +12,8 @@ export results
 
 type
   UResult*[T] = Result[T, cstring]
-  Wides* = int16 | uint16 | int32 | uint32
+  Wides32* = int32 | uint32 ## 32-bit integer types accepted as UTF-32 values.
+  Wides16* = int16 | uint16 ## 16-bit integer types; presumably UTF-16 units.
   Bytes* = int8 | char | uint8 | byte
 
 const
@@ -206,72 +207,200 @@ proc utf8Substr*[T: Bytes](data: openarray[T],
     inc(k)
   ok(res)
 
-proc wcharToUtf8*[A: Wides, B: Bytes](input: openarray[A],
+proc utf32toUtf8*[A: Wides32, B: Bytes](input: openarray[A],
                                       output: var openarray[B]): UResult[int] =
-  ## Converts WCHAR sequence ``input`` to UTF-8 array of octets ``output``.
-  ##
-  ## Procedure supports 4-byte (Linux) and 2-byte sequences (Windows) as input.
+  ## Encodes UTF-32 ``input`` to UTF-8; empty ``output`` only counts bytes.
   var offset = 0
   for item in input:
-    let uitem = uint(item)
     let codepoint =
-      if uitem >= 0xD800'u and uitem <= 0xDBFF'u:
-        0x10000'u + ((uitem - 0xD800'u) shl 10)
-      else:
-        if uitem >= 0xDC00'u and uitem <= 0xDFFF'u:
-          uitem - 0xDC00'u
-        else:
-          uitem
-    if codepoint <= 0x7F'u:
+      block:
+        if (uint32(item) >= 0xD800'u32) and (uint32(item) <= 0xDFFF'u32):
+          # High and low surrogates U+D800..U+DFFF are prohibited in UTF-32.
+          return err(ErrorInvalidSequence)
+        elif (uint32(item) == 0xFFFE'u32) or (uint32(item) == 0xFFFF'u32):
+          # U+FFFE and U+FFFF are reserved for process-internal use and are
+          # not Unicode characters.
+          return err(ErrorInvalidSequence)
+        uint32(item)
+    if codepoint <= 0x7F'u32:
       if len(output) > 0:
         if offset < len(output):
-          output[offset] = cast[B](codepoint and 0x7F'u)
+          output[offset] = cast[B](codepoint and 0x7F'u32)
         else:
           return err(ErrorBufferOverflow)
       inc(offset, 1)
-    elif codepoint <= 0x7FF'u:
+    elif codepoint <= 0x7FF'u32:
       if len(output) > 0:
         if offset + 1 < len(output):
           output[offset + 0] = cast[B](0xC0'u8 or
-                                       byte((codepoint shr 6) and 0x1F'u))
-          output[offset + 1] = cast[B](0x80'u8 or byte(codepoint and 0x3F'u))
+                                       byte((codepoint shr 6) and 0x1F'u32))
+          output[offset + 1] = cast[B](0x80'u8 or byte(codepoint and 0x3F'u32))
         else:
           return err(ErrorBufferOverflow)
       inc(offset, 2)
-    elif codepoint <= 0xFFFF'u:
+    elif codepoint <= 0xFFFF'u32:
       if len(output) > 0:
         if offset + 2 < len(output):
           output[offset + 0] = cast[B](0xE0'u8 or
-                                       byte((codepoint shr 12) and 0x0F'u))
+                                       byte((codepoint shr 12) and 0x0F'u32))
           output[offset + 1] = cast[B](0x80'u8 or
-                                       byte((codepoint shr 6) and 0x3F'u))
-          output[offset + 2] = cast[B](0x80'u8 or byte(codepoint and 0x3F'u))
+                                       byte((codepoint shr 6) and 0x3F'u32))
+          output[offset + 2] = cast[B](0x80'u8 or byte(codepoint and 0x3F'u32))
         else:
           return err(ErrorBufferOverflow)
       inc(offset, 3)
-    elif codepoint <= 0x10FFFF'u:
+    elif codepoint <= 0x10FFFF'u32:
       if len(output) > 0:
         if offset + 3 < len(output):
           output[offset + 0] = cast[B](0xF0'u8 or
-                                       byte((codepoint shr 18) and 0x07'u))
+                                       byte((codepoint shr 18) and 0x07'u32))
           output[offset + 1] = cast[B](0x80'u8 or
-                                       byte((codepoint shr 12) and 0x3F'u))
+                                       byte((codepoint shr 12) and 0x3F'u32))
           output[offset + 2] = cast[B](0x80'u8 or
-                                       byte((codepoint shr 6) and 0x3F'u))
-          output[offset + 3] = cast[B](0x80'u8 or byte(codepoint and 0x3F'u))
+                                       byte((codepoint shr 6) and 0x3F'u32))
+          output[offset + 3] = cast[B](0x80'u8 or byte(codepoint and 0x3F'u32))
         else:
-          return err("")
+          return err(ErrorBufferOverflow)
       inc(offset, 4)
     else:
       return err(ErrorInvalidSequence)
   ok(offset)
 
-proc wcharToUtf8*[T: Wides](input: openarray[T]): UResult[string] {.inline.} =
-  ## Converts wide character
+proc utf32toUtf8*[T: Wides32](input: openarray[T]): UResult[string] {.inline.} =
+  ## Converts UTF-32 ``input`` to a UTF-8 string, propagating encoder errors.
   var empty: array[0, char]
-  let size = ? wcharToUtf8(input, empty)
+  let size = ? utf32toUtf8(input, empty)
   var output = newString(size)
-  let res {.used.} = ? wcharToUtf8(input, output)
+  let res {.used.} = ? utf32toUtf8(input, output)
+  ok(output)
+
+proc utf8toUtf32*[A: Bytes, B: Wides32](input: openarray[A],
+                                       output: var openarray[B]): UResult[int] =
+  ## Decodes the UTF-8 encoded ``input`` into UTF-32 values stored in
+  ## ``output`` and returns the number of values written.
+  ##
+  ## To obtain the required size of ``output``, pass a zero-length ``output``:
+  ## the result of ``utf8Length(input)`` is then returned and nothing is
+  ## written.
+  ##
+  ## Errors:
+  ## * ``ErrorBufferOverflow`` - non-empty ``output`` is too small.
+  ## * ``ErrorInvalidSequence`` - invalid lead or continuation byte,
+  ##   truncated or overlong sequence, UTF-16 surrogate (U+D800..U+DFFF),
+  ##   U+FFFE, U+FFFF, or a value above U+10FFFF.
+  var index = 0
+  var dindex = 0
+  if len(output) == 0:
+    return utf8Length(input)
+  else:
+    while index < len(input):
+      let byte1 = uint32(input[index])
+      inc(index)
+
+      if (byte1 and 0x80'u32) == 0x00'u32:
+        if dindex < len(output):
+          output[dindex] = B(byte1)
+          inc(dindex)
+        else:
+          return err(ErrorBufferOverflow)
+      elif (byte1 and 0xE0'u32) == 0xC0'u32:
+        # Two-byte form (110xxxxx 10xxxxxx)
+        if index >= len(input):
+          return err(ErrorInvalidSequence)
+        # Overlong sequence test: lead bytes 0xC0 and 0xC1 are never valid.
+        if (byte1 and 0xFE'u32) == 0xC0'u32:
+          return err(ErrorInvalidSequence)
+
+        let byte2 = uint32(input[index])
+        if (byte2 and 0xC0'u32) != 0x80'u32:
+          return err(ErrorInvalidSequence)
+
+        if dindex < len(output):
+          output[dindex] = B(((byte1 and 0x1F'u32) shl 6) or
+                              (byte2 and 0x3F'u32))
+          inc(dindex)
+        else:
+          return err(ErrorBufferOverflow)
+        inc(index)
+      elif (byte1 and 0xF0'u32) == 0xE0'u32:
+        # Three-byte form (1110xxxx 10xxxxxx 10xxxxxx)
+        if (index + 1) >= len(input):
+          return err(ErrorInvalidSequence)
+
+        let byte2 = uint32(input[index])
+        if (byte2 and 0xC0'u32) != 0x80'u32:
+          return err(ErrorInvalidSequence)
+        # Overlong sequence test (value would fit in two bytes).
+        if (byte1 == 0xE0'u32) and ((byte2 and 0xE0'u32) == 0x80'u32):
+          return err(ErrorInvalidSequence)
+        # 0xD800..0xDFFF (UTF-16 surrogates) test
+        if (byte1 == 0xED'u32) and ((byte2 and 0xE0'u32) == 0xA0'u32):
+          return err(ErrorInvalidSequence)
+
+        let byte3 = uint32(input[index + 1])
+        if (byte3 and 0xC0'u32) != 0x80'u32:
+          return err(ErrorInvalidSequence)
+        # U+FFFE or U+FFFF test
+        if (byte1 == 0xEF'u32) and (byte2 == 0xBF'u32) and
+           ((byte3 and 0xFE'u32) == 0xBE'u32):
+          return err(ErrorInvalidSequence)
+
+        if dindex < len(output):
+          output[dindex] = B(((byte1 and 0x0F'u32) shl 12) or
+                             ((byte2 and 0x3F'u32) shl 6) or
+                              (byte3 and 0x3F'u32))
+          inc(dindex)
+        else:
+          return err(ErrorBufferOverflow)
+        inc(index, 2)
+
+      elif (byte1 and 0xF8'u32) == 0xF0'u32:
+        # Four-byte form (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
+        if (index + 2) >= len(input):
+          return err(ErrorInvalidSequence)
+
+        let byte2 = uint32(input[index])
+        if (byte2 and 0xC0'u32) != 0x80'u32:
+          return err(ErrorInvalidSequence)
+        # Overlong sequence test (value would fit in three bytes).
+        if (byte1 == 0xF0'u32) and ((byte2 and 0xF0'u32) == 0x80'u32):
+          return err(ErrorInvalidSequence)
+        # According to RFC 3629 no point above U+10FFFF should be used, which
+        # limits characters to four bytes.
+        if ((byte1 == 0xF4'u32) and (byte2 > 0x8F'u32)) or (byte1 > 0xF4'u32):
+          return err(ErrorInvalidSequence)
+
+        let byte3 = uint32(input[index + 1])
+        if (byte3 and 0xC0'u32) != 0x80'u32:
+          return err(ErrorInvalidSequence)
+
+        let byte4 = uint32(input[index + 2])
+        if (byte4 and 0xC0'u32) != 0x80'u32:
+          return err(ErrorInvalidSequence)
+
+        if dindex < len(output):
+          output[dindex] = B(((byte1 and 0x07'u32) shl 18) or
+                             ((byte2 and 0x3F'u32) shl 12) or
+                             ((byte3 and 0x3F'u32) shl 6) or
+                              (byte4 and 0x3F'u32))
+          inc(dindex)
+        else:
+          return err(ErrorBufferOverflow)
+        inc(index, 3)
+
+      else:
+        return err(ErrorInvalidSequence)
+
+    ok(dindex)
+
+proc utf8toUtf32*[A: Bytes, B: Wides32](et: typedesc[B],
+                                        input: openarray[A]): UResult[seq[B]] =
+  ## Decodes UTF-8 ``input`` into a freshly allocated ``seq[B]`` of UTF-32
+  ## values; ``et`` only selects the element type ``B``.
+  var probe: array[0, B]
+  let needed = ? utf8toUtf32(input, probe)
+  var output = newSeq[B](needed)
+  let written {.used.} = ? utf8toUtf32(input, output)
   ok(output)
 
 when defined(posix):
diff --git a/tests/test_utf8.nim b/tests/test_utf8.nim
@@ -302,18 +302,40 @@ suite "UTF-8 validation test suite":
       utf8Substr("🤗🤘🤙🤚🤛🤜🤝🤞🤟", 100, 0).tryGet() == ""
       utf8Substr("🤗🤘🤙🤚🤛🤜🤝🤞🤟", 100, 100).tryGet() == ""
 
-  test "wcharToUtf8() tests":
-    for i in 0 ..< 0x11_0000:
-      if i != 0xFFFE and i != 0xFFFF:
-        if i < 0x10000:
-          var data16 = [uint16(i)]
-          let res = wcharToUtf8(data16)
-          check:
-            res.isOk() == true
-            utf8Validate(res.get()) == true
-
-        var data32 = [uint32(i)]
-        let res = wcharToUtf8(data32)
+  test "UTF-32 -> UTF-8 conversion test":
+    # 0x11_0000 is included so the out-of-range branch below is exercised.
+    for i in 0 ..< 0x11_0001:
+      let data32 = [uint32(i)]
+      if i >= 0xD800 and i <= 0xDFFF:
+        check utf32toUtf8(data32).isErr()
+      elif i == 0xFFFE:
+        check utf32toUtf8(data32).isErr()
+      elif i == 0xFFFF:
+        check utf32toUtf8(data32).isErr()
+      elif i == 0x11_0000:
+        check utf32toUtf8(data32).isErr()
+      else:
+        let res = utf32toUtf8(data32)
         check:
           res.isOk() == true
           utf8Validate(res.get()) == true
+
+  test "UTF-8 -> UTF-32 conversion test":
+    for i in 0 ..< 0x11_0001:
+      let data32 = [uint32(i)]
+      if i >= 0xD800 and i <= 0xDFFF:
+        check utf32toUtf8(data32).isErr()
+      elif i == 0xFFFE:
+        check utf32toUtf8(data32).isErr()
+      elif i == 0xFFFF:
+        check utf32toUtf8(data32).isErr()
+      elif i == 0x11_0000:
+        check utf32toUtf8(data32).isErr()
+      else:
+        # Round trip: encode to UTF-8, decode back, expect the same value.
+        let res8 = utf32toUtf8(data32)
+        check res8.isOk()
+        let res32 = utf8toUtf32(uint32, res8.get())
+        check:
+          res32.isOk()
+          res32.get() == data32