Skip to content

Commit a2ae097

Browse files
committed
Fix UTF-32 encoder/decoder.
Add tests for UTF-8 to UTF-32 and UTF-32 to UTF-8 encoders.
1 parent 2a36a61 commit a2ae097

File tree

3 files changed

+194
-43
lines changed

3 files changed

+194
-43
lines changed

stew/conio.nim

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -355,7 +355,7 @@ elif defined(posix):
355355
if maxChars < len(wbuffer):
356356
wbuffer.setLen(maxChars)
357357
# Conversion of wide characters sequence to UTF-8 encoded string.
358-
let ures = wbuffer.wcharToUtf8()
358+
let ures = wbuffer.utf32toUtf8()
359359
if ures.isOk():
360360
ok(ures.get())
361361
else:

stew/utf8.nim

Lines changed: 161 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,8 @@ export results
1212

1313
type
  # Result alias used by this module: payload ``T`` or a ``cstring`` error.
  UResult*[T] = Result[T, cstring]
  # 32-bit integer types accepted as UTF-32 code units.
  Wides32* = uint32 | int32
  # 16-bit integer types.
  Wides16* = uint16 | int16
  # 8-bit types accepted as UTF-8 octets.
  Bytes* = byte | uint8 | char | int8
1718

1819
const
@@ -206,72 +207,200 @@ proc utf8Substr*[T: Bytes](data: openarray[T],
206207
inc(k)
207208
ok(res)
208209

209-
proc utf32toUtf8*[A: Wides32, B: Bytes](input: openarray[A],
                                        output: var openarray[B]): UResult[int] =
  ## Encodes the UTF-32 sequence ``input`` as UTF-8 octets into ``output`` and
  ## returns the total number of octets of the encoded form.
  ##
  ## When ``output`` has zero length nothing is stored and only the number of
  ## octets is computed and returned.
  ##
  ## Returns ``ErrorInvalidSequence`` if ``input`` contains a surrogate
  ## (U+D800 .. U+DFFF), U+FFFE, U+FFFF or a value above U+10FFFF.
  ## Returns ``ErrorBufferOverflow`` if ``output`` is not empty and is too
  ## small to hold the encoded form.
  let capacity = len(output)
  var written = 0
  for unit in input:
    let cp = uint32(unit)
    # High and low surrogates U+D800 through U+DFFF are prohibited in UTF-32.
    if cp >= 0xD800'u32 and cp <= 0xDFFF'u32:
      return err(ErrorInvalidSequence)
    # These codes are intended for process-internal use and are not Unicode
    # characters.
    if cp == 0xFFFE'u32 or cp == 0xFFFF'u32:
      return err(ErrorInvalidSequence)

    # Number of octets needed for this code point.
    let count =
      if cp <= 0x7F'u32: 1
      elif cp <= 0x7FF'u32: 2
      elif cp <= 0xFFFF'u32: 3
      elif cp <= 0x10FFFF'u32: 4
      else: return err(ErrorInvalidSequence)

    # A zero-length ``output`` means "measure only", so nothing is stored.
    if capacity > 0:
      if written + count > capacity:
        return err(ErrorBufferOverflow)
      case count
      of 1:
        output[written] = cast[B](cp and 0x7F'u32)
      of 2:
        output[written + 0] = cast[B](0xC0'u8 or
                                      byte((cp shr 6) and 0x1F'u32))
        output[written + 1] = cast[B](0x80'u8 or byte(cp and 0x3F'u32))
      of 3:
        output[written + 0] = cast[B](0xE0'u8 or
                                      byte((cp shr 12) and 0x0F'u32))
        output[written + 1] = cast[B](0x80'u8 or
                                      byte((cp shr 6) and 0x3F'u32))
        output[written + 2] = cast[B](0x80'u8 or byte(cp and 0x3F'u32))
      else:
        output[written + 0] = cast[B](0xF0'u8 or
                                      byte((cp shr 18) and 0x07'u32))
        output[written + 1] = cast[B](0x80'u8 or
                                      byte((cp shr 12) and 0x3F'u32))
        output[written + 2] = cast[B](0x80'u8 or
                                      byte((cp shr 6) and 0x3F'u32))
        output[written + 3] = cast[B](0x80'u8 or byte(cp and 0x3F'u32))
    inc(written, count)
  ok(written)
268268

269-
proc utf32toUtf8*[T: Wides32](input: openarray[T]): UResult[string] {.inline.} =
  ## Converts UTF-32 sequence ``input`` to UTF-8 encoded string.
  ##
  ## Any error reported by the buffer-based ``utf32toUtf8`` overload is
  ## returned unchanged.
  var empty: array[0, char]
  # First pass with a zero-length buffer only computes the encoded size.
  let size = ? utf32toUtf8(input, empty)
  var output = newString(size)
  # Second pass stores the octets into the exactly sized string.
  let res {.used.} = ? utf32toUtf8(input, output)
  ok(output)
276+
277+
proc utf8toUtf32*[A: Bytes, B: Wides32](input: openarray[A],
                                        output: var openarray[B]): UResult[int] =
  ## Converts UTF-8 encoded array of characters ``input`` to a UTF-32 encoded
  ## sequence of 32-bit limbs stored in ``output``, and returns the number of
  ## limbs stored.
  ##
  ## To obtain the required size of ``output`` pass a zero-length ``output``:
  ## in that case nothing is stored and the result of ``utf8Length(input)`` is
  ## returned.
  ##
  ## If ``output`` is not empty and there is not enough space in it to store
  ## the whole decoded ``input``, ``ErrorBufferOverflow`` is returned.
  ##
  ## ``ErrorInvalidSequence`` is returned for truncated sequences, invalid
  ## lead or continuation octets, overlong forms, encoded surrogates
  ## (U+D800 .. U+DFFF), U+FFFE, U+FFFF and code points above U+10FFFF.
  template octet(pos: int): uint32 =
    # Reinterpret the element as an unsigned octet before widening, so that
    # signed element types (``int8``) holding values with the high bit set are
    # seen as 0x80 .. 0xFF by the comparisons below.
    uint32(cast[uint8](input[pos]))

  var index = 0
  var dindex = 0
  if len(output) == 0:
    return utf8Length(input)
  else:
    while true:
      if index >= len(input):
        break
      let byte1 = octet(index)
      inc(index)

      if (byte1 and 0x80'u32) == 0x00'u32:
        # One-byte form (0xxxxxxx)
        if dindex < len(output):
          output[dindex] = B(byte1)
          inc(dindex)
        else:
          return err(ErrorBufferOverflow)
      elif (byte1 and 0xE0'u32) == 0xC0'u32:
        # Two-byte form (110xxxxx 10xxxxxx)
        if index >= len(input):
          return err(ErrorInvalidSequence)
        # overlong sequence test
        if (byte1 and 0xFE'u32) == 0xC0'u32:
          return err(ErrorInvalidSequence)

        let byte2 = octet(index)
        if (byte2 and 0xC0'u32) != 0x80'u32:
          return err(ErrorInvalidSequence)

        if dindex < len(output):
          output[dindex] = B(((byte1 and 0x1F'u32) shl 6) or
                             (byte2 and 0x3F'u32))
          inc(dindex)
        else:
          return err(ErrorBufferOverflow)
        inc(index)
      elif (byte1 and 0xF0'u32) == 0xE0'u32:
        # Three-byte form (1110xxxx 10xxxxxx 10xxxxxx)
        if (index + 1) >= len(input):
          return err(ErrorInvalidSequence)

        let byte2 = octet(index)
        if (byte2 and 0xC0'u32) != 0x80'u32:
          return err(ErrorInvalidSequence)
        # overlong sequence test
        if (byte1 == 0xE0'u32) and ((byte2 and 0xE0'u32) == 0x80'u32):
          return err(ErrorInvalidSequence)
        # 0xD800–0xDFFF (UTF-16 surrogates) test
        if (byte1 == 0xED'u32) and ((byte2 and 0xE0'u32) == 0xA0'u32):
          return err(ErrorInvalidSequence)

        let byte3 = octet(index + 1)
        if (byte3 and 0xC0'u32) != 0x80'u32:
          return err(ErrorInvalidSequence)
        # U+FFFE or U+FFFF test
        if (byte1 == 0xEF'u32) and (byte2 == 0xBF'u32) and
           ((byte3 and 0xFE'u32) == 0xBE'u32):
          return err(ErrorInvalidSequence)

        if dindex < len(output):
          output[dindex] = B(((byte1 and 0x0F'u32) shl 12) or
                             ((byte2 and 0x3F'u32) shl 6) or
                             (byte3 and 0x3F'u32))
          inc(dindex)
        else:
          return err(ErrorBufferOverflow)
        inc(index, 2)

      elif (byte1 and 0xF8'u32) == 0xF0'u32:
        # Four-byte form (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
        if (index + 2) >= len(input):
          return err(ErrorInvalidSequence)

        let byte2 = octet(index)
        if (byte2 and 0xC0'u32) != 0x80'u32:
          return err(ErrorInvalidSequence)
        # overlong sequence test
        if (byte1 == 0xF0'u32) and ((byte2 and 0xF0'u32) == 0x80'u32):
          return err(ErrorInvalidSequence)
        # According to RFC 3629 no point above U+10FFFF should be used, which
        # limits characters to four bytes.
        if ((byte1 == 0xF4'u32) and (byte2 > 0x8F'u32)) or (byte1 > 0xF4'u32):
          return err(ErrorInvalidSequence)

        let byte3 = octet(index + 1)
        if (byte3 and 0xC0'u32) != 0x80'u32:
          return err(ErrorInvalidSequence)

        let byte4 = octet(index + 2)
        if (byte4 and 0xC0'u32) != 0x80'u32:
          return err(ErrorInvalidSequence)

        if dindex < len(output):
          output[dindex] = B(((byte1 and 0x07'u32) shl 18) or
                             ((byte2 and 0x3F'u32) shl 12) or
                             ((byte3 and 0x3F'u32) shl 6) or
                             (byte4 and 0x3F'u32))
          inc(dindex)
        else:
          return err(ErrorBufferOverflow)
        inc(index, 3)

      else:
        # Stray continuation octet or a lead octet of a 5+ byte form.
        return err(ErrorInvalidSequence)

  ok(dindex)
395+
396+
proc utf8toUtf32*[A: Bytes, B: Wides32](et: typedesc[B],
                                        input: openarray[A]): UResult[seq[B]] =
  ## Decodes the UTF-8 encoded ``input`` and returns it as a newly allocated
  ## sequence of 32-bit limbs of type ``et``.
  ##
  ## Errors of the buffer-based ``utf8toUtf32`` overload are passed through.
  var sizer: array[0, B]
  # A zero-length destination makes the first call report the needed length.
  let count = ? utf8toUtf32(input, sizer)
  var decoded = newSeq[B](count)
  # The second call stores the limbs into the sequence sized above.
  let stored {.used.} = ? utf8toUtf32(input, decoded)
  ok(decoded)
276405

277406
when defined(posix):

tests/test_utf8.nim

Lines changed: 32 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -302,18 +302,40 @@ suite "UTF-8 validation test suite":
302302
utf8Substr("🤗🤘🤙🤚🤛🤜🤝🤞🤟", 100, 0).tryGet() == ""
303303
utf8Substr("🤗🤘🤙🤚🤛🤜🤝🤞🤟", 100, 100).tryGet() == ""
304304

305-
test "UTF-32 -> UTF-8 conversion test":
  # Walks every value from 0 up to and including 0x11_0000 (the first value
  # past the Unicode range), so the "above U+10FFFF" rejection is exercised.
  for i in 0 .. 0x11_0000:
    let data32 = [uint32(i)]
    let res = utf32toUtf8(data32)
    if (i >= 0xD800 and i <= 0xDFFF) or i == 0xFFFE or i == 0xFFFF or
       i == 0x11_0000:
      # Surrogates, U+FFFE, U+FFFF and out-of-range values must be rejected.
      check res.isErr()
    else:
      check:
        res.isOk() == true
        utf8Validate(res.get()) == true
322+
323+
test "UTF-8 -> UTF-32 conversion test":
  # Round-trips every encodable value through UTF-8 and back; values the
  # encoder must refuse are only checked for the refusal.
  for i in 0 ..< 0x11_0001:
    let data32 = [uint32(i)]
    let rejected =
      (i >= 0xD800 and i <= 0xDFFF) or (i == 0xFFFE) or (i == 0xFFFF) or
      (i == 0x11_0000)
    if rejected:
      check utf32toUtf8(data32).isErr()
    else:
      let res8 = utf32toUtf8(data32)
      check res8.isOk()
      let res32 = utf8toUtf32(uint32, res8.get())
      check:
        res32.isOk()
        res32.get() == data32

0 commit comments

Comments
 (0)