@@ -12,7 +12,8 @@ export results
1212
type
  UResult*[T] = Result[T, cstring]
    ## Result type of the procedures in this module: a value of type ``T`` on
    ## success or a ``cstring`` error message on failure.
  Wides16* = int16 | uint16
    ## Integer types accepted as 16-bit wide code units.
  Wides32* = int32 | uint32
    ## Integer types accepted as 32-bit wide code units (UTF-32).
  Bytes* = int8 | char | uint8 | byte
    ## Integer/character types accepted as single octets (UTF-8).
1718
1819const
@@ -206,72 +207,200 @@ proc utf8Substr*[T: Bytes](data: openarray[T],
206207 inc (k)
207208 ok (res)
208209
proc utf32toUtf8*[A: Wides32, B: Bytes](input: openarray[A],
                                        output: var openarray[B]): UResult[int] =
  ## Encodes the UTF-32 code units of ``input`` as UTF-8 octets in ``output``.
  ##
  ## When ``output`` has zero length nothing is written and only the number of
  ## octets needed for the encoding is counted. Otherwise octets are stored
  ## starting at index 0, and ``ErrorBufferOverflow`` is returned as soon as
  ## an encoded code point does not fit into the remaining space.
  ##
  ## ``ErrorInvalidSequence`` is returned for surrogates (U+D800..U+DFFF),
  ## for U+FFFE and U+FFFF, and for values above U+10FFFF.
  ##
  ## On success the result is the number of octets produced (or required).
  let countOnly = len(output) == 0
  var written = 0
  for unit in input:
    let cp = uint32(unit)

    # High and low surrogates are prohibited in UTF-32.
    if cp >= 0xD800'u32 and cp <= 0xDFFF'u32:
      return err(ErrorInvalidSequence)
    # U+FFFE and U+FFFF are reserved for process-internal use and are not
    # Unicode characters.
    if cp == 0xFFFE'u32 or cp == 0xFFFF'u32:
      return err(ErrorInvalidSequence)

    # Number of octets this code point occupies in UTF-8.
    let width =
      if cp <= 0x7F'u32: 1
      elif cp <= 0x7FF'u32: 2
      elif cp <= 0xFFFF'u32: 3
      elif cp <= 0x10FFFF'u32: 4
      else: return err(ErrorInvalidSequence)

    if not countOnly:
      if written + width > len(output):
        return err(ErrorBufferOverflow)
      case width
      of 1:
        output[written] = cast[B](byte(cp and 0x7F'u32))
      of 2:
        output[written + 0] = cast[B](0xC0'u8 or
                                      byte((cp shr 6) and 0x1F'u32))
        output[written + 1] = cast[B](0x80'u8 or byte(cp and 0x3F'u32))
      of 3:
        output[written + 0] = cast[B](0xE0'u8 or
                                      byte((cp shr 12) and 0x0F'u32))
        output[written + 1] = cast[B](0x80'u8 or
                                      byte((cp shr 6) and 0x3F'u32))
        output[written + 2] = cast[B](0x80'u8 or byte(cp and 0x3F'u32))
      else:
        output[written + 0] = cast[B](0xF0'u8 or
                                      byte((cp shr 18) and 0x07'u32))
        output[written + 1] = cast[B](0x80'u8 or
                                      byte((cp shr 12) and 0x3F'u32))
        output[written + 2] = cast[B](0x80'u8 or
                                      byte((cp shr 6) and 0x3F'u32))
        output[written + 3] = cast[B](0x80'u8 or byte(cp and 0x3F'u32))
    inc(written, width)
  ok(written)
268268
proc utf32toUtf8*[T: Wides32](input: openarray[T]): UResult[string] {.inline.} =
  ## Converts UTF-32 sequence ``input`` to a UTF-8 encoded string.
  ##
  ## Any error reported by the buffer-based ``utf32toUtf8`` overload is
  ## returned unchanged.
  # First pass: a zero-length buffer makes the converter only count octets.
  var probe: array[0, char]
  let needed = ? utf32toUtf8(input, probe)
  # Second pass: encode into a string of exactly the required length.
  var encoded = newString(needed)
  let stored {.used.} = ? utf32toUtf8(input, encoded)
  ok(encoded)
276+
proc utf8toUtf32*[A: Bytes, B: Wides32](input: openarray[A],
                                        output: var openarray[B]): UResult[int] =
  ## Convert UTF-8 encoded array of characters ``input`` to UTF-32 encoded
  ## sequence of 32bit limbs stored in ``output``.
  ##
  ## To obtain the required size of ``output`` pass a zero-length ``output``:
  ## in that case nothing is written and the result of ``utf8Length(input)``
  ## is returned.
  ##
  ## If ``output`` is not empty and there is not enough space to store the
  ## whole decoded ``input``, ``ErrorBufferOverflow`` is returned.
  ##
  ## ``ErrorInvalidSequence`` is returned for truncated sequences, invalid
  ## lead or continuation octets, overlong forms, encoded surrogates
  ## (U+D800..U+DFFF), U+FFFE/U+FFFF and code points above U+10FFFF.
  ##
  ## On success the result is the number of limbs written to ``output``.
  template octet(v: untyped): uint32 =
    # Reinterpret as ``uint8`` before widening: a plain ``uint32(v)`` would
    # sign-extend negative ``int8`` values (e.g. 0xE0 -> 0xFFFFFFE0) and break
    # the equality/range checks on lead octets below.
    uint32(cast[uint8](v))

  if len(output) == 0:
    return utf8Length(input)

  var index = 0   # read position in ``input``
  var dindex = 0  # write position in ``output``
  while index < len(input):
    let byte1 = octet(input[index])
    inc(index)

    if (byte1 and 0x80'u32) == 0x00'u32:
      # One-byte form (0xxxxxxx)
      if dindex >= len(output):
        return err(ErrorBufferOverflow)
      output[dindex] = B(byte1)
      inc(dindex)

    elif (byte1 and 0xE0'u32) == 0xC0'u32:
      # Two-byte form (110xxxxx 10xxxxxx)
      if index >= len(input):
        return err(ErrorInvalidSequence)
      # overlong sequence test (lead octets 0xC0 and 0xC1)
      if (byte1 and 0xFE'u32) == 0xC0'u32:
        return err(ErrorInvalidSequence)

      let byte2 = octet(input[index])
      if (byte2 and 0xC0'u32) != 0x80'u32:
        return err(ErrorInvalidSequence)

      if dindex >= len(output):
        return err(ErrorBufferOverflow)
      output[dindex] = B(((byte1 and 0x1F'u32) shl 6) or
                         (byte2 and 0x3F'u32))
      inc(dindex)
      inc(index)

    elif (byte1 and 0xF0'u32) == 0xE0'u32:
      # Three-byte form (1110xxxx 10xxxxxx 10xxxxxx)
      if (index + 1) >= len(input):
        return err(ErrorInvalidSequence)

      let byte2 = octet(input[index])
      if (byte2 and 0xC0'u32) != 0x80'u32:
        return err(ErrorInvalidSequence)
      # overlong sequence test
      if (byte1 == 0xE0'u32) and ((byte2 and 0xE0'u32) == 0x80'u32):
        return err(ErrorInvalidSequence)
      # 0xD800..0xDFFF (UTF-16 surrogates) test
      if (byte1 == 0xED'u32) and ((byte2 and 0xE0'u32) == 0xA0'u32):
        return err(ErrorInvalidSequence)

      let byte3 = octet(input[index + 1])
      if (byte3 and 0xC0'u32) != 0x80'u32:
        return err(ErrorInvalidSequence)
      # U+FFFE or U+FFFF test
      if (byte1 == 0xEF'u32) and (byte2 == 0xBF'u32) and
         ((byte3 and 0xFE'u32) == 0xBE'u32):
        return err(ErrorInvalidSequence)

      if dindex >= len(output):
        return err(ErrorBufferOverflow)
      output[dindex] = B(((byte1 and 0x0F'u32) shl 12) or
                         ((byte2 and 0x3F'u32) shl 6) or
                         (byte3 and 0x3F'u32))
      inc(dindex)
      inc(index, 2)

    elif (byte1 and 0xF8'u32) == 0xF0'u32:
      # Four-byte form (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
      if (index + 2) >= len(input):
        return err(ErrorInvalidSequence)

      let byte2 = octet(input[index])
      if (byte2 and 0xC0'u32) != 0x80'u32:
        return err(ErrorInvalidSequence)
      # overlong sequence test
      if (byte1 == 0xF0'u32) and ((byte2 and 0xF0'u32) == 0x80'u32):
        return err(ErrorInvalidSequence)
      # According to RFC 3629 no point above U+10FFFF should be used, which
      # limits characters to four bytes.
      if ((byte1 == 0xF4'u32) and (byte2 > 0x8F'u32)) or (byte1 > 0xF4'u32):
        return err(ErrorInvalidSequence)

      let byte3 = octet(input[index + 1])
      if (byte3 and 0xC0'u32) != 0x80'u32:
        return err(ErrorInvalidSequence)

      let byte4 = octet(input[index + 2])
      if (byte4 and 0xC0'u32) != 0x80'u32:
        return err(ErrorInvalidSequence)

      if dindex >= len(output):
        return err(ErrorBufferOverflow)
      output[dindex] = B(((byte1 and 0x07'u32) shl 18) or
                         ((byte2 and 0x3F'u32) shl 12) or
                         ((byte3 and 0x3F'u32) shl 6) or
                         (byte4 and 0x3F'u32))
      inc(dindex)
      inc(index, 3)

    else:
      # Stray continuation octet (10xxxxxx) or lead octet 0xF8 and above.
      return err(ErrorInvalidSequence)

  ok(dindex)
395+
proc utf8toUtf32*[A: Bytes, B: Wides32](et: typedesc[B],
                                        input: openarray[A]): UResult[seq[B]] =
  ## Decodes UTF-8 encoded ``input`` and returns the result as a newly
  ## allocated sequence of 32bit limbs of type ``et``.
  ##
  ## Any error reported by the buffer-based ``utf8toUtf32`` overload is
  ## returned unchanged.
  # First pass: a zero-length buffer makes the converter report the size only.
  var probe: array[0, B]
  let needed = ? utf8toUtf32(input, probe)
  # Second pass: decode into a sequence of exactly the required length.
  var decoded = newSeq[B](needed)
  let stored {.used.} = ? utf8toUtf32(input, decoded)
  ok(decoded)
276405
277406when defined (posix):
0 commit comments