@@ -90,26 +90,30 @@ size_t utf8_conv_utf32(uint32_t *out, size_t out_chars,
9090 while (in_size && out_chars )
9191 {
9292 uint32_t c ;
93- uint8_t first = ( uint8_t ) * in ++ ;
93+ uint8_t first ;
9494 unsigned ones ;
9595
96- /* Fast path: ASCII (most common case) */
97- if ( first < 0x80 )
96+ /* Fast path: batch ASCII characters */
97+ while ( in_size && out_chars && ( uint8_t ) * in < 0x80 )
9898 {
99- * out ++ = first ;
99+ * out ++ = ( uint8_t ) * in ++ ;
100100 in_size -- ;
101101 out_chars -- ;
102102 ret ++ ;
103- continue ;
104103 }
105104
106- ones = utf8_lut [first ];
105+ if (!in_size || !out_chars )
106+ break ;
107+
108+ first = (uint8_t )* in ++ ;
109+ ones = utf8_lut [first ];
107110
108- if (ones > 6 || ones == 1 ) /* Invalid or desync. */
111+ if (ones > 6 || ones < 2 ) /* Invalid or desync. */
109112 break ;
110113
111- /* ones >= 2 here; extra = ones - 1 */
112- if (ones > in_size ) /* Overflow. */
114+ /* ones includes the lead byte; we already consumed it,
115+ * but need (ones - 1) more continuation bytes */
116+ if (ones > in_size ) /* Not enough data. */
113117 break ;
114118
115119 /* Decode based on sequence length to avoid inner loop */
@@ -210,14 +214,19 @@ bool utf16_conv_utf8(uint8_t *out, size_t *out_chars,
210214 * out_chars = out_pos ;
211215 return true;
212216 }
213- value = in [in_pos ++ ];
214217
215- if (value < 0x80 )
218+ /* Batch ASCII run: avoid per-char branch into multi-byte path */
219+ while (in_pos < in_size && in [in_pos ] < 0x80 )
220+ out [out_pos ++ ] = (uint8_t )in [in_pos ++ ];
221+
222+ if (in_pos == in_size )
216223 {
217- out [ out_pos ++ ] = ( uint8_t ) value ;
218- continue ;
224+ * out_chars = out_pos ;
225+ return true ;
219226 }
220227
228+ value = in [in_pos ++ ];
229+
221230 if (value >= 0xD800 && value < 0xE000 )
222231 {
223232 uint32_t c2 ;
@@ -323,13 +332,21 @@ const char *utf8skip(const char *str, size_t chars)
323332
324333 do
325334 {
326- unsigned ones = utf8_lut [* strb ];
335+ unsigned ones ;
336+ if (!* strb )
337+ break ;
338+ ones = utf8_lut [* strb ];
327339 if (ones < 2 )
328340 strb ++ ;
329341 else
330- strb += ones ;
331- chars -- ;
332- } while (chars );
342+ {
343+ /* Verify we don't walk past a NUL inside a multi-byte seq */
344+ unsigned i ;
345+ for (i = 0 ; i < ones && strb [i ]; i ++ )
346+ ;
347+ strb += i ;
348+ }
349+ } while (-- chars );
333350
334351 return (const char * )strb ;
335352}
@@ -375,37 +392,51 @@ size_t utf8len(const char *string)
375392 **/
376393uint32_t utf8_walk (const char * * string )
377394{
378- uint8_t first = UTF8_WALKBYTE (string );
379- uint32_t ret = 0 ;
395+ const uint8_t * s = (const uint8_t * )* string ;
396+ uint8_t first = * s ++ ;
397+ uint32_t ret ;
380398
381- if (first < 128 )
399+ if (first < 0x80 )
400+ {
401+ * string = (const char * )s ;
382402 return first ;
403+ }
383404
384- ret = (ret << 6 ) | (UTF8_WALKBYTE (string ) & 0x3F );
385- if (first >= 0xE0 )
405+ /* Use LUT + switch to decode, matching utf8_conv_utf32 style */
406+ ret = first & ((1 << (7 - utf8_lut [first ])) - 1 );
407+ switch (utf8_lut [first ])
386408 {
387- ret = (ret << 6 ) | (UTF8_WALKBYTE (string ) & 0x3F );
388- if (first >= 0xF0 )
389- {
390- ret = (ret << 6 ) | (UTF8_WALKBYTE (string ) & 0x3F );
391- return ret | (first & 7 ) << 18 ;
392- }
393- return ret | (first & 15 ) << 12 ;
409+ case 4 :
410+ ret = (ret << 6 ) | (* s ++ & 0x3F );
411+ /* fall through */
412+ case 3 :
413+ ret = (ret << 6 ) | (* s ++ & 0x3F );
414+ /* fall through */
415+ case 2 :
416+ ret = (ret << 6 ) | (* s ++ & 0x3F );
417+ break ;
418+ default :
419+ break ;
394420 }
395421
396- return ret | (first & 31 ) << 6 ;
422+ * string = (const char * )s ;
423+ return ret ;
397424}
398425
/* Converts the 0-terminated UTF-16 sequence 'in' to UTF-8 in a freshly
 * malloc'd buffer stored in *utf_data.
 *
 * utf_data: receives the malloc result (NULL on allocation failure).
 * dest_len: written by utf16_conv_utf8; incremented by one between the
 *           two calls so the allocation has room for one extra byte.
 * in:       UTF-16 code units, terminated by a 0 unit.
 *
 * Returns false if the allocation fails, otherwise whatever the second
 * utf16_conv_utf8 call returns. */
static bool utf16_to_char(uint8_t **utf_data,
      size_t *dest_len, const uint16_t *in)
{
   size_t units = 0;

   /* Count code units up to, but not including, the 0 terminator. */
   for (; in[units] != 0; units++)
      ;

   /* First call with a NULL output - presumably a sizing pass that
    * leaves the required byte count in *dest_len. Its return value is
    * deliberately not inspected, as in the original. */
   utf16_conv_utf8(NULL, dest_len, in, units);
   *dest_len += 1;

   *utf_data = (uint8_t *)malloc(*dest_len);
   if (*utf_data == 0)
      return false;

   /* Second call performs the conversion into the new buffer. */
   return utf16_conv_utf8(*utf_data, dest_len, in, units);
}
411442
0 commit comments