@@ -344,8 +344,8 @@ byte TextCodec::detectType(const uint freqs0[], const uint freqs1[], int count)
344344 if (dt != Global::UNDEFINED)
345345 return TextCodec::MASK_NOT_TEXT | byte (dt);
346346
347- // Check UTF-8
348- // See Unicode 14 Standard - UTF-8 Table 3.7
347+ // Valid UTF-8 sequences
348+ // See Unicode 16 Standard - UTF-8 Table 3.7
349349 // U+0000..U+007F 00..7F
350350 // U+0080..U+07FF C2..DF 80..BF
351351 // U+0800..U+0FFF E0 A0..BF 80..BF
@@ -356,40 +356,72 @@ byte TextCodec::detectType(const uint freqs0[], const uint freqs1[], int count)
356356 // U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
357357 // U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
358358
359- if ((freqs0[0xC0 ] > 0 ) || (freqs0[0xC1 ] > 0 ))
360- return TextCodec::MASK_NOT_TEXT;
359+ // Check rules for 1 byte
360+ uint sum = freqs0[0xC0 ] + freqs0[0xC1 ];
361+ uint sum2 = 0 ;
362+ bool res = true ;
361363
362- for (int i = 0xF5 ; i <= 0xFF ; i++) {
363- if (freqs0[i] > 0 )
364- return TextCodec::MASK_NOT_TEXT;
365- }
364+ for (int i = 0xF5 ; i <= 0xFF ; i++)
365+ sum += freqs0[i];
366366
367- int sum = 0 ;
367+ if (sum != 0 ) {
368+ res = false ;
369+ goto end;
370+ }
368371
372+ // Check rules for first 2 bytes
369373 for (int i = 0 ; i < 256 ; i++) {
370374 // Exclude < 0xE0A0 || > 0xE0BF
371- if ((( i < 0xA0 ) || (i > 0xBF )) && (freqs1[( 0xE0 << 8 ) + i] > 0 ))
372- return TextCodec::MASK_NOT_TEXT ;
375+ if ((i < 0xA0 ) || (i > 0xBF ))
376+ sum += freqs1[ 0xE0 * 256 + i] ;
373377
374378 // Exclude < 0xED80 || > 0xED9F
375- if ((( i < 0x80 ) || (i > 0x9F )) && (freqs1[( 0xED << 8 ) + i] > 0 ))
376- return TextCodec::MASK_NOT_TEXT ;
379+ if ((i < 0x80 ) || (i > 0x9F ))
380+ sum += freqs1[ 0xED * 256 + i] ;
377381
378382 // Exclude < 0xF090 || > 0xF0BF
379- if (((i < 0x90 ) || (i > 0xBF )) && (freqs1[(0xF0 << 8 ) + i] > 0 ))
380- return TextCodec::MASK_NOT_TEXT;
383+ if ((i < 0x90 ) || (i > 0xBF ))
384+ sum += freqs1[0xF0 * 256 + i];
385+
386+ // Exclude < 0xF480 || > 0xF48F
387+ if ((i < 0x80 ) || (i > 0x8F ))
388+ sum += freqs1[0xF4 * 256 + i];
389+
390+ if ((i < 0x80 ) || (i > 0xBF )) {
391+ // Exclude < 0x??80 || > 0x??BF with ?? in [C2..DF]
392+ for (int j = 0xC2 ; j <= 0xDF ; j++)
393+ sum += freqs1[j * 256 + i];
394+
395+ // Exclude < 0x??80 || > 0x??BF with ?? in [E1..EC]
396+ for (int j = 0xE1 ; j <= 0xEC ; j++)
397+ sum += freqs1[j * 256 + i];
398+
399+ // Exclude < 0x??80 || > 0x??BF with ?? in [F1..F3]
400+ sum += freqs1[0xF1 * 256 + i];
401+ sum += freqs1[0xF2 * 256 + i];
402+ sum += freqs1[0xF3 * 256 + i];
381403
382- // Exclude < 0xF480 || > 0xF4BF
383- if (((i < 0x80 ) || (i > 0xBF )) && (freqs1[(0xF4 << 8 ) + i] > 0 ))
384- return TextCodec::MASK_NOT_TEXT;
404+ // Exclude < 0xEE80 || > 0xEEBF
405+ sum += freqs1[0xEE * 256 + i];
385406
386- // Count non-primary bytes
387- if ((i >= 0x80 ) && (i <= 0xBF ))
388- sum += freqs0[i];
407+ // Exclude < 0xEF80 || > 0xEFBF
408+ sum += freqs1[0xEF * 256 + i];
409+ }
410+ else {
411+ // Count non-primary bytes
412+ sum2 += freqs0[i];
413+ }
414+
415+ if (sum != 0 ) {
416+ res = false ;
417+ break ;
418+ }
389419 }
390420
391- // Another ad-hoc threshold
392- return (sum < count / 4 ) ? TextCodec::MASK_NOT_TEXT : TextCodec::MASK_NOT_TEXT | byte (Global::UTF8);
421+ end:
422+ // Ad-hoc threshold
423+ res &= (sum2 >= uint (count / 8 ));
424+ return res == true ? TextCodec::MASK_NOT_TEXT | byte (Global::UTF8) : TextCodec::MASK_NOT_TEXT;
393425}
394426
395427
0 commit comments