Skip to content

Commit a5531ae

Browse files
committed
Sync UTF8 validation code
1 parent 884099c commit a5531ae

File tree

2 files changed

+58
-23
lines changed

2 files changed

+58
-23
lines changed

src/transform/TextCodec.cpp

Lines changed: 55 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -344,8 +344,8 @@ byte TextCodec::detectType(const uint freqs0[], const uint freqs1[], int count)
344344
if (dt != Global::UNDEFINED)
345345
return TextCodec::MASK_NOT_TEXT | byte(dt);
346346

347-
// Check UTF-8
348-
// See Unicode 14 Standard - UTF-8 Table 3.7
347+
// Valid UTF-8 sequences
348+
// See Unicode 16 Standard - UTF-8 Table 3.7
349349
// U+0000..U+007F 00..7F
350350
// U+0080..U+07FF C2..DF 80..BF
351351
// U+0800..U+0FFF E0 A0..BF 80..BF
@@ -356,40 +356,72 @@ byte TextCodec::detectType(const uint freqs0[], const uint freqs1[], int count)
356356
// U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
357357
// U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
358358

359-
if ((freqs0[0xC0] > 0) || (freqs0[0xC1] > 0))
360-
return TextCodec::MASK_NOT_TEXT;
359+
// Check rules for 1 byte
360+
uint sum = freqs0[0xC0] + freqs0[0xC1];
361+
uint sum2 = 0;
362+
bool res = true;
361363

362-
for (int i = 0xF5; i <= 0xFF; i++) {
363-
if (freqs0[i] > 0)
364-
return TextCodec::MASK_NOT_TEXT;
365-
}
364+
for (int i = 0xF5; i <= 0xFF; i++)
365+
sum += freqs0[i];
366366

367-
int sum = 0;
367+
if (sum != 0) {
368+
res = false;
369+
goto end;
370+
}
368371

372+
// Check rules for first 2 bytes
369373
for (int i = 0; i < 256; i++) {
370374
// Exclude < 0xE0A0 || > 0xE0BF
371-
if (((i < 0xA0) || (i > 0xBF)) && (freqs1[(0xE0 << 8) + i] > 0))
372-
return TextCodec::MASK_NOT_TEXT;
375+
if ((i < 0xA0) || (i > 0xBF))
376+
sum += freqs1[0xE0 * 256 + i];
373377

374378
// Exclude < 0xED80 || > 0xED9F
375-
if (((i < 0x80) || (i > 0x9F)) && (freqs1[(0xED << 8) + i] > 0))
376-
return TextCodec::MASK_NOT_TEXT;
379+
if ((i < 0x80) || (i > 0x9F))
380+
sum += freqs1[0xED * 256 + i];
377381

378382
// Exclude < 0xF090 || > 0xF0BF
379-
if (((i < 0x90) || (i > 0xBF)) && (freqs1[(0xF0 << 8) + i] > 0))
380-
return TextCodec::MASK_NOT_TEXT;
383+
if ((i < 0x90) || (i > 0xBF))
384+
sum += freqs1[0xF0 * 256 + i];
385+
386+
// Exclude < 0xF480 || > 0xF48F
387+
if ((i < 0x80) || (i > 0x8F))
388+
sum += freqs1[0xF4 * 256 + i];
389+
390+
if ((i < 0x80) || (i > 0xBF)) {
391+
// Exclude < 0x??80 || > 0x??BF with ?? in [C2..DF]
392+
for (int j = 0xC2; j <= 0xDF; j++)
393+
sum += freqs1[j * 256 + i];
394+
395+
// Exclude < 0x??80 || > 0x??BF with ?? in [E1..EC]
396+
for (int j = 0xE1; j <= 0xEC; j++)
397+
sum += freqs1[j * 256 + i];
398+
399+
// Exclude < 0x??80 || > 0x??BF with ?? in [F1..F3]
400+
sum += freqs1[0xF1 * 256 + i];
401+
sum += freqs1[0xF2 * 256 + i];
402+
sum += freqs1[0xF3 * 256 + i];
381403

382-
// Exclude < 0xF480 || > 0xF4BF
383-
if (((i < 0x80) || (i > 0xBF)) && (freqs1[(0xF4 << 8) + i] > 0))
384-
return TextCodec::MASK_NOT_TEXT;
404+
// Exclude < 0xEE80 || > 0xEEBF
405+
sum += freqs1[0xEE * 256 + i];
385406

386-
// Count non-primary bytes
387-
if ((i >= 0x80) && (i <= 0xBF))
388-
sum += freqs0[i];
407+
// Exclude < 0xEF80 || > 0xEFBF
408+
sum += freqs1[0xEF * 256 + i];
409+
}
410+
else {
411+
// Count non-primary bytes
412+
sum2 += freqs0[i];
413+
}
414+
415+
if (sum != 0) {
416+
res = false;
417+
break;
418+
}
389419
}
390420

391-
// Another ad-hoc threshold
392-
return (sum < count / 4) ? TextCodec::MASK_NOT_TEXT : TextCodec::MASK_NOT_TEXT | byte(Global::UTF8);
421+
end:
422+
// Ad-hoc threshold
423+
res &= (sum2 >= uint(count / 8));
424+
return res == true ? TextCodec::MASK_NOT_TEXT | byte(Global::UTF8) : TextCodec::MASK_NOT_TEXT;
393425
}
394426

395427

src/transform/UTFCodec.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,9 @@ bool UTFCodec::forward(SliceArray<byte>& input, SliceArray<byte>& output, int co
8989
if ((mustValidate == true) && (validate(&src[start], count - start - 4)) == false)
9090
return false;
9191

92+
if (_pCtx != nullptr)
93+
_pCtx->putInt("dataType", Global::UTF8);
94+
9295
// 1-3 bit size + (7 or 11 or 16 or 21) bit payload
9396
// 3 MSBs indicate symbol size (limit map size to 22 bits)
9497
// 000 -> 7 bits

0 commit comments

Comments
 (0)