Skip to content

Commit e516f3f

Browse files
committed
Tweak text detection heuristics
1 parent ea6a195 commit e516f3f

File tree

1 file changed

+7
-35
lines changed

1 file changed

+7
-35
lines changed

src/transform/TextCodec.cpp

Lines changed: 7 additions & 35 deletions
Original file line number | Diff line number | Diff line change
@@ -224,40 +224,10 @@ byte TextCodec::computeStats(const byte block[], int count, uint freqs0[], bool
224224

225225
uint* freqs1 = new uint[65536];
226226
memset(&freqs1[0], 0, 65536 * sizeof(uint));
227-
uint f0[256] = { 0 };
228-
uint f1[256] = { 0 };
229-
uint f3[256] = { 0 };
230-
uint f2[256] = { 0 };
231-
uint8 prv = 0;
232-
const uint8* data = reinterpret_cast<const uint8*>(&block[0]);
233-
const int count4 = count & -4;
234-
235-
// Unroll loop
236-
for (int i = 0; i < count4; i += 4) {
237-
const uint8 cur0 = data[i];
238-
const uint8 cur1 = data[i + 1];
239-
const uint8 cur2 = data[i + 2];
240-
const uint8 cur3 = data[i + 3];
241-
f0[cur0]++;
242-
f1[cur1]++;
243-
f2[cur2]++;
244-
f3[cur3]++;
245-
freqs1[(prv * 256) + cur0]++;
246-
freqs1[(cur0 * 256) + cur1]++;
247-
freqs1[(cur1 * 256) + cur2]++;
248-
freqs1[(cur2 * 256) + cur3]++;
249-
prv = cur3;
250-
}
251-
252-
for (int i = count4; i < count; i++) {
253-
freqs0[data[i]]++;
254-
freqs1[(prv * 256) + data[i]]++;
255-
prv = data[i];
256-
}
227+
Global::computeHistogram(block, count, freqs1, false);
257228

258-
for (int i = 0; i < 256; i++) {
259-
freqs0[i] += (f0[i] + f1[i] + f2[i] + f3[i]);
260-
}
229+
for (int i = 0; i < 65536; i++)
230+
freqs0[i >> 8] += freqs1[i];
261231

262232
const int cr = int(CR);
263233
const int lf = int(LF);
@@ -276,10 +246,12 @@ byte TextCodec::computeStats(const byte block[], int count, uint freqs0[], bool
276246
bool notText = nbBinChars > (count >> 2);
277247

278248
if (notText == false) {
249+
notText = nbTextChars < (count >> 2);
250+
279251
if (strict == true) {
280-
notText = ((nbTextChars < (count >> 2)) || (freqs0[0] >= uint(count / 100)) || ((nbASCII / 95) < (count / 100)));
252+
notText |= ((freqs0[0] >= uint(count / 100)) || ((nbASCII / 95) < (count / 100)));
281253
} else {
282-
notText = ((nbTextChars < (count >> 1)) || (freqs0[32] < uint(count / 50)));
254+
notText |= (freqs0[32] < uint(count / 50));
283255
}
284256
}
285257

0 commit comments

Comments (0)