Skip to content

Commit e068a96

Browse files
committed
Fixed bug in textract parsing
1 parent 1794667 commit e068a96

File tree

1 file changed

+4
-2
lines changed

1 file changed

+4
-2
lines changed

js/import/importOCR.js

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,10 +42,12 @@ export async function importOCRFiles(ocrFilesAll) {
4242
const hocrStrAll = await readOcrFile(ocrFilesAll[0]);
4343

4444
// Check whether input is Abbyy XML
45+
// TODO: The auto-detection of formats needs to be more robust.
46+
// At present, any input whose text between the first ">" and the next ">" contains "abbyy" (case-insensitive) is considered Abbyy XML.
4547
const node2 = hocrStrAll.match(/>([^>]+)/)?.[1];
4648
abbyyMode = !!node2 && !!/abbyy/i.test(node2);
4749
stextMode = !!node2 && !!/<document name/.test(node2);
48-
textractMode = !node2 && !!/"AnalyzeDocumentModelVersion"/i.test(hocrStrAll);
50+
textractMode = !abbyyMode && !stextMode && !!/"AnalyzeDocumentModelVersion"/i.test(hocrStrAll);
4951

5052
if (textractMode) {
5153
hocrRaw = [hocrStrAll];
@@ -68,7 +70,7 @@ export async function importOCRFiles(ocrFilesAll) {
6870
const hocrStrFirst = await readOcrFile(ocrFilesAll[0]);
6971
const node2 = hocrStrFirst.match(/>([^>]+)/)?.[1];
7072
abbyyMode = !!node2 && !!/abbyy/i.test(node2);
71-
textractMode = !node2 && !!/"AnalyzeDocumentModelVersion"/i.test(hocrStrFirst);
73+
textractMode = !abbyyMode && !!/"AnalyzeDocumentModelVersion"/i.test(hocrStrFirst);
7274

7375
for (let i = 0; i < pageCountHOCR; i++) {
7476
const hocrFile = ocrFilesAll[i];

0 commit comments

Comments
 (0)