Skip to content

Commit 5fc78c9

Browse files
committed
Updated CLI to support multiple OCR files for single document
1 parent c4bee94 commit 5fc78c9

File tree

5 files changed

+94
-68
lines changed

5 files changed

+94
-68
lines changed

cli/cli.js

Lines changed: 43 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -7,18 +7,36 @@ import {
77
evalInternal, overlay, recognize,
88
} from './main.js';
99

10-
export const confCLI = async (ocrFile) => {
11-
await conf(ocrFile);
10+
/**
11+
* Print confidence of Abbyy .xml file.
12+
*
13+
* @param {string[]} files - Paths to input files.
14+
*/
15+
export const confCLI = async (files) => {
16+
await conf(files);
1217
process.exitCode = 0;
1318
};
1419

15-
export const checkCLI = async (pdfFile, ocrFile, options) => {
16-
await check(pdfFile, ocrFile, options);
20+
/**
21+
*
22+
* @param {string[]} files - Paths to input files.
23+
* @param {Object} options
24+
* @param {number} [options.workers]
25+
*/
26+
export const checkCLI = async (files, options) => {
27+
await check(files, options);
1728
process.exitCode = 0;
1829
};
1930

20-
export const evalInternalCLI = async (pdfFile, ocrFile, options) => {
21-
const { evalMetrics } = await evalInternal(pdfFile, ocrFile, options);
31+
/**
32+
* Evaluate internal OCR engine.
33+
*
34+
* @param {string[]} files - Paths to input files.
35+
* @param {Object} options
36+
* @param {number} [options.workers]
37+
*/
38+
export const evalInternalCLI = async (files, options) => {
39+
const { evalMetrics } = await evalInternal(files, options);
2240

2341
const ignoreExtra = true;
2442
let metricWER;
@@ -57,28 +75,38 @@ export const detectPDFTypeCLI = async (pdfFile, outputPath) => {
5775

5876
/**
5977
*
60-
* @param {string} pdfFile - Path to PDF file.
61-
* @param {*} ocrFile
62-
* @param {*} outputDir
78+
* @param {string[]} files - Paths to input files.
6379
* @param {Object} options
80+
* @param {string} [options.output] - Output directory for the resulting PDF.
6481
* @param {boolean} [options.robust]
6582
* @param {boolean} [options.conf]
6683
* @param {boolean} [options.vis]
6784
* @param {number} [options.workers]
6885
*/
69-
export const overlayCLI = async (pdfFile, ocrFile, outputDir, options) => {
86+
export const overlayCLI = async (files, options) => {
7087
options.overlayMode = options.vis ? 'proof' : 'invis';
71-
await overlay(pdfFile, ocrFile, outputDir, options);
88+
await overlay(files, options.output, options);
7289
process.exitCode = 0;
7390
};
7491

75-
export const recognizeCLI = async (pdfFile, options) => {
92+
/**
93+
*
94+
* @param {string[]} files - Paths to input files.
95+
* @param {*} options
96+
*/
97+
export const recognizeCLI = async (files, options) => {
7698
options.overlayMode = options.vis ? 'proof' : 'invis';
77-
await recognize(pdfFile, options);
99+
await recognize(files, options);
78100
process.exitCode = 0;
79101
};
80102

81-
export const debugCLI = async (pdfFile, outputDir, options) => {
82-
await debug(pdfFile, outputDir, options);
103+
/**
104+
*
105+
* @param {string[]} files - Paths to input files.
106+
* @param {*} outputDir
107+
* @param {*} options
108+
*/
109+
export const debugCLI = async (files, outputDir, options) => {
110+
await debug(files, outputDir, options);
83111
process.exitCode = 0;
84112
};

cli/extract.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ export const extract = async (pdfFile, output, options) => {
1515

1616
output = output || '.';
1717
const outputDir = path.dirname(output);
18-
const outputFile = outputDir === output ? `${path.basename(pdfFile).replace(/\.\w{1,5}$/i, `.${format}`)}` : path.basename(output);
18+
const outputFile = outputDir === output ? `${path.basename(pdfFile).replace(/\.\w{1,6}$/i, `.${format}`)}` : path.basename(output);
1919
const outputPath = `${outputDir}/${outputFile}`;
2020

2121
scribe.opt.reflow = true;

cli/main.js

Lines changed: 36 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,7 @@ scribe.opt.saveDebugImages = debugMode;
1313
/**
1414
* @param {string} func
1515
* @param {Object} params
16-
* @param {string} [params.pdfFile]
17-
* @param {string} [params.ocrFile]
16+
* @param {string[]} [params.files]
1817
* @param {string} [params.outputDir]
1918
* @param {Array<string>} [params.list]
2019
* @param {boolean} [params.robustConfMode]
@@ -26,6 +25,10 @@ scribe.opt.saveDebugImages = debugMode;
2625
async function main(func, params) {
2726
scribe.opt.workerN = params.workerN || null;
2827

28+
if (!params.files || params.files.length === 0) {
29+
throw new Error('No input files provided.');
30+
}
31+
2932
await scribe.init({
3033
pdf: true,
3134
ocr: true,
@@ -39,15 +42,9 @@ async function main(func, params) {
3942

4043
const output = {};
4144

42-
const files = [];
43-
if (params.pdfFile) files.push(params.pdfFile);
44-
if (params.ocrFile) files.push(params.ocrFile);
45-
await scribe.importFiles(files);
45+
await scribe.importFiles(params.files);
4646

47-
const backgroundArg = params.pdfFile;
48-
const backgroundStem = backgroundArg ? path.basename(backgroundArg).replace(/\.\w{1,5}$/i, '') : undefined;
49-
const ocrStem = params.ocrFile ? path.basename(params.ocrFile).replace(/\.\w{1,5}$/i, '') : undefined;
50-
const outputStem = backgroundStem || ocrStem || 'output';
47+
const outputStem = scribe.inputData.defaultDownloadFileName.replace(/\.\w{1,6}$/i, '') || 'output';
5148

5249
const outputDir = params.outputDir || '.';
5350

@@ -75,23 +72,30 @@ async function main(func, params) {
7572
}
7673
}
7774

78-
if (['overlay', 'recognize'].includes(func) && backgroundArg) {
75+
if (['overlay', 'recognize'].includes(func) && (scribe.inputData.pdfMode || scribe.inputData.imageMode)) {
7976
let outputSuffix = '';
8077
if (scribe.opt.displayMode === 'proof') {
8178
outputSuffix = '_vis';
8279
} else if (scribe.opt.displayMode === 'invis') {
83-
const resolvedInputFile = path.dirname(path.resolve(backgroundArg));
84-
const resolvedOutputDir = path.resolve(outputDir);
85-
if (resolvedInputFile === resolvedOutputDir) {
86-
outputSuffix = '_ocr';
80+
81+
// Check if output file would overwrite any input file, and if so, add a suffix to avoid overwriting.
82+
// This software is still in development--nobody should be overwriting input files.
83+
const resolvedOutputFileTmp = path.resolve(`${outputDir}/${outputStem}.pdf`);
84+
for (let i = 0; i < params.files.length; i++) {
85+
const resolvedInputFile = path.resolve(params.files[i]);
86+
if (resolvedInputFile === resolvedOutputFileTmp) {
87+
outputSuffix = '_ocr';
88+
console.log(`Saving output with ${outputSuffix} suffix to avoid overwriting input: ${resolvedInputFile}`);
89+
break;
90+
}
8791
}
8892
}
8993

90-
const outputPath = `${outputDir}/${path.basename(backgroundArg).replace(/\.\w{1,5}$/i, `${outputSuffix}.pdf`)}`;
94+
const outputPath = path.resolve(`${outputDir}/${outputStem}${outputSuffix}.pdf`);
9195
await scribe.download('pdf', outputPath);
9296

9397
if (params.hocr) {
94-
const outputPathHocr = `${outputDir}/${path.basename(backgroundArg).replace(/\.\w{1,5}$/i, '.hocr')}`;
98+
const outputPathHocr = path.resolve(`${outputDir}/${outputStem}.hocr`);
9599
await scribe.download('hocr', outputPathHocr);
96100
}
97101
}
@@ -123,63 +127,59 @@ async function main(func, params) {
123127
/**
124128
* Print confidence of Abbyy .xml file.
125129
*
126-
* @param {string} ocrFile
130+
* @param {string[]} files - Paths to input files.
127131
*/
128-
export const conf = async (ocrFile) => (main('conf', { ocrFile }));
132+
export const conf = async (files) => (main('conf', { files }));
129133

130134
/**
131135
*
132-
* @param {string} pdfFile - Path to PDF file.
133-
* @param {string} ocrFile
136+
* @param {string[]} files - Paths to input files.
134137
* @param {Object} options
135138
* @param {number} [options.workers]
136139
*/
137-
export const check = async (pdfFile, ocrFile, options) => (main('check', { pdfFile, ocrFile, workerN: options?.workers }));
140+
export const check = async (files, options) => (main('check', { files, workerN: options?.workers }));
138141

139142
/**
140143
* Evaluate internal OCR engine.
141144
*
142-
* @param {string} pdfFile - Path to PDF file.
143-
* @param {string} ocrFile - Path to OCR file containing ground truth.
145+
* @param {string[]} files - Paths to input files.
144146
* @param {Object} options
145147
* @param {number} [options.workers]
146148
*/
147-
export const evalInternal = async (pdfFile, ocrFile, options) => (main('eval', { pdfFile, ocrFile, workerN: options?.workers }));
149+
export const evalInternal = async (files, options) => (main('eval', { files, workerN: options?.workers }));
148150

149151
/**
150152
*
151-
* @param {string} pdfFile - Path to PDF file.
152-
* @param {*} ocrFile
153+
* @param {string[]} files - Paths to input files.
153154
* @param {*} outputDir
154155
* @param {Object} options
155156
* @param {boolean} [options.robust]
156157
* @param {boolean} [options.conf]
157158
* @param {"eval" | "ebook" | "proof" | "invis"} [options.overlayMode]
158159
* @param {number} [options.workers]
159160
*/
160-
export const overlay = async (pdfFile, ocrFile, outputDir, options) => (main('overlay', {
161-
pdfFile, ocrFile, outputDir, robustConfMode: options?.robust || false, printConf: options?.conf || false, overlayMode: options?.overlayMode || 'invis', workerN: options?.workers,
161+
export const overlay = async (files, outputDir, options) => (main('overlay', {
162+
files, outputDir, robustConfMode: options?.robust || false, printConf: options?.conf || false, overlayMode: options?.overlayMode || 'invis', workerN: options?.workers,
162163
}));
163164

164165
/**
165166
*
166-
* @param {string} pdfFile - Path to PDF file.
167+
* @param {string[]} files - Paths to input files.
167168
* @param {Object} options
168169
* @param {"eval" | "ebook" | "proof" | "invis"} [options.overlayMode]
169170
* @param {boolean} [options.hocr]
170171
* @param {number} [options.workers]
171172
*/
172-
export const recognize = async (pdfFile, options) => (main('recognize', {
173-
pdfFile, overlayMode: options?.overlayMode || 'invis', workerN: options?.workers, hocr: options?.hocr,
173+
export const recognize = async (files, options) => (main('recognize', {
174+
files, overlayMode: options?.overlayMode || 'invis', workerN: options?.workers, hocr: options?.hocr,
174175
}));
175176

176177
/**
177178
*
178-
* @param {string} pdfFile - Path to PDF file.
179+
* @param {string[]} files - Paths to input files.
179180
* @param {*} outputDir
180181
* @param {*} options
181-
* @returns
182182
*/
183-
export const debug = async (pdfFile, outputDir, options) => (main('debug', {
184-
pdfFile, outputDir, list: options?.list,
183+
export const debug = async (files, outputDir, options) => (main('debug', {
184+
files, outputDir, list: options?.list,
185185
}));

cli/scribe.js

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -20,17 +20,15 @@ program
2020

2121
program
2222
.command('check')
23-
.argument('<pdf_file>', 'Input PDF file.')
24-
.argument('<ocr_file>', 'Input OCR file. Accepts .hocr and Abbyy .xml (with character-level data enabled).')
2523
.option('-w, --workers <number>', 'Number of workers to use. Default is up to 8.')
24+
.argument('<files...>', 'Input PDF file and OCR file(s). Accepts .hocr and Abbyy .xml (with character-level data enabled).')
2625
.description('Calculate confidence metric for OCR data by running Tesseract OCR and comparing results.')
2726
.action(checkCLI);
2827

2928
program
3029
.command('eval')
31-
.argument('<pdf_file>', 'Input PDF file.')
32-
.argument('<ocr_file>', 'Input OCR file. Accepts .hocr and Abbyy .xml (with character-level data enabled).')
3330
.option('-w, --workers <number>', 'Number of workers to use. Default is up to 8.')
31+
.argument('<files...>', 'Input PDF file and OCR file(s). Accepts .hocr and Abbyy .xml (with character-level data enabled).')
3432
.description('Evaluate internal OCR engine by recognizing document (provided PDF file), and comparing to ground truth (provided OCR file).')
3533
.action(evalInternalCLI);
3634

@@ -45,23 +43,22 @@ program
4543

4644
program
4745
.command('overlay')
48-
.argument('<pdf_file>', 'Input PDF file.')
49-
.argument('<ocr_file>', 'Input OCR file. Accepts .hocr and Abbyy .xml (with character-level data enabled).')
50-
.argument('[output_dir]', 'Directory for output file(s).', '.')
46+
.option('-o, --output <directory>', 'Directory for output file(s). Default is current directory.')
5147
.option('-v, --vis', 'Print OCR text visibly over provided PDF file with colors coded by confidence.')
5248
.option('-c, --conf', 'Print average confidence metric for document.')
5349
.option('-r, --robust', 'Generate confidence metrics by running Tesseract OCR and comparing, rather than using confidence info in provided data.')
5450
.option('-w, --workers <number>', 'Number of workers to use. Default is up to 8.')
51+
.argument('<files...>', 'Input PDF file and OCR file(s). Accepts .hocr and Abbyy .xml (with character-level data enabled).')
5552
.description('Add OCR data to provided PDF file and save result as PDF.')
5653
.action(overlayCLI);
5754

5855
program
5956
.command('recognize')
60-
.argument('<pdf_file>', 'Input PDF file.')
6157
.description('Recognize text in PDF file using internal OCR engine.')
6258
.option('-v, --vis', 'Print OCR text visibly over provided PDF file with colors coded by confidence.')
6359
.option('-h, --hocr', 'Output .hocr intermediate data in addition to .pdf.')
6460
.option('-w, --workers <number>', 'Number of workers to use. Default is up to 8.')
61+
.argument('<files...>', 'Input PDF file and OCR file(s). Accepts .hocr and Abbyy .xml (with character-level data enabled).')
6562
.action(recognizeCLI);
6663

6764
program
@@ -73,9 +70,10 @@ program
7370

7471
program
7572
.command('debug')
76-
.argument('<pdf_file>', 'Input PDF file.')
73+
.option('-o, --output <directory>', 'Directory for output file(s). Default is current directory.')
7774
.argument('[output_dir]', 'Directory for output file(s).', '.')
7875
.option('--list <items>', 'Comma separated list of visualizations to include.', (value) => value.split(','))
76+
.argument('<files...>', 'Input PDF file and OCR file(s). Accepts .hocr and Abbyy .xml (with character-level data enabled).')
7977
.description('Generate and write Tesseract debugging images.')
8078
.action(debugCLI);
8179

tests/cli/cli.spec.js

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ describe('Check Node.js commands.', () => {
5252

5353
it('Should print confidence of Abbyy .xml file.', async () => {
5454
// Call the function
55-
await confCLI(path.join(__dirname, '../assets/scribe_test_pdf1_abbyy.xml'));
55+
await confCLI([path.join(__dirname, '../assets/scribe_test_pdf1_abbyy.xml')]);
5656

5757
// originalConsoleLog(consoleOutput);
5858

@@ -63,7 +63,7 @@ describe('Check Node.js commands.', () => {
6363
it('Should check contents of Abbyy .xml file.', async () => {
6464
// CLI equivalent: node cli/scribe.js check tests/assets/scribe_test_pdf1.pdf tests/assets/scribe_test_pdf1_abbyy.xml
6565
// Workers is set to 1 to avoid results changing based on the number of CPU cores due to the OCR engine learning.
66-
await checkCLI(path.join(__dirname, '../assets/scribe_test_pdf1.pdf'), path.join(__dirname, '../assets/scribe_test_pdf1_abbyy.xml'), { workers: 1 });
66+
await checkCLI([path.join(__dirname, '../assets/scribe_test_pdf1.pdf'), path.join(__dirname, '../assets/scribe_test_pdf1_abbyy.xml')], { workers: 1 });
6767

6868
// originalConsoleLog(consoleOutput);
6969

@@ -75,7 +75,7 @@ describe('Check Node.js commands.', () => {
7575
const tmpDir = await tmpUnique.get();
7676

7777
// Call the function
78-
await overlayCLI(path.join(__dirname, '../assets/scribe_test_pdf1.pdf'), path.join(__dirname, '../assets/scribe_test_pdf1_abbyy.xml'), tmpDir, { vis: true });
78+
await overlayCLI([path.join(__dirname, '../assets/scribe_test_pdf1.pdf'), path.join(__dirname, '../assets/scribe_test_pdf1_abbyy.xml')], { output: tmpDir, vis: true });
7979

8080
const outputPath = `${tmpDir}/scribe_test_pdf1_vis.pdf`;
8181

@@ -86,7 +86,7 @@ describe('Check Node.js commands.', () => {
8686
const tmpDir = await tmpUnique.get();
8787

8888
// Call the function
89-
await overlayCLI(path.join(__dirname, '../assets/scribe_test_pdf1.pdf'), path.join(__dirname, '../assets/scribe_test_pdf1_abbyy.xml'), tmpDir, { conf: true, vis: true });
89+
await overlayCLI([path.join(__dirname, '../assets/scribe_test_pdf1.pdf'), path.join(__dirname, '../assets/scribe_test_pdf1_abbyy.xml')], { output: tmpDir, conf: true, vis: true });
9090

9191
expect(consoleOutput).to.include('385 of 404');
9292

@@ -100,7 +100,7 @@ describe('Check Node.js commands.', () => {
100100
const tmpDir = await tmpUnique.get();
101101

102102
// Call the function
103-
await overlayCLI(path.join(__dirname, '../assets/scribe_test_pdf1.pdf'), path.join(__dirname, '../assets/scribe_test_pdf1_abbyy.xml'), tmpDir, { robust: true, vis: true });
103+
await overlayCLI([path.join(__dirname, '../assets/scribe_test_pdf1.pdf'), path.join(__dirname, '../assets/scribe_test_pdf1_abbyy.xml')], { output: tmpDir, robust: true, vis: true });
104104

105105
const outputPath = `${tmpDir}/scribe_test_pdf1_vis.pdf`;
106106

@@ -111,8 +111,8 @@ describe('Check Node.js commands.', () => {
111111
const tmpDir = await tmpUnique.get();
112112

113113
// Call the function
114-
await overlayCLI(path.join(__dirname, '../assets/scribe_test_pdf1.pdf'), path.join(__dirname, '../assets/scribe_test_pdf1_abbyy.xml'), tmpDir, {
115-
robust: true, conf: true, vis: true, workers: 1,
114+
await overlayCLI([path.join(__dirname, '../assets/scribe_test_pdf1.pdf'), path.join(__dirname, '../assets/scribe_test_pdf1_abbyy.xml')], {
115+
output: tmpDir, robust: true, conf: true, vis: true, workers: 1,
116116
});
117117

118118
if (!/387 of 404/.test(consoleOutput)) originalConsoleLog(consoleOutput);

0 commit comments

Comments
 (0)