Skip to content

Commit 0a65142

Browse files
committed
Updated CLI
1 parent f586b20 commit 0a65142

File tree

3 files changed

+16
-9
lines changed

3 files changed

+16
-9
lines changed

cli/cli.js

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -52,14 +52,14 @@ export const evalInternalCLI = async (files, options) => {
5252

5353
/**
5454
*
55-
* @param {string} pdfFile - Path to PDF file.
55+
* @param {string} inputFile - Path to input file.
5656
* @param {?string} [outputDir='.'] - Output directory.
5757
* @param {Object} [options]
5858
* @param {"pdf" | "hocr" | "docx" | "xlsx" | "txt" | "text" | "html"} [options.format]
5959
* @param {boolean} [options.reflow]
6060
*/
61-
export const extractCLI = async (pdfFile, outputDir, options) => {
62-
await extract(pdfFile, outputDir, options);
61+
export const extractCLI = async (inputFile, outputDir, options) => {
62+
await extract(inputFile, outputDir, options);
6363
process.exitCode = 0;
6464
};
6565

cli/extract.js

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,27 +2,34 @@ import fs from 'node:fs';
22
import path from 'node:path';
33
import scribe from '../scribe.js';
44

5+
// TODO: Consider whether this should exist and whether it should be combined into a larger CLI utility.
6+
// This was originally created to provide a simple interface to extract existing text from a PDF file;
7+
// however, it now does other things, and this should likely be part of a larger `convert` utility.
8+
59
/**
610
*
7-
* @param {string} pdfFile - Path to PDF file.
11+
* @param {string} inputFile - Path to input file.
812
* @param {?string} [output='.'] - Output file or directory.
913
* @param {Object} [options]
1014
* @param {Parameters<typeof scribe.download>[0]} [options.format]
1115
* @param {boolean} [options.reflow]
1216
*/
13-
export const extract = async (pdfFile, output, options) => {
17+
export const extract = async (inputFile, output, options) => {
1418
const format = options?.format || 'txt';
1519

1620
output = output || '.';
1721
const outputDir = path.dirname(output);
18-
const outputFile = outputDir === output ? `${path.basename(pdfFile).replace(/\.\w{1,6}$/i, `.${format}`)}` : path.basename(output);
22+
const outputFile = outputDir === output ? `${path.basename(inputFile).replace(/\.\w{1,6}$/i, `.${format}`)}` : path.basename(output);
1923
const outputPath = `${outputDir}/${outputFile}`;
2024

2125
scribe.opt.reflow = true;
2226
scribe.opt.extractText = true;
27+
scribe.opt.displayMode = 'ebook';
2328

24-
await scribe.init();
25-
await scribe.importFiles([pdfFile]);
29+
// TODO: Fonts do not need to be loaded for .txt output, but are needed for .pdf output,
30+
// so a more robust implementation would consider the arguments and only load fonts if necessary.
31+
await scribe.init({ font: true });
32+
await scribe.importFiles([inputFile]);
2633

2734
if (outputDir) fs.mkdirSync(outputDir, { recursive: true });
2835

cli/scribe.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ program
3434

3535
program
3636
.command('extract')
37-
.argument('<pdf_file>', 'Input PDF file.')
37+
.argument('<input_file>', 'Input PDF file.')
3838
.argument('[output]', 'Output directory or file to save results.', '.')
3939
.addOption(new Option('-f, --format <ext>', 'Output format.').choices(['pdf', 'hocr', 'docx', 'xlsx', 'txt', 'text', 'html']).default('txt'))
4040
.option('-r, --reflow', 'Reflow text by combining lines into paragraphs.')

0 commit comments

Comments
 (0)