Skip to content

Commit 7ff55c0

Browse files
authored
Improved import and export support; misc changes (#49)
1 parent 0fc036e commit 7ff55c0

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

48 files changed

+12967
-247
lines changed

cli/main.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ async function main(func, params) {
103103
const debugDir = `${outputDir}/${outputStem}_debug`;
104104
fs.mkdirSync(debugDir, { recursive: true });
105105
const outputPathCsv = `${debugDir}/_debug.csv`;
106-
scribe.utils.writeDebugCsv(scribe.data.ocr.active, outputPathCsv);
106+
scribe.utils.writeDebugCsv({ pages: scribe.data.ocr.active, fileName: outputPathCsv });
107107

108108
scribe.utils.dumpDebugImages(debugDir);
109109
scribe.utils.dumpHOCR(debugDir);

js/clear.js

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ import {
55
layoutRegions,
66
ocrAll,
77
ocrAllRaw,
8-
pageMetricsArr,
8+
pageMetricsAll,
99
} from './containers/dataContainer.js';
1010
import { FontCont } from './containers/fontContainer.js';
1111
import { ImageCache } from './containers/imageContainer.js';
@@ -19,7 +19,7 @@ export function clearData() {
1919
ocrAllRaw.active = [];
2020
layoutRegions.pages.length = 0;
2121
layoutDataTables.pages.length = 0;
22-
pageMetricsArr.length = 0;
22+
pageMetricsAll.length = 0;
2323
convertPageWarn.length = 0;
2424
ImageCache.clear();
2525
FontCont.clear();

js/containers/dataContainer.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ export const ocrAll = { active: [] };
6363
export const ocrAllRaw = { active: [] };
6464

6565
/** @type {Array<PageMetrics>} */
66-
export const pageMetricsArr = [];
66+
export const pageMetricsAll = [];
6767

6868
/**
6969
* Class that stores various debug data.

js/containers/imageContainer.js

Lines changed: 9 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,15 @@ import {
55
import { initMuPDFWorker } from '../../mupdf/mupdf-async.js';
66

77
import { updateFontContWorkerMain } from '../fontContainerMain.js';
8-
import { pageMetricsArr } from './dataContainer.js';
8+
import { pageMetricsAll } from './dataContainer.js';
99
import {
1010
FontCont,
1111
FontContainerFont,
1212
loadOpentype,
1313
} from './fontContainer.js';
1414

1515
import { gs } from '../generalWorkerMain.js';
16-
import { imageUtils } from '../objects/imageObjects.js';
16+
import { imageUtils, ImageWrapper } from '../objects/imageObjects.js';
1717
import { range } from '../utils/miscUtils.js';
1818
import { opt } from './app.js';
1919

@@ -42,32 +42,6 @@ export class MuPDFScheduler {
4242
}
4343
}
4444

45-
export class ImageWrapper {
46-
/**
47-
* @param {number} n - Page number
48-
* @param {string} imageStr - Base-64 encoded image string. Should start with "data:image/png" or "data:image/jpeg".
49-
* @param {string} colorMode - Color mode ("color", "gray", or "binary").
50-
* @param {boolean} rotated - Whether image has been rotated.
51-
* @param {boolean} upscaled - Whether image has been upscaled.
52-
*
53-
* All properties of this object must be serializable, as ImageWrapper objects are sent between threads.
54-
* This means that no promises can be used.
55-
*/
56-
constructor(n, imageStr, colorMode, rotated = false, upscaled = false) {
57-
this.n = n;
58-
this.src = imageStr;
59-
const format0 = imageStr.match(/^data:image\/(png|jpeg)/)?.[1];
60-
if (!format0 || !['png', 'jpeg'].includes(format0)) throw new Error(`Invalid image format: ${format0}`);
61-
this.format = format0;
62-
this._dims = null;
63-
this.rotated = rotated;
64-
this.upscaled = upscaled;
65-
this.colorMode = colorMode;
66-
/** @type {?ImageBitmap} */
67-
this.imageBitmap = null;
68-
}
69-
}
70-
7145
/**
7246
* @typedef {Object} ImageProperties
7347
* @property {boolean} [rotated]
@@ -126,7 +100,7 @@ export class ImageCache {
126100
colorMode = color ? 'color' : 'gray';
127101
}
128102

129-
let pageAngle = pageMetricsArr[n].angle || 0;
103+
let pageAngle = pageMetricsAll[n].angle || 0;
130104
if (Math.abs(pageAngle) < 0.05) pageAngle = 0;
131105

132106
// If no preference is specified for rotation, default to true.
@@ -213,7 +187,7 @@ export class ImageCache {
213187
if (ImageCache.inputModes.image) {
214188
return ImageCache.nativeSrc[n];
215189
} if (ImageCache.inputModes.pdf) {
216-
const pageMetrics = pageMetricsArr[n];
190+
const pageMetrics = pageMetricsAll[n];
217191
const targetWidth = pageMetrics.dims.width;
218192
const dpi = 300 * (targetWidth / ImageCache.pdfDims300[n].width);
219193
const muPDFScheduler = await ImageCache.getMuPDFScheduler();
@@ -232,7 +206,7 @@ export class ImageCache {
232206
* @param {boolean} [saveNativeImage=true] - Whether the native image should be saved.
233207
*/
234208
static transformImage = async (inputImage, n, props, saveNativeImage = true) => {
235-
let pageAngle = pageMetricsArr[n].angle || 0;
209+
let pageAngle = pageMetricsAll[n].angle || 0;
236210
if (Math.abs(pageAngle) < 0.05) pageAngle = 0;
237211

238212
// If no preference is specified for rotation, default to true.
@@ -245,8 +219,8 @@ export class ImageCache {
245219
await gs.getGeneralScheduler();
246220

247221
const resPromise = (async () => {
248-
// Wait for non-rotated version before replacing with promise
249-
if (typeof process === 'undefined') await gs.initTesseract({ anyOk: true });
222+
// Wait for non-rotated version before replacing with promise
223+
await gs.initTesseract({ anyOk: true });
250224
return gs.recognize({
251225
image: inputImage.src,
252226
options: { rotateRadians: angleArg, upscale: upscaleArg },
@@ -280,7 +254,7 @@ export class ImageCache {
280254
return { native: undefined, binary: undefined };
281255
}
282256

283-
const significantRotation = Math.abs(pageMetricsArr[n].angle || 0) > 0.05;
257+
const significantRotation = Math.abs(pageMetricsAll[n].angle || 0) > 0.05;
284258

285259
const newNative = !ImageCache.native[n] || !imageUtils.compatible(ImageCache.nativeProps[n], props, significantRotation);
286260
const newBinary = !nativeOnly && (!ImageCache.binary[n] || !imageUtils.compatible(ImageCache.binaryProps[n], props, significantRotation));
@@ -426,7 +400,7 @@ export class ImageCache {
426400

427401
ImageCache.pdfDims300.forEach((x, i) => {
428402
const pageDims = { width: Math.round(x.width * pageDPI[i] / 300), height: Math.round(x.height * pageDPI[i] / 300) };
429-
pageMetricsArr[i] = new PageMetrics(pageDims);
403+
pageMetricsAll[i] = new PageMetrics(pageDims);
430404
});
431405

432406
// WIP: Extract fonts embedded in PDFs.

js/coordinates.js

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
// Image Coordinate Space: coordinate space of a particular image
44
// Canvas Coordinate Space: coordinate space of canvas, used for user interactions
55

6-
import { pageMetricsArr } from './containers/dataContainer.js';
6+
import { pageMetricsAll } from './containers/dataContainer.js';
77
import { ImageCache } from './containers/imageContainer.js';
88

99
/**
@@ -27,7 +27,7 @@ function rotateBoundingBox(boundingBox, rotateAngle, n) {
2727
let angleAdjXRect = 0;
2828
let angleAdjYRect = 0;
2929

30-
const pageDims = pageMetricsArr[n].dims;
30+
const pageDims = pageMetricsAll[n].dims;
3131

3232
const sinAngle = Math.sin(rotateAngle * (Math.PI / 180));
3333
const cosAngle = Math.cos(rotateAngle * (Math.PI / 180));
@@ -103,7 +103,7 @@ async function ocrToImage(ocrCoords, n, binary = false) {
103103

104104
if (imageN.rotated) {
105105
// Otherwise, we must also account for rotation applied by the canvas
106-
const rotateAngle = (pageMetricsArr[n].angle || 0) * -1;
106+
const rotateAngle = (pageMetricsAll[n].angle || 0) * -1;
107107

108108
rotateBoundingBox(ocrCoords, rotateAngle, n);
109109
}

js/debug.js

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import { opt } from './containers/app.js';
2-
import { pageMetricsArr } from './containers/dataContainer.js';
2+
import { pageMetricsAll } from './containers/dataContainer.js';
33
import { ImageCache } from './containers/imageContainer.js';
44
import { gs } from './generalWorkerMain.js';
55
import { loadImageElem } from './utils/imageUtils.js';
@@ -125,7 +125,7 @@ export async function renderPageStatic(page) {
125125
const res = gs.renderPageStaticImp({
126126
page,
127127
image,
128-
angle: pageMetricsArr[page.n].angle,
128+
angle: pageMetricsAll[page.n].angle,
129129
});
130130

131131
return res;

js/export/export.js

Lines changed: 56 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
import { inputData, opt } from '../containers/app.js';
22
import {
3-
layoutDataTables, layoutRegions, ocrAll, pageMetricsArr,
3+
layoutDataTables, layoutRegions, ocrAll, pageMetricsAll,
44
} from '../containers/dataContainer.js';
55
import { ImageCache } from '../containers/imageContainer.js';
66
import { reorderOcrPage } from '../modifyOCR.js';
77
import { saveAs } from '../utils/miscUtils.js';
8-
import { writePdf } from './writePdf.js';
8+
import { writePdf } from './pdf/writePdf.js';
99
import { writeHocr } from './writeHocr.js';
1010
import { writeText } from './writeText.js';
1111
import { writeHtml } from './writeHtml.js';
@@ -45,8 +45,8 @@ export async function exportData(format = 'txt', minPage = 0, maxPage = -1) {
4545
const dimsLimit = { width: -1, height: -1 };
4646
if (opt.standardizePageSize) {
4747
for (let i = minPage; i <= maxPage; i++) {
48-
dimsLimit.height = Math.max(dimsLimit.height, pageMetricsArr[i].dims.height);
49-
dimsLimit.width = Math.max(dimsLimit.width, pageMetricsArr[i].dims.width);
48+
dimsLimit.height = Math.max(dimsLimit.height, pageMetricsAll[i].dims.height);
49+
dimsLimit.width = Math.max(dimsLimit.width, pageMetricsAll[i].dims.width);
5050
}
5151
}
5252

@@ -58,10 +58,30 @@ export async function exportData(format = 'txt', minPage = 0, maxPage = -1) {
5858

5959
const rotateText = !rotateBackground;
6060

61+
const includeImages = false;
62+
/** @type {ImageWrapper[]} */
63+
let images = [];
64+
if (includeImages) {
65+
images = await Promise.all(ImageCache.nativeSrc);
66+
}
67+
6168
// Page sizes should not be standardized at this step, as the overlayText/overlayTextImage functions will perform this,
6269
// and assume that the overlay PDF is the same size as the input images.
63-
const pdfStr = await writePdf(ocrDownload, minPage, maxPage, opt.displayMode, rotateText, rotateBackground,
64-
{ width: -1, height: -1 }, opt.confThreshHigh, opt.confThreshMed, opt.overlayOpacity / 100);
70+
const pdfStr = await writePdf({
71+
ocrArr: ocrDownload,
72+
pageMetricsArr: pageMetricsAll,
73+
minpage: minPage,
74+
maxpage: maxPage,
75+
textMode: opt.displayMode,
76+
rotateText,
77+
rotateBackground,
78+
dimsLimit: { width: -1, height: -1 },
79+
confThreshHigh: opt.confThreshHigh,
80+
confThreshMed: opt.confThreshMed,
81+
proofOpacity: opt.overlayOpacity / 100,
82+
images,
83+
includeImages,
84+
});
6585

6686
const enc = new TextEncoder();
6787
const pdfEnc = enc.encode(pdfStr);
@@ -121,7 +141,7 @@ export async function exportData(format = 'txt', minPage = 0, maxPage = -1) {
121141

122142
await w.convertImageStart({ humanReadable: opt.humanReadablePDF });
123143
for (let i = minPage; i < maxPage + 1; i++) {
124-
/** @type {import('../containers/imageContainer.js').ImageWrapper} */
144+
/** @type {ImageWrapper} */
125145
let image;
126146
if (binary) {
127147
image = await ImageCache.getBinary(i, props);
@@ -134,7 +154,7 @@ export async function exportData(format = 'txt', minPage = 0, maxPage = -1) {
134154
// Angle the PDF viewer is instructed to rotate the image by.
135155
// This method is currently only used when rotation is needed but the user's (unrotated) source images are being used.
136156
// If the images are being rendered, then rotation is expected to be applied within the rendering process.
137-
const angleImagePdf = rotateBackground && !renderImage ? (pageMetricsArr[i].angle || 0) * -1 : 0;
157+
const angleImagePdf = rotateBackground && !renderImage ? (pageMetricsAll[i].angle || 0) * -1 : 0;
138158

139159
await w.convertImageAddPage({
140160
image: image.src, i, pagewidth: dimsLimit.width, pageheight: dimsLimit.height, angle: angleImagePdf,
@@ -157,8 +177,19 @@ export async function exportData(format = 'txt', minPage = 0, maxPage = -1) {
157177

158178
w.freeDocument(pdfOverlay);
159179
} else {
160-
const pdfStr = await writePdf(ocrDownload, minPage, maxPage, opt.displayMode, false, true, dimsLimit, opt.confThreshHigh, opt.confThreshMed,
161-
opt.overlayOpacity / 100);
180+
const pdfStr = await writePdf({
181+
ocrArr: ocrDownload,
182+
pageMetricsArr: pageMetricsAll,
183+
minpage: minPage,
184+
maxpage: maxPage,
185+
textMode: opt.displayMode,
186+
rotateText: false,
187+
rotateBackground: true,
188+
dimsLimit,
189+
confThreshHigh: opt.confThreshHigh,
190+
confThreshMed: opt.confThreshMed,
191+
proofOpacity: opt.overlayOpacity / 100,
192+
});
162193

163194
// The PDF is still run through muPDF, even though in eBook mode no background layer is added.
164195
// This is because muPDF cleans up the PDF we made in the previous step, including:
@@ -186,7 +217,7 @@ export async function exportData(format = 'txt', minPage = 0, maxPage = -1) {
186217
w.freeDocument(pdf);
187218
}
188219
} else if (format === 'hocr') {
189-
content = writeHocr(ocrDownload, minPage, maxPage);
220+
content = writeHocr({ ocrData: ocrDownload, minValue: minPage, maxValue: maxPage });
190221
} else if (format === 'html') {
191222
const images = /** @type {Array<ImageWrapper>} */ ([]);
192223
if (opt.includeImages) {
@@ -218,18 +249,29 @@ export async function exportData(format = 'txt', minPage = 0, maxPage = -1) {
218249
ocrPages: ocrDownload, images, minpage: minPage, maxpage: maxPage, reflowText: opt.reflow, removeMargins: opt.removeMargins,
219250
});
220251
} else if (format === 'txt') {
221-
content = writeText(ocrDownload, minPage, maxPage, opt.reflow, false);
252+
content = writeText({
253+
ocrCurrent: ocrDownload,
254+
minpage: minPage,
255+
maxpage: maxPage,
256+
reflowText: opt.reflow,
257+
docxMode: false,
258+
});
222259
// Defining `DISABLE_DOCX_XLSX` disables docx/xlsx exports when using build tools.
223260
// @ts-ignore
224261
} else if (typeof DISABLE_DOCX_XLSX === 'undefined' && format === 'docx') {
225262
// Less common export formats are loaded dynamically to reduce initial load time.
226263
const writeDocx = (await import('./writeDocx.js')).writeDocx;
227-
content = await writeDocx(ocrDownload, minPage, maxPage);
264+
content = await writeDocx({ hocrCurrent: ocrDownload, minpage: minPage, maxpage: maxPage });
228265
// @ts-ignore
229266
} else if (typeof DISABLE_DOCX_XLSX === 'undefined' && format === 'xlsx') {
230267
// Less common export formats are loaded dynamically to reduce initial load time.
231268
const writeXlsx = (await import('./writeTabular.js')).writeXlsx;
232-
content = await writeXlsx(ocrDownload, layoutDataTables.pages, minPage, maxPage);
269+
content = await writeXlsx({
270+
ocrPageArr: ocrDownload,
271+
layoutPageArr: layoutDataTables.pages,
272+
minpage: minPage,
273+
maxpage: maxPage,
274+
});
233275
} else if (format === 'scribe') {
234276
const data = {
235277
ocr: removeCircularRefsOcr(ocrDownload),

js/export/exportDebugCsv.js

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,11 +39,12 @@ export const convertToCsv = (data) => {
3939

4040
/**
4141
*
42-
* @param {Array<OcrPage>} pages
43-
* @param {string} fileName
42+
* @param {Object} params
43+
* @param {Array<OcrPage>} params.pages
44+
* @param {string} params.fileName
4445
* @returns
4546
*/
46-
export const writeDebugCsv = (pages, fileName) => {
47+
export const writeDebugCsv = ({ pages, fileName }) => {
4748
let csvStr = '';
4849

4950
for (let i = 0; i < pages.length; i++) {

0 commit comments

Comments
 (0)