Skip to content

Commit c4bee94

Browse files
authored
Add first pass of Textract support (#45)
1 parent 0f4bc19 commit c4bee94

25 files changed

+402640
-18
lines changed

build-deno-compile.sh

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,15 @@ mkdir -p build
1717
# Build for different platforms
1818
echo "Building for Linux x64..."
1919
deno compile --allow-sys --allow-read --allow-write --target x86_64-unknown-linux-gnu --output build/scribe-linux-x64 cli/scribe.js
20+
# deno compile --allow-sys --allow-read --allow-write --target x86_64-unknown-linux-gnu --output build/scribe-linux-x64 --include mupdf --include fonts --include js/worker cli/scribe.js
2021

2122
echo "Building for macOS x64..."
2223
deno compile --allow-sys --allow-read --allow-write --target x86_64-apple-darwin --output build/scribe-macos-x64 cli/scribe.js
24+
# deno compile --allow-sys --allow-read --allow-write --target x86_64-apple-darwin --output build/scribe-macos-x64 --include mupdf --include fonts --include js/worker cli/scribe.js
2325

2426
echo "Building for Windows x64..."
2527
deno compile --allow-sys --allow-read --allow-write --target x86_64-pc-windows-msvc --output build/scribe-windows-x64.exe cli/scribe.js
28+
# deno compile --allow-sys --allow-read --allow-write --target x86_64-pc-windows-msvc --output build/scribe-windows-x64.exe --include mupdf --include fonts --include js/worker cli/scribe.js
2629

2730
# Create checksums
2831
cd build

fonts/Dingbats.woff

21.5 KB
Binary file not shown.

js/export/writeHtml.js

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,8 @@ export function writeHtml({
137137
top += pageMetricsArr[g].dims.height + 10;
138138
}
139139

140-
if (reflowText) {
140+
// Do not overwrite paragraphs from Abbyy or Textract.
141+
if (reflowText && (!pageObj.textSource || !['textract', 'abbyy'].includes(pageObj.textSource))) {
141142
const angle = pageMetricsArr[g].angle || 0;
142143
assignParagraphs(pageObj, angle);
143144
}

js/export/writeText.js

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,8 @@ export function writeText(ocrCurrent, minpage = 0, maxpage = -1, reflowText = fa
2626

2727
const pageObj = ocrCurrent[g];
2828

29-
if (reflowText) {
29+
// Do not overwrite paragraphs from Abbyy or Textract.
30+
if (reflowText && (!pageObj.textSource || !['textract', 'abbyy'].includes(pageObj.textSource))) {
3031
const angle = pageMetricsArr[g].angle || 0;
3132
assignParagraphs(pageObj, angle);
3233
}

js/fontContainerMain.js

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,35 @@ export async function loadChiSimFont() {
223223
return chiReady;
224224
}
225225

226+
/**
 * In-flight or completed Dingbats load, shared by all callers.
 * Reset to `null` when a load attempt fails so a later call can retry.
 * @type {?Promise<void>}
 */
let dingbatsReady = null;

/**
 * Loads the Dingbats font and stores it on `FontCont.supp.dingbats`.
 *
 * Repeated and concurrent calls share a single load: if a load has already
 * started (or finished), its promise is returned and no work is repeated.
 * If the load fails, the error is propagated to every waiting caller and the
 * cached promise is cleared, so the next call starts a fresh attempt instead
 * of waiting forever on a promise that can never settle.
 *
 * @returns {Promise<void>} Resolves once the font has been loaded.
 * @throws {Error} If the font file cannot be fetched/read or `loadFont` rejects.
 */
export async function loadDingbatsFont() {
  if (dingbatsReady) return dingbatsReady;

  // Logged only when a load is actually started, not on every cached call.
  console.log('Loading Dingbats font');

  const pending = (async () => {
    const fontUrl = new URL('../fonts/Dingbats.woff', import.meta.url);

    let /** @type {ArrayBuffer} */ fontData;
    if (typeof process === 'undefined') {
      const res = await fetch(fontUrl);
      // Fail loudly on 404/500 rather than handing an error page to the font parser.
      if (!res.ok) throw new Error(`Failed to fetch Dingbats font: ${res.status} ${res.statusText}`);
      fontData = await res.arrayBuffer();
    } else {
      const { readFile } = await import('node:fs/promises');
      const fileBuf = await readFile(fontUrl);
      // `Buffer#buffer` is the whole backing ArrayBuffer, which is not guaranteed to
      // start at the file's first byte or end at its last. Copy out exactly the
      // bytes that belong to this Buffer.
      fontData = fileBuf.buffer.slice(fileBuf.byteOffset, fileBuf.byteOffset + fileBuf.byteLength);
    }

    FontCont.supp.dingbats = await loadFont('Dingbats', 'normal', 'sans', fontData, false);
  })();

  dingbatsReady = pending;

  try {
    await pending;
  } catch (err) {
    // Only clear the cache if it still points at this failed attempt.
    if (dingbatsReady === pending) dingbatsReady = null;
    throw err;
  }
}
254+
226255
/**
227256
* Enable or disable font optimization settings.
228257
* This function is used rather than exposing the settings using the `opt` object, as these settings exist on the font container in both the main thread and the worker threads.

js/generalWorkerMain.js

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,7 @@ export async function initGeneralWorker() {
9595
obj.convertPageHocr = wrap('convertPageHocr');
9696
obj.convertPageAbbyy = wrap('convertPageAbbyy');
9797
obj.convertPageStext = wrap('convertPageStext');
98+
obj.convertDocTextract = wrap('convertDocTextract');
9899

99100
obj.optimizeFont = wrap('optimizeFont');
100101

@@ -174,6 +175,12 @@ export class gs {
174175
*/
175176
static convertPageAbbyy = async (args) => (await gs.schedulerInner.addJob('convertPageAbbyy', args));
176177

178+
/**
179+
* @param {Parameters<typeof import('./import/convertDocTextract.js').convertDocTextract>[0]} args
180+
* @returns {ReturnType<typeof import('./import/convertDocTextract.js').convertDocTextract>}
181+
*/
182+
static convertDocTextract = async (args) => (await gs.schedulerInner.addJob('convertDocTextract', args));
183+
177184
/**
178185
* @param {Parameters<typeof import('./import/convertPageStext.js').convertPageStext>[0]} args
179186
* @returns {ReturnType<typeof import('./import/convertPageStext.js').convertPageStext>}

js/global.d.ts

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,10 @@ declare global {
1010
sup: boolean;
1111
dropcap: boolean;
1212
};
13+
14+
// Strings representing supported sources of text.
15+
// `stext` indicates the text was extracted directly from a PDF using mupdf.
16+
type TextSource = null | 'tesseract' | 'textract' | 'abbyy' | 'stext' | 'hocr';
1317

1418
type FontState = {
1519
enableOpt: boolean;
@@ -176,6 +180,53 @@ declare global {
176180
type LayoutDataColumn = import("./objects/layoutObjects.js").LayoutDataColumn;
177181
type LayoutRegion = import("./objects/layoutObjects.js").LayoutRegion;
178182

183+
interface Point {
184+
x: number;
185+
y: number;
186+
}
187+
188+
interface Polygon {
189+
br: Point;
190+
bl: Point;
191+
tr: Point;
192+
tl: Point;
193+
}
194+
195+
interface TextractBoundingBox {
196+
Width: number;
197+
Height: number;
198+
Left: number;
199+
Top: number;
200+
}
201+
202+
interface TextractPoint {
203+
X: number;
204+
Y: number;
205+
}
206+
207+
interface TextractGeometry {
208+
BoundingBox: TextractBoundingBox;
209+
Polygon: TextractPoint[];
210+
RotationAngle: number;
211+
}
212+
213+
interface Relationship {
214+
Type: string;
215+
Ids: string[];
216+
}
217+
218+
/**
 * A single block from an Amazon Textract result.
 *
 * `BlockType` lists every block type the Textract API documents, including the
 * layout, query, signature and table title/footer blocks, so responses produced
 * with those features enabled still type-check.
 *
 * NOTE(review): per the Textract API reference, `Text` is only reported for
 * WORD and LINE blocks and `TextType` only for WORD blocks. They are kept
 * required here so existing type checks are unaffected; confirm the block type
 * before reading them.
 */
interface TextractBlock {
  BlockType:
    | "WORD"
    | "LINE"
    | "PAGE"
    | "KEY_VALUE_SET"
    | "CELL"
    | "MERGED_CELL"
    | "SELECTION_ELEMENT"
    | "TABLE"
    | "TABLE_TITLE"
    | "TABLE_FOOTER"
    | "TITLE"
    | "QUERY"
    | "QUERY_RESULT"
    | "SIGNATURE"
    | "LAYOUT_TEXT"
    | "LAYOUT_TITLE"
    | "LAYOUT_HEADER"
    | "LAYOUT_FOOTER"
    | "LAYOUT_SECTION_HEADER"
    | "LAYOUT_PAGE_NUMBER"
    | "LAYOUT_LIST"
    | "LAYOUT_FIGURE"
    | "LAYOUT_TABLE"
    | "LAYOUT_KEY_VALUE";
  Confidence: number;
  Text: string;
  TextType: "PRINTED" | "HANDWRITING";
  Geometry: TextractGeometry;
  Id: string;
  /** Page the block was detected on; optional in the type, so it may be absent. */
  Page?: number;
  /** Links to related blocks by id, grouped by relationship type. */
  Relationships?: Relationship[];
}
228+
229+
179230
}
180231

181232
export { };

0 commit comments

Comments
 (0)