Skip to content

Commit f586b20

Browse files
committed
Added text import function
1 parent a5a5d8d commit f586b20

File tree

8 files changed

+338
-17
lines changed

8 files changed

+338
-17
lines changed

js/generalWorkerMain.js

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@ export async function initGeneralWorker() {
9696
obj.convertPageAbbyy = wrap('convertPageAbbyy');
9797
obj.convertPageStext = wrap('convertPageStext');
9898
obj.convertDocTextract = wrap('convertDocTextract');
99+
obj.convertPageText = wrap('convertPageText');
99100

100101
obj.optimizeFont = wrap('optimizeFont');
101102

@@ -187,6 +188,12 @@ export class gs {
187188
*/
188189
static convertPageStext = async (args) => (await gs.schedulerInner.addJob('convertPageStext', args));
189190

191+
/**
192+
* @param {Parameters<typeof import('./import/convertPageText.js').convertPageText>[0]} args
193+
* @returns {ReturnType<typeof import('./import/convertPageText.js').convertPageText>}
194+
*/
195+
static convertPageText = async (args) => (await gs.schedulerInner.addJob('convertPageText', args));
196+
190197
/**
191198
* @param {Parameters<typeof import('./worker/optimizeFontModule.js').optimizeFont>[0]} args
192199
* @returns {ReturnType<typeof import('./worker/optimizeFontModule.js').optimizeFont>}

js/global.d.ts

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,8 @@ declare global {
1313

1414
// Strings representing supported sources of text.
1515
// `stext` indicates the text was extracted directly from a PDF using mupdf.
16-
type TextSource = null | 'tesseract' | 'textract' | 'abbyy' | 'stext' | 'hocr';
17-
16+
type TextSource = null | 'tesseract' | 'textract' | 'abbyy' | 'stext' | 'hocr' | 'text';
17+
1818
type FontState = {
1919
enableOpt: boolean;
2020
forceOpt: boolean;
@@ -24,7 +24,7 @@ declare global {
2424
sansDefaultName: string;
2525
glyphSet: null | 'latin' | 'all';
2626
charMetrics: { [key: string]: CharMetricsFamily };
27-
}
27+
}
2828

2929
type ScribeSaveData = {
3030
ocr: OcrPage[];
@@ -33,8 +33,8 @@ declare global {
3333
layoutDataTables: LayoutDataTablePage[];
3434
}
3535

36-
type StyleLookup = ('normal'|'bold'|'italic'|'boldItalic');
37-
36+
type StyleLookup = ('normal' | 'bold' | 'italic' | 'boldItalic');
37+
3838
// OCR objects
3939
type OcrPage = import("./objects/ocrObjects.js").OcrPage;
4040
type OcrLine = import("./objects/ocrObjects.js").OcrLine;
@@ -216,7 +216,7 @@ declare global {
216216
}
217217

218218
interface TextractBlock {
219-
BlockType: "WORD" | "LINE" | "PAGE" | "KEY_VALUE_SET" | "CELL" | "MERGED_CELL" | "SELECTION_ELEMENT" | "TABLE";
219+
BlockType: "WORD" | "LINE" | "PAGE" | "KEY_VALUE_SET" | "CELL" | "MERGED_CELL" | "SELECTION_ELEMENT" | "TABLE";
220220
Confidence: number;
221221
Text: string;
222222
TextType: "PRINTED" | "HANDWRITING";

js/import/convertPageText.js

Lines changed: 276 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,276 @@
1+
import ocr from '../objects/ocrObjects.js';
2+
import { LayoutDataTablePage } from '../objects/layoutObjects.js';
3+
import { calcWordCharMetrics } from '../utils/fontUtils.js';
4+
import { FontCont } from '../containers/fontContainer.js';
5+
6+
// Layout constants for text laid out by this module.
// Font requested from `FontCont` and recorded on every generated word.
const FONT_FAMILY = 'Times New Roman';
// Size passed to `getTextAdvance` when measuring tokens.
const FONT_SIZE = 12;
// Extra advance added for each gap between adjacent characters when measuring a token.
const CHAR_SPACING = 1;
// Vertical distance between consecutive baselines (1.2 x FONT_SIZE).
const LINE_HEIGHT = 14.4;
// Distance from the baseline up to the top of a line box (0.8 x FONT_SIZE).
const ASCENDER_HEIGHT = 9.6;
// Distance from the baseline down to the bottom of a line box (0.2 x FONT_SIZE).
const DESCENDER_HEIGHT = 2.4;

/**
 * Cached opentype font for `FONT_FAMILY`; loaded on first use and reused afterwards.
 * @type {?opentype.Font}
 */
let fontOpentype = null;
15+
16+
/**
17+
* Calculates the advance of a string in pixels.
18+
* @param {string} text
19+
* @param {number} size
20+
* @param {opentype.Font} font
21+
*/
22+
function getTextAdvance(text, size, font) {
23+
const { advanceArr, kerningArr } = calcWordCharMetrics(text, font);
24+
25+
const advanceTotal = advanceArr.reduce((a, b) => a + b, 0);
26+
const kerningTotal = kerningArr.reduce((a, b) => a + b, 0);
27+
28+
const wordWidth1 = (advanceTotal + kerningTotal) * (size / font.unitsPerEm);
29+
const spacingTotalPx = (text.length - 1) * CHAR_SPACING;
30+
const wordWidth = wordWidth1 + spacingTotalPx;
31+
32+
return wordWidth;
33+
}
34+
35+
/**
36+
* Splits text into words, preserving whitespace information
37+
* @param {string} line - The line of text
38+
* @returns {Array<{text: string, isWhitespace: boolean}>} Array of word objects
39+
*/
40+
function splitIntoWords(line) {
41+
const words = [];
42+
let currentWord = '';
43+
let isInWhitespace = false;
44+
45+
for (let i = 0; i < line.length; i++) {
46+
const char = line[i];
47+
const charIsWhitespace = /\s/.test(char);
48+
49+
if (charIsWhitespace !== isInWhitespace) {
50+
if (currentWord.length > 0) {
51+
words.push({ text: currentWord, isWhitespace: isInWhitespace });
52+
currentWord = '';
53+
}
54+
isInWhitespace = charIsWhitespace;
55+
}
56+
currentWord += char;
57+
}
58+
59+
if (currentWord.length > 0) {
60+
words.push({ text: currentWord, isWhitespace: isInWhitespace });
61+
}
62+
63+
return words;
64+
}
65+
66+
/**
67+
* Convert raw text to internal OCR format
68+
* @param {Object} params
69+
* @param {string} params.textStr - Raw text content
70+
* @param {?{width: number, height: number}} [params.pageDims] - Page dimensions (will be calculated if not provided)
71+
*/
72+
export async function convertPageText({ textStr, pageDims = null }) {
73+
let pageIndex = 0;
74+
75+
if (!fontOpentype) {
76+
fontOpentype = (await FontCont.getFont({ font: FONT_FAMILY })).opentype;
77+
}
78+
79+
const lines = textStr.split(/\r?\n/);
80+
81+
if (!pageDims) {
82+
pageDims = { width: 612, height: 792 }; // Default to letter size (8.5 x 11 inches)
83+
}
84+
85+
let pageObj = new ocr.OcrPage(pageIndex, pageDims);
86+
pageObj.textSource = 'text';
87+
88+
if (lines.length === 0 || lines.every((line) => line.trim() === '')) {
89+
const warn = { char: 'char_error' };
90+
return {
91+
pageObj,
92+
charMetricsObj: {},
93+
dataTables: new LayoutDataTablePage(0),
94+
warn,
95+
};
96+
}
97+
98+
let tablesPage = new LayoutDataTablePage(0);
99+
const pagesOut = [{ pageObj, dataTables: tablesPage }];
100+
const margin = 20;
101+
const availableWidth = pageDims.width - margin * 2;
102+
103+
let currentY = margin + ASCENDER_HEIGHT;
104+
105+
for (let lineIndex = 0; lineIndex < lines.length; lineIndex++) {
106+
const lineText = lines[lineIndex];
107+
108+
if (lineText.length === 0 || lineText.trim().length === 0) {
109+
currentY += LINE_HEIGHT;
110+
if (currentY + DESCENDER_HEIGHT > pageDims.height - margin) {
111+
pageIndex++;
112+
const newPage = new ocr.OcrPage(pageIndex, pageDims);
113+
newPage.textSource = 'text';
114+
const newTables = new LayoutDataTablePage(0);
115+
pagesOut.push({ pageObj: newPage, dataTables: newTables });
116+
pageObj = newPage;
117+
tablesPage = newTables;
118+
currentY = margin + ASCENDER_HEIGHT;
119+
}
120+
continue;
121+
}
122+
123+
const wordTokens = splitIntoWords(lineText);
124+
125+
const parLines = [];
126+
let parRight = margin;
127+
128+
for (let idx = 0; idx < wordTokens.length;) {
129+
if (currentY + DESCENDER_HEIGHT > pageDims.height - margin) {
130+
if (parLines.length > 0) {
131+
const parBbox = {
132+
left: margin,
133+
top: parLines[0].bbox.top,
134+
right: parRight,
135+
bottom: parLines[parLines.length - 1].bbox.bottom,
136+
};
137+
const parObj = new ocr.OcrPar(pageObj, parBbox);
138+
parObj.lines = parLines;
139+
for (const ln of parLines) ln.par = parObj;
140+
pageObj.pars.push(parObj);
141+
parLines.length = 0;
142+
parRight = margin;
143+
}
144+
pageIndex++;
145+
const newPage = new ocr.OcrPage(pageIndex, pageDims);
146+
newPage.textSource = 'text';
147+
const newTables = new LayoutDataTablePage(0);
148+
pagesOut.push({ pageObj: newPage, dataTables: newTables });
149+
pageObj = newPage;
150+
tablesPage = newTables;
151+
currentY = margin + ASCENDER_HEIGHT;
152+
}
153+
154+
const baseline = [0, DESCENDER_HEIGHT];
155+
const lineTop = Math.round(currentY - ASCENDER_HEIGHT);
156+
const lineBottom = Math.round(currentY + DESCENDER_HEIGHT);
157+
158+
let currentX = margin;
159+
let widthSoFar = 0;
160+
161+
const lineBbox = {
162+
left: margin,
163+
top: lineTop,
164+
right: margin,
165+
bottom: lineBottom,
166+
};
167+
const lineObj = new ocr.OcrLine(
168+
pageObj,
169+
lineBbox,
170+
baseline,
171+
ASCENDER_HEIGHT,
172+
ASCENDER_HEIGHT - DESCENDER_HEIGHT,
173+
);
174+
175+
let lastConsumed = idx;
176+
for (let j = idx; j < wordTokens.length; j++) {
177+
const tok = wordTokens[j];
178+
const tokWidth = getTextAdvance(tok.text, FONT_SIZE, fontOpentype);
179+
180+
if (tok.isWhitespace) {
181+
if (lineObj.words.length === 0) {
182+
// leading whitespace allowed if it fits
183+
if (widthSoFar + tokWidth > availableWidth) break;
184+
currentX += tokWidth;
185+
widthSoFar += tokWidth;
186+
lastConsumed = j + 1;
187+
} else {
188+
// trailing/middle whitespace (allowed even if it exceeds width)
189+
currentX += tokWidth;
190+
widthSoFar += tokWidth;
191+
lastConsumed = j + 1;
192+
}
193+
} else {
194+
if (lineObj.words.length > 0 && widthSoFar + tokWidth > availableWidth) {
195+
// wrap before this word
196+
break;
197+
}
198+
// place the word
199+
const wordBbox = {
200+
left: Math.round(currentX),
201+
top: lineTop,
202+
right: Math.round(currentX + tokWidth),
203+
bottom: lineBottom,
204+
};
205+
const wordId = `word_${pageIndex + 1}_${pageObj.lines.length + 1}_${lineObj.words.length + 1}`;
206+
const wordObj = new ocr.OcrWord(lineObj, tok.text, wordBbox, wordId);
207+
wordObj.conf = 100;
208+
wordObj.style.font = FONT_FAMILY;
209+
lineObj.words.push(wordObj);
210+
211+
currentX += tokWidth;
212+
widthSoFar += tokWidth;
213+
lastConsumed = j + 1;
214+
}
215+
}
216+
217+
// Extreme edge case: force place a long word when nothing fit and next token is a non-whitespace word
218+
if (lineObj.words.length === 0) {
219+
const nextTok = wordTokens[idx];
220+
if (nextTok && !nextTok.isWhitespace) {
221+
const tokWidth = getTextAdvance(nextTok.text, FONT_SIZE, fontOpentype);
222+
const wordBbox = {
223+
left: Math.round(currentX),
224+
top: lineTop,
225+
right: Math.round(currentX + tokWidth),
226+
bottom: lineBottom,
227+
};
228+
const wordId = `word_${pageIndex + 1}_${pageObj.lines.length + 1}_${lineObj.words.length + 1}`;
229+
const wordObj = new ocr.OcrWord(lineObj, nextTok.text, wordBbox, wordId);
230+
wordObj.conf = 100;
231+
wordObj.style.font = FONT_FAMILY;
232+
lineObj.words.push(wordObj);
233+
currentX += tokWidth;
234+
widthSoFar += tokWidth;
235+
lastConsumed = idx + 1;
236+
} else {
237+
// Can't place oversized leading whitespace; stop processing this paragraph
238+
break;
239+
}
240+
}
241+
242+
if (lineObj.words.length > 0) {
243+
lineObj.bbox = {
244+
left: lineObj.words[0].bbox.left,
245+
top: lineTop,
246+
right: Math.round(currentX),
247+
bottom: lineBottom,
248+
};
249+
250+
pageObj.lines.push(lineObj);
251+
parLines.push(lineObj);
252+
parRight = Math.max(parRight, lineObj.bbox.right);
253+
254+
currentY += LINE_HEIGHT;
255+
idx = lastConsumed;
256+
}
257+
}
258+
259+
if (parLines.length > 0) {
260+
const parBbox = {
261+
left: margin,
262+
top: parLines[0].bbox.top,
263+
right: parRight,
264+
bottom: parLines[parLines.length - 1].bbox.bottom,
265+
};
266+
const parObj = new ocr.OcrPar(pageObj, parBbox);
267+
parObj.lines = parLines;
268+
for (const ln of parLines) ln.par = parObj;
269+
pageObj.pars.push(parObj);
270+
}
271+
}
272+
273+
pageObj.angle = 0;
274+
275+
return pagesOut;
276+
}

js/import/import.js

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -158,7 +158,7 @@ export async function sortInputFiles(files) {
158158
if (['png', 'jpeg', 'jpg'].includes(fileExt)) {
159159
imageFilesAll.push(file);
160160
// All .gz files are assumed to be OCR data (xml) since all other file types can be compressed already
161-
} else if (['hocr', 'xml', 'html', 'gz', 'stext', 'json'].includes(fileExt)) {
161+
} else if (['hocr', 'xml', 'html', 'gz', 'stext', 'json', 'txt'].includes(fileExt)) {
162162
ocrFilesAll.push(file);
163163
} else if (['scribe'].includes(fileExt)) {
164164
scribeFilesAll.push(file);
@@ -352,6 +352,7 @@ export async function importFiles(files) {
352352
let abbyyMode = false;
353353
let textractMode = false;
354354
let reimportHocrMode = false;
355+
let textMode = false;
355356

356357
if (inputData.pdfMode) {
357358
const pdfFile = pdfFiles[0];
@@ -437,6 +438,7 @@ export async function importFiles(files) {
437438

438439
stextMode = ocrData.stextMode;
439440
textractMode = ocrData.textractMode;
441+
textMode = ocrData.textMode;
440442
}
441443

442444
let pageCountOcr = ocrAllRaw.active?.length || ocrAll.active?.length || 0;
@@ -491,11 +493,12 @@ export async function importFiles(files) {
491493
}
492494

493495
if (xmlModeImport) {
494-
/** @type {("hocr" | "abbyy" | "stext" | "textract")} */
496+
/** @type {("hocr" | "abbyy" | "stext" | "textract" | "text")} */
495497
let format = 'hocr';
496498
if (abbyyMode) format = 'abbyy';
497499
if (stextMode) format = 'stext';
498500
if (textractMode) format = 'textract';
501+
if (textMode) format = 'text';
499502

500503
// Process HOCR using web worker, reading from file first if that has not been done already
501504
await convertOCR(ocrAllRaw.active, true, format, oemName, reimportHocrMode, pageMetricsArr).then(async () => {
@@ -549,11 +552,12 @@ export async function importFilesSupp(files, ocrName) {
549552
opt.warningHandler(warningHTML);
550553
}
551554

552-
/** @type {("hocr" | "abbyy" | "stext" | "textract")} */
555+
/** @type {("hocr" | "abbyy" | "stext" | "textract" | "text")} */
553556
let format = 'hocr';
554557
if (ocrData.abbyyMode) format = 'abbyy';
555558
if (ocrData.stextMode) format = 'stext';
556559
if (ocrData.textractMode) format = 'textract';
560+
if (ocrData.textMode) format = 'text';
557561

558562
await convertOCR(ocrData.hocrRaw, false, format, ocrName, ocrData.reimportHocrMode);
559563
}

0 commit comments

Comments
 (0)