Skip to content

Commit 43e699e

Browse files
committed
Updated HTML export (still experimental); fixed misc bugs
1 parent 56e795f commit 43e699e

File tree

4 files changed

+154
-30
lines changed

4 files changed

+154
-30
lines changed

js/containers/app.js

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@ export class opt {
3131

3232
static removeMargins = false;
3333

34+
static includeImages = false;
35+
3436
static pageBreaks = true;
3537

3638
/** @type {("invis"|"ebook"|"eval"|"proof")} */

js/export/export.js

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -188,7 +188,35 @@ export async function exportData(format = 'txt', minPage = 0, maxPage = -1) {
188188
} else if (format === 'hocr') {
189189
content = writeHocr(ocrDownload, minPage, maxPage);
190190
} else if (format === 'html') {
191-
content = writeHtml(ocrDownload, minPage, maxPage, opt.reflow, opt.removeMargins);
191+
const images = /** @type {Array<ImageWrapper>} */ ([]);
192+
if (opt.includeImages) {
193+
const props = { rotated: opt.autoRotate, upscaled: false, colorMode: opt.colorMode };
194+
const binary = opt.colorMode === 'binary';
195+
196+
// An image could be rendered if either (1) binary is selected or (2) the input data is a PDF.
197+
// Otherwise, the images uploaded by the user are used.
198+
const renderImage = binary || inputData.pdfMode;
199+
200+
// Pre-render to benefit from parallel processing, since the loop below is synchronous.
201+
if (renderImage) await ImageCache.preRenderRange(minPage, maxPage, binary, props);
202+
203+
for (let i = minPage; i < maxPage + 1; i++) {
204+
/** @type {ImageWrapper} */
205+
let image;
206+
if (binary) {
207+
image = await ImageCache.getBinary(i, props);
208+
} else if (inputData.pdfMode) {
209+
image = await ImageCache.getNative(i, props);
210+
} else {
211+
image = await ImageCache.nativeSrc[i];
212+
}
213+
images.push(image);
214+
}
215+
}
216+
217+
content = writeHtml({
218+
ocrPages: ocrDownload, images, minpage: minPage, maxpage: maxPage, reflowText: opt.reflow, removeMargins: opt.removeMargins,
219+
});
192220
} else if (format === 'txt') {
193221
content = writeText(ocrDownload, minPage, maxPage, opt.reflow, false);
194222
// Defining `DISABLE_DOCX_XLSX` disables docx/xlsx exports when using build tools.

js/export/writeHtml.js

Lines changed: 122 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ const calcFontMetrics = (fontI, fontSize) => {
2020
if (os2.fsSelection >> 7 & 1) {
2121
return {
2222
fontBoundingBoxAscent: Math.round(os2.sTypoAscender * (fontSize / unitsPerEm)),
23-
fontBoundingBoxDescent: Math.round(os2.sTypoDescender * (fontSize / unitsPerEm)),
23+
fontBoundingBoxDescent: Math.abs(Math.round(os2.sTypoDescender * (fontSize / unitsPerEm))),
2424
};
2525
}
2626

@@ -49,31 +49,64 @@ const makeSmallCapsDivs = (text, fontSizeHTMLSmallCaps) => {
4949
/**
5050
* Convert an array of ocrPage objects to HTML.
5151
*
52-
* @param {Array<OcrPage>} ocrCurrent -
53-
* @param {number} minpage - The first page to include in the document.
54-
* @param {number} maxpage - The last page to include in the document.
55-
* @param {boolean} reflowText - Remove line breaks within what appears to be the same paragraph.
56-
* @param {boolean} removeMargins - Remove the margins from the text.
57-
* @param {?Array<string>} wordIds - An array of word IDs to include in the document.
52+
* @param {Object} params
53+
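* @param {Array<OcrPage>} params.ocrPages - The OCR page objects to convert, indexed by page number. Missing pages and pages with no lines are skipped.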
54+
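* @param {Array<ImageWrapper>} [params.images] - Optional page images. When provided, the entry at each page's index is embedded in that page as an `<img>` element; an empty array is treated the same as omitting it.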
55+
* @param {number} [params.minpage=0] - The first page to include in the document.
56+
* @param {number} [params.maxpage=-1] - The last page to include in the document.
57+
* @param {boolean} [params.reflowText=false] - Remove line breaks within what appears to be the same paragraph.
58+
* @param {boolean} [params.removeMargins=false] - Remove the margins from the text.
59+
* @param {?Array<string>} [params.wordIds] - An array of word IDs to include in the document.
5860
* If omitted, all words are included.
5961
*/
60-
export function writeHtml(ocrCurrent, minpage = 0, maxpage = -1, reflowText = false, removeMargins = false, wordIds = null) {
62+
export function writeHtml({
63+
ocrPages, images, minpage = 0, maxpage = -1, reflowText = false, removeMargins = false, wordIds = null,
64+
}) {
6165
const fontsUsed = new Set();
6266

63-
const pad = 5;
67+
const enableOptSaved = FontCont.state.enableOpt;
68+
FontCont.state.enableOpt = false;
69+
70+
if (images && images.length === 0) images = undefined;
71+
72+
// This does not work well yet, so hard-code it to false for now.
73+
reflowText = false;
6474

6575
let bodyStr = '<body>\n';
6676

67-
if (maxpage === -1) maxpage = ocrCurrent.length - 1;
77+
if (maxpage === -1) maxpage = ocrPages.length - 1;
6878

6979
let newLine = false;
7080

81+
const activeLine = {
82+
left: 0,
83+
y1: 0,
84+
maxFontBoundingBoxAscentLine: 0,
85+
bodyWordsStr: '',
86+
};
87+
88+
const addLine = () => {
89+
if (activeLine.bodyWordsStr !== '') {
90+
const topHTML = Math.round((activeLine.y1 - activeLine.maxFontBoundingBoxAscentLine) * 1000) / 1000;
91+
bodyStr += ` <div class="scribe-line" style="left:${activeLine.left}px;top:${topHTML}px;">\n`;
92+
bodyStr += activeLine.bodyWordsStr;
93+
bodyStr += ' </div>\n';
94+
}
95+
activeLine.bodyWordsStr = '';
96+
activeLine.maxFontBoundingBoxAscentLine = 0;
97+
activeLine.y1 = 0;
98+
activeLine.left = 0;
99+
};
100+
71101
let top = 0;
72102

103+
let firstPage = true;
104+
73105
for (let g = minpage; g <= maxpage; g++) {
74-
if (!ocrCurrent[g] || ocrCurrent[g].lines.length === 0) continue;
106+
// TODO: change this when an image is included.
107+
if (!ocrPages[g] || ocrPages[g].lines.length === 0) continue;
75108

76-
const pageObj = ocrCurrent[g];
109+
const pageObj = ocrPages[g];
77110

78111
let minLeft = 0;
79112
let minTop = 0;
@@ -89,7 +122,15 @@ export function writeHtml(ocrCurrent, minpage = 0, maxpage = -1, reflowText = fa
89122
}
90123
}
91124

125+
if (!firstPage) bodyStr += '\n</div>\n';
126+
firstPage = false;
92127
bodyStr += ` <div class="scribe-page" id="page${g}" style="position:absolute;top:${top}px;">\n`;
128+
129+
const imageObj = images ? images[g] : null;
130+
if (imageObj) {
131+
bodyStr += ` <img src="${imageObj.src}">\n`;
132+
}
133+
93134
if (removeMargins) {
94135
top += Math.min((maxBottom - minTop) + 200, pageMetricsArr[g].dims.height + 10);
95136
} else {
@@ -102,12 +143,15 @@ export function writeHtml(ocrCurrent, minpage = 0, maxpage = -1, reflowText = fa
102143
}
103144

104145
let parCurrent = pageObj.lines[0].par;
146+
let wordObjPrev = /** @type {?OcrWord} */ (null);
147+
let rightSideBearingPrev = 0;
148+
let charSpacingHTMLPrev = 0;
105149

106150
for (let h = 0; h < pageObj.lines.length; h++) {
107151
const lineObj = pageObj.lines[h];
108152

109153
if (reflowText) {
110-
if (g > 0 && h === 0 || lineObj.par !== parCurrent) newLine = true;
154+
if (h === 0 || lineObj.par !== parCurrent) newLine = true;
111155
parCurrent = lineObj.par;
112156
} else {
113157
newLine = true;
@@ -120,7 +164,20 @@ export function writeHtml(ocrCurrent, minpage = 0, maxpage = -1, reflowText = fa
120164
if (wordIds && !wordIds.includes(wordObj.id)) continue;
121165

122166
if (newLine) {
123-
bodyStr += '\n';
167+
wordObjPrev = null;
168+
169+
addLine();
170+
171+
const scale = 1;
172+
173+
const {
174+
charSpacing, leftSideBearing, rightSideBearing, fontSize, charArr, advanceArr, kerningArr, font,
175+
} = calcWordMetrics(wordObj);
176+
177+
activeLine.y1 = wordObj.line.bbox.bottom + wordObj.line.baseline[1] - minTop;
178+
179+
activeLine.left = wordObj.bbox.left - minLeft;
180+
if (wordObj.visualCoords) activeLine.left -= leftSideBearing * scale;
124181
} else if (h > 0 || g > 0 || i > 0) {
125182
bodyStr += ' ';
126183
}
@@ -130,6 +187,7 @@ export function writeHtml(ocrCurrent, minpage = 0, maxpage = -1, reflowText = fa
130187
const scale = 1;
131188
const angle = 0;
132189

190+
// HTML exports currently only use raw fonts, as the fonts are retrieved from a CDN.
133191
const fontI = FontCont.getWordFont(wordObj);
134192
fontsUsed.add(fontI);
135193

@@ -141,27 +199,23 @@ export function writeHtml(ocrCurrent, minpage = 0, maxpage = -1, reflowText = fa
141199

142200
const charSpacingHTML = charSpacing * scale;
143201

144-
let x1 = wordObj.bbox.left - minLeft;
145202
const y1 = wordObj.line.bbox.bottom + wordObj.line.baseline[1] - minTop;
146203

147-
if (wordObj.visualCoords) x1 -= leftSideBearing * scale;
148-
149204
const fontSizeHTML = fontSize * scale;
150205

151206
const metrics = calcFontMetrics(fontI, fontSizeHTML);
152207

153208
const fontSizeHTMLSmallCaps = fontSize * scale * fontI.smallCapsMult;
154209

210+
if (metrics.fontBoundingBoxAscent > activeLine.maxFontBoundingBoxAscentLine) {
211+
activeLine.maxFontBoundingBoxAscentLine = metrics.fontBoundingBoxAscent;
212+
}
213+
155214
// Align with baseline
156-
const topHTML = Math.round((y1 - metrics.fontBoundingBoxAscent + fontSizeHTML * 0.6) * 1000) / 1000;
215+
const topHTML = Math.round((y1 - metrics.fontBoundingBoxAscent) * 1000) / 1000;
157216

158217
let styleStr = '';
159218

160-
const topPadOffset = 5 * Math.sin(angle * (Math.PI / 180));
161-
const leftPadOffset = 5 * Math.cos(angle * (Math.PI / 180));
162-
163-
styleStr += `left:${x1 - leftPadOffset}px;`;
164-
styleStr += `top:${topHTML - topPadOffset}px;`;
165219
styleStr += `font-size:${fontSizeHTML}px;`;
166220
styleStr += `font-family:${fontI.fontFaceName};`;
167221

@@ -170,6 +224,18 @@ export function writeHtml(ocrCurrent, minpage = 0, maxpage = -1, reflowText = fa
170224
styleStr += `transform:rotate(${angle}deg);`;
171225
}
172226

227+
const { fill, opacity } = ocr.getWordFillOpacity(wordObj, opt.displayMode,
228+
opt.confThreshMed, opt.confThreshHigh, opt.overlayOpacity);
229+
230+
// Text with opacity 0 is not selectable, so we make it transparent instead.
231+
if (opacity === 0) {
232+
styleStr += 'color:transparent;';
233+
styleStr += 'opacity:1;';
234+
} else {
235+
styleStr += `color:${fill};`;
236+
styleStr += `opacity:${opacity};`;
237+
}
238+
173239
// We cannot make the text uppercase in the input field, as this would result in the text being saved as uppercase.
174240
// Additionally, while there is a small-caps CSS property, it does not allow for customizing the size of the small caps.
175241
// Therefore, we handle small caps by making all text print as uppercase using the `text-transform` CSS property,
@@ -182,28 +248,46 @@ export function writeHtml(ocrCurrent, minpage = 0, maxpage = -1, reflowText = fa
182248
innerHTML = wordStr;
183249
}
184250

251+
let leftPad = 0;
252+
if (wordObjPrev) {
253+
let bearingAdj = 0;
254+
if (wordObj.visualCoords) {
255+
bearingAdj = leftSideBearing + rightSideBearingPrev;
256+
}
257+
258+
leftPad = (wordObj.bbox.left - wordObjPrev.bbox.right - bearingAdj - charSpacingHTMLPrev) / Math.cos(angle);
259+
}
260+
185261
styleStr += `letter-spacing:${charSpacingHTML}px;`;
186262

187263
styleStr += `font-weight:${fontI.fontFaceWeight};`;
188264
styleStr += `font-style:${fontI.fontFaceStyle};`;
265+
styleStr += `padding-left:${leftPad}px;`;
189266

190267
// Line height must match the height of the font bounding box for the font metrics to be accurate.
191268
styleStr += `line-height:${metrics.fontBoundingBoxAscent + metrics.fontBoundingBoxDescent}px;`;
192269

193-
bodyStr += ` <span class="scribe-word" id="${wordObj.id}" style="${styleStr}">${innerHTML}</span>`;
270+
if (wordObj.style.sup) {
271+
const supOffset = Math.round(wordObj.line.bbox.bottom + wordObj.line.baseline[1] - wordObj.bbox.bottom);
272+
styleStr += `vertical-align:${supOffset}px;`;
273+
}
274+
275+
activeLine.bodyWordsStr += ` <span class="scribe-word" id="${wordObj.id}" style="${styleStr}">${innerHTML}</span>\n`;
276+
277+
wordObjPrev = wordObj;
278+
rightSideBearingPrev = rightSideBearing;
279+
charSpacingHTMLPrev = charSpacingHTML;
194280
}
195281
}
196282

283+
addLine();
197284
bodyStr += '\n </div>\n';
198285

199286
opt.progressHandler({ n: g, type: 'export', info: { } });
200287
}
201288

202289
let styleStr = '<style>\n .scribe-word {\n';
203290

204-
styleStr += ' position:absolute;\n';
205-
styleStr += ` padding-left:${pad}px;\n`;
206-
styleStr += ` padding-right:${pad}px;\n`;
207291
styleStr += ' z-index:1;\n';
208292
styleStr += ' white-space:nowrap;\n';
209293
if (opt.kerning) {
@@ -214,6 +298,12 @@ export function writeHtml(ocrCurrent, minpage = 0, maxpage = -1, reflowText = fa
214298

215299
styleStr += ' }\n';
216300

301+
styleStr += ' .scribe-line {\n';
302+
styleStr += ' font-size:0px;\n';
303+
styleStr += ' position:absolute;\n';
304+
styleStr += ' white-space:nowrap;\n';
305+
styleStr += ' }\n';
306+
217307
for (const fontI of fontsUsed) {
218308
const cdnPath = 'https://cdn.jsdelivr.net/npm/[email protected]/fonts/all/';
219309
let styleTitleCase = fontI.style.charAt(0).toUpperCase() + fontI.style.slice(1).toLowerCase();
@@ -233,7 +323,11 @@ export function writeHtml(ocrCurrent, minpage = 0, maxpage = -1, reflowText = fa
233323

234324
bodyStr += '</body>\n';
235325

236-
const htmlStr = `<html>\n<head>\n${styleStr}</head>\n${bodyStr}</html>`;
326+
const metaStr = '<meta charset="UTF-8">\n';
327+
328+
const htmlStr = `<html>\n<head>\n${metaStr}${styleStr}</head>\n${bodyStr}</html>`;
329+
330+
FontCont.state.enableOpt = enableOptSaved;
237331

238332
return htmlStr;
239333
}

js/utils/miscUtils.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -399,7 +399,7 @@ export function objectAssignDefined(target, ...sources) {
399399
// Sans/serif lookup for common font families. These should not include spaces or underscores--multi-word font names should be concatenated.
400400
// Fonts that should not be added (both Sans and Serif variants):
401401
// DejaVu
402-
const serifFonts = ['SerifDefault', 'Baskerville', 'Bookman', 'C059', 'Calibri', 'Cambria', 'Century', 'Courier', 'Garamond', 'Georgia',
402+
const serifFonts = ['SerifDefault', 'Baskerville', 'Bembo', 'Bodoni', 'Bookman', 'C059', 'Calibri', 'Cambria', 'Century', 'Cheltenham', 'Courier', 'Garamond', 'Georgia',
403403
'LucidaBright', 'Minion', 'NimbusMono', 'Optima', 'P052', 'Palatino', 'Times'];
404404
const sansFonts = ['SansDefault', 'Avenir', 'Arial', 'Calibri', 'Candara', 'Carlito', 'Comic', 'Franklin', 'Futura', 'Gotham', 'Gothic',
405405
'Helvetica', 'Impact', 'Interstate', 'Myriad', 'Tahoma', 'Trebuchet', 'Univers', 'Verdana'];

0 commit comments

Comments
 (0)