@@ -20,7 +20,7 @@ const calcFontMetrics = (fontI, fontSize) => {
20
20
if ( os2 . fsSelection >> 7 & 1 ) {
21
21
return {
22
22
fontBoundingBoxAscent : Math . round ( os2 . sTypoAscender * ( fontSize / unitsPerEm ) ) ,
23
- fontBoundingBoxDescent : Math . round ( os2 . sTypoDescender * ( fontSize / unitsPerEm ) ) ,
23
+ fontBoundingBoxDescent : Math . abs ( Math . round ( os2 . sTypoDescender * ( fontSize / unitsPerEm ) ) ) ,
24
24
} ;
25
25
}
26
26
@@ -49,31 +49,64 @@ const makeSmallCapsDivs = (text, fontSizeHTMLSmallCaps) => {
49
49
/**
50
50
* Convert an array of ocrPage objects to HTML.
51
51
*
52
- * @param {Array<OcrPage> } ocrCurrent -
53
- * @param {number } minpage - The first page to include in the document.
54
- * @param {number } maxpage - The last page to include in the document.
55
- * @param {boolean } reflowText - Remove line breaks within what appears to be the same paragraph.
56
- * @param {boolean } removeMargins - Remove the margins from the text.
57
- * @param {?Array<string> } wordIds - An array of word IDs to include in the document.
52
+ * @param {Object } params
53
+ * @param {Array<OcrPage> } params.ocrPages - The OCR page objects to convert to HTML.
54
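+ * @param {Array<ImageWrapper> } [params.images] - Optional page images, indexed by page number. When an entry exists for a page, its `src` is embedded as an `<img>` element in that page's container. An empty array is treated the same as omitting the argument.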
55
+ * @param {number } [params.minpage=0] - The first page to include in the document.
56
+ * @param {number } [params.maxpage=-1] - The last page to include in the document.
57
+ * @param {boolean } [params.reflowText=false] - Remove line breaks within what appears to be the same paragraph.
58
+ * @param {boolean } [params.removeMargins=false] - Remove the margins from the text.
59
+ * @param {?Array<string> } [params.wordIds] - An array of word IDs to include in the document.
58
60
* If omitted, all words are included.
59
61
*/
60
- export function writeHtml ( ocrCurrent , minpage = 0 , maxpage = - 1 , reflowText = false , removeMargins = false , wordIds = null ) {
62
+ export function writeHtml ( {
63
+ ocrPages, images, minpage = 0 , maxpage = - 1 , reflowText = false , removeMargins = false , wordIds = null ,
64
+ } ) {
61
65
const fontsUsed = new Set ( ) ;
62
66
63
- const pad = 5 ;
67
+ const enableOptSaved = FontCont . state . enableOpt ;
68
+ FontCont . state . enableOpt = false ;
69
+
70
+ if ( images && images . length === 0 ) images = undefined ;
71
+
72
+ // Text reflow does not work well yet, so `reflowText` is hard-coded to false for now, regardless of the value passed in.
73
+ reflowText = false ;
64
74
65
75
let bodyStr = '<body>\n' ;
66
76
67
- if ( maxpage === - 1 ) maxpage = ocrCurrent . length - 1 ;
77
+ if ( maxpage === - 1 ) maxpage = ocrPages . length - 1 ;
68
78
69
79
let newLine = false ;
70
80
81
+ const activeLine = {
82
+ left : 0 ,
83
+ y1 : 0 ,
84
+ maxFontBoundingBoxAscentLine : 0 ,
85
+ bodyWordsStr : '' ,
86
+ } ;
87
+
88
+ const addLine = ( ) => {
89
+ if ( activeLine . bodyWordsStr !== '' ) {
90
+ const topHTML = Math . round ( ( activeLine . y1 - activeLine . maxFontBoundingBoxAscentLine ) * 1000 ) / 1000 ;
91
+ bodyStr += ` <div class="scribe-line" style="left:${ activeLine . left } px;top:${ topHTML } px;">\n` ;
92
+ bodyStr += activeLine . bodyWordsStr ;
93
+ bodyStr += ' </div>\n' ;
94
+ }
95
+ activeLine . bodyWordsStr = '' ;
96
+ activeLine . maxFontBoundingBoxAscentLine = 0 ;
97
+ activeLine . y1 = 0 ;
98
+ activeLine . left = 0 ;
99
+ } ;
100
+
71
101
let top = 0 ;
72
102
103
+ let firstPage = true ;
104
+
73
105
for ( let g = minpage ; g <= maxpage ; g ++ ) {
74
- if ( ! ocrCurrent [ g ] || ocrCurrent [ g ] . lines . length === 0 ) continue ;
106
+ // TODO: Pages with no OCR lines are currently skipped entirely; change this so that such pages are still emitted when an image is included.
107
+ if ( ! ocrPages [ g ] || ocrPages [ g ] . lines . length === 0 ) continue ;
75
108
76
- const pageObj = ocrCurrent [ g ] ;
109
+ const pageObj = ocrPages [ g ] ;
77
110
78
111
let minLeft = 0 ;
79
112
let minTop = 0 ;
@@ -89,7 +122,15 @@ export function writeHtml(ocrCurrent, minpage = 0, maxpage = -1, reflowText = fa
89
122
}
90
123
}
91
124
125
+ if ( ! firstPage ) bodyStr += '\n</div>\n' ;
126
+ firstPage = false ;
92
127
bodyStr += ` <div class="scribe-page" id="page${ g } " style="position:absolute;top:${ top } px;">\n` ;
128
+
129
+ const imageObj = images ? images [ g ] : null ;
130
+ if ( imageObj ) {
131
+ bodyStr += ` <img src="${ imageObj . src } ">\n` ;
132
+ }
133
+
93
134
if ( removeMargins ) {
94
135
top += Math . min ( ( maxBottom - minTop ) + 200 , pageMetricsArr [ g ] . dims . height + 10 ) ;
95
136
} else {
@@ -102,12 +143,15 @@ export function writeHtml(ocrCurrent, minpage = 0, maxpage = -1, reflowText = fa
102
143
}
103
144
104
145
let parCurrent = pageObj . lines [ 0 ] . par ;
146
+ let wordObjPrev = /** @type {?OcrWord } */ ( null ) ;
147
+ let rightSideBearingPrev = 0 ;
148
+ let charSpacingHTMLPrev = 0 ;
105
149
106
150
for ( let h = 0 ; h < pageObj . lines . length ; h ++ ) {
107
151
const lineObj = pageObj . lines [ h ] ;
108
152
109
153
if ( reflowText ) {
110
- if ( g > 0 && h === 0 || lineObj . par !== parCurrent ) newLine = true ;
154
+ if ( h === 0 || lineObj . par !== parCurrent ) newLine = true ;
111
155
parCurrent = lineObj . par ;
112
156
} else {
113
157
newLine = true ;
@@ -120,7 +164,20 @@ export function writeHtml(ocrCurrent, minpage = 0, maxpage = -1, reflowText = fa
120
164
if ( wordIds && ! wordIds . includes ( wordObj . id ) ) continue ;
121
165
122
166
if ( newLine ) {
123
- bodyStr += '\n' ;
167
+ wordObjPrev = null ;
168
+
169
+ addLine ( ) ;
170
+
171
+ const scale = 1 ;
172
+
173
+ const {
174
+ charSpacing, leftSideBearing, rightSideBearing, fontSize, charArr, advanceArr, kerningArr, font,
175
+ } = calcWordMetrics ( wordObj ) ;
176
+
177
+ activeLine . y1 = wordObj . line . bbox . bottom + wordObj . line . baseline [ 1 ] - minTop ;
178
+
179
+ activeLine . left = wordObj . bbox . left - minLeft ;
180
+ if ( wordObj . visualCoords ) activeLine . left -= leftSideBearing * scale ;
124
181
} else if ( h > 0 || g > 0 || i > 0 ) {
125
182
bodyStr += ' ' ;
126
183
}
@@ -130,6 +187,7 @@ export function writeHtml(ocrCurrent, minpage = 0, maxpage = -1, reflowText = fa
130
187
const scale = 1 ;
131
188
const angle = 0 ;
132
189
190
+ // HTML exports currently only use raw fonts, as the fonts are retrieved from a CDN.
133
191
const fontI = FontCont . getWordFont ( wordObj ) ;
134
192
fontsUsed . add ( fontI ) ;
135
193
@@ -141,27 +199,23 @@ export function writeHtml(ocrCurrent, minpage = 0, maxpage = -1, reflowText = fa
141
199
142
200
const charSpacingHTML = charSpacing * scale ;
143
201
144
- let x1 = wordObj . bbox . left - minLeft ;
145
202
const y1 = wordObj . line . bbox . bottom + wordObj . line . baseline [ 1 ] - minTop ;
146
203
147
- if ( wordObj . visualCoords ) x1 -= leftSideBearing * scale ;
148
-
149
204
const fontSizeHTML = fontSize * scale ;
150
205
151
206
const metrics = calcFontMetrics ( fontI , fontSizeHTML ) ;
152
207
153
208
const fontSizeHTMLSmallCaps = fontSize * scale * fontI . smallCapsMult ;
154
209
210
+ if ( metrics . fontBoundingBoxAscent > activeLine . maxFontBoundingBoxAscentLine ) {
211
+ activeLine . maxFontBoundingBoxAscentLine = metrics . fontBoundingBoxAscent ;
212
+ }
213
+
155
214
// Align with baseline
156
- const topHTML = Math . round ( ( y1 - metrics . fontBoundingBoxAscent + fontSizeHTML * 0.6 ) * 1000 ) / 1000 ;
215
+ const topHTML = Math . round ( ( y1 - metrics . fontBoundingBoxAscent ) * 1000 ) / 1000 ;
157
216
158
217
let styleStr = '' ;
159
218
160
- const topPadOffset = 5 * Math . sin ( angle * ( Math . PI / 180 ) ) ;
161
- const leftPadOffset = 5 * Math . cos ( angle * ( Math . PI / 180 ) ) ;
162
-
163
- styleStr += `left:${ x1 - leftPadOffset } px;` ;
164
- styleStr += `top:${ topHTML - topPadOffset } px;` ;
165
219
styleStr += `font-size:${ fontSizeHTML } px;` ;
166
220
styleStr += `font-family:${ fontI . fontFaceName } ;` ;
167
221
@@ -170,6 +224,18 @@ export function writeHtml(ocrCurrent, minpage = 0, maxpage = -1, reflowText = fa
170
224
styleStr += `transform:rotate(${ angle } deg);` ;
171
225
}
172
226
227
+ const { fill, opacity } = ocr . getWordFillOpacity ( wordObj , opt . displayMode ,
228
+ opt . confThreshMed , opt . confThreshHigh , opt . overlayOpacity ) ;
229
+
230
+ // Text with opacity 0 is not selectable, so we make it transparent instead.
231
+ if ( opacity === 0 ) {
232
+ styleStr += 'color:transparent;' ;
233
+ styleStr += 'opacity:1;' ;
234
+ } else {
235
+ styleStr += `color:${ fill } ;` ;
236
+ styleStr += `opacity:${ opacity } ;` ;
237
+ }
238
+
173
239
// We cannot make the text uppercase in the input field, as this would result in the text being saved as uppercase.
174
240
// Additionally, while there is a small-caps CSS property, it does not allow for customizing the size of the small caps.
175
241
// Therefore, we handle small caps by making all text print as uppercase using the `text-transform` CSS property,
@@ -182,28 +248,46 @@ export function writeHtml(ocrCurrent, minpage = 0, maxpage = -1, reflowText = fa
182
248
innerHTML = wordStr ;
183
249
}
184
250
251
+ let leftPad = 0 ;
252
+ if ( wordObjPrev ) {
253
+ let bearingAdj = 0 ;
254
+ if ( wordObj . visualCoords ) {
255
+ bearingAdj = leftSideBearing + rightSideBearingPrev ;
256
+ }
257
+
258
+ leftPad = ( wordObj . bbox . left - wordObjPrev . bbox . right - bearingAdj - charSpacingHTMLPrev ) / Math . cos ( angle ) ;
259
+ }
260
+
185
261
styleStr += `letter-spacing:${ charSpacingHTML } px;` ;
186
262
187
263
styleStr += `font-weight:${ fontI . fontFaceWeight } ;` ;
188
264
styleStr += `font-style:${ fontI . fontFaceStyle } ;` ;
265
+ styleStr += `padding-left:${ leftPad } px;` ;
189
266
190
267
// Line height must match the height of the font bounding box for the font metrics to be accurate.
191
268
styleStr += `line-height:${ metrics . fontBoundingBoxAscent + metrics . fontBoundingBoxDescent } px;` ;
192
269
193
- bodyStr += ` <span class="scribe-word" id="${ wordObj . id } " style="${ styleStr } ">${ innerHTML } </span>` ;
270
+ if ( wordObj . style . sup ) {
271
+ const supOffset = Math . round ( wordObj . line . bbox . bottom + wordObj . line . baseline [ 1 ] - wordObj . bbox . bottom ) ;
272
+ styleStr += `vertical-align:${ supOffset } px;` ;
273
+ }
274
+
275
+ activeLine . bodyWordsStr += ` <span class="scribe-word" id="${ wordObj . id } " style="${ styleStr } ">${ innerHTML } </span>\n` ;
276
+
277
+ wordObjPrev = wordObj ;
278
+ rightSideBearingPrev = rightSideBearing ;
279
+ charSpacingHTMLPrev = charSpacingHTML ;
194
280
}
195
281
}
196
282
283
+ addLine ( ) ;
197
284
bodyStr += '\n </div>\n' ;
198
285
199
286
opt . progressHandler ( { n : g , type : 'export' , info : { } } ) ;
200
287
}
201
288
202
289
let styleStr = '<style>\n .scribe-word {\n' ;
203
290
204
- styleStr += ' position:absolute;\n' ;
205
- styleStr += ` padding-left:${ pad } px;\n` ;
206
- styleStr += ` padding-right:${ pad } px;\n` ;
207
291
styleStr += ' z-index:1;\n' ;
208
292
styleStr += ' white-space:nowrap;\n' ;
209
293
if ( opt . kerning ) {
@@ -214,6 +298,12 @@ export function writeHtml(ocrCurrent, minpage = 0, maxpage = -1, reflowText = fa
214
298
215
299
styleStr += ' }\n' ;
216
300
301
+ styleStr += ' .scribe-line {\n' ;
302
+ styleStr += ' font-size:0px;\n' ;
303
+ styleStr += ' position:absolute;\n' ;
304
+ styleStr += ' white-space:nowrap;\n' ;
305
+ styleStr += ' }\n' ;
306
+
217
307
for ( const fontI of fontsUsed ) {
218
308
const cdnPath = 'https://cdn.jsdelivr.net/npm/[email protected] /fonts/all/' ;
219
309
let styleTitleCase = fontI . style . charAt ( 0 ) . toUpperCase ( ) + fontI . style . slice ( 1 ) . toLowerCase ( ) ;
@@ -233,7 +323,11 @@ export function writeHtml(ocrCurrent, minpage = 0, maxpage = -1, reflowText = fa
233
323
234
324
bodyStr += '</body>\n' ;
235
325
236
- const htmlStr = `<html>\n<head>\n${ styleStr } </head>\n${ bodyStr } </html>` ;
326
+ const metaStr = '<meta charset="UTF-8">\n' ;
327
+
328
+ const htmlStr = `<html>\n<head>\n${ metaStr } ${ styleStr } </head>\n${ bodyStr } </html>` ;
329
+
330
+ FontCont . state . enableOpt = enableOptSaved ;
237
331
238
332
return htmlStr ;
239
333
}
0 commit comments