@@ -4,11 +4,12 @@ import { calcWordCharMetrics } from '../utils/fontUtils.js';
4
4
import { FontCont } from '../containers/fontContainer.js' ;
5
5
6
6
// Font family requested from `FontCont.getFont` when laying out plain text.
const FONT_FAMILY = 'Times New Roman' ;
7
- const FONT_SIZE = 12 ;
8
- const CHAR_SPACING = 1 ;
7
// Font size handed to `getTextWidth`, which scales font units by `size / unitsPerEm`.
+ const FONT_SIZE = 14 ;
8
// Extra width added once per gap between adjacent characters of a token.
+ const CHAR_SPACING = 0 ;
9
// Extra width added to every whitespace token on top of its measured width.
+ const WORD_SPACING = 0 ;
9
10
// Amount `currentY` advances for a blank line; half of it offsets the first line from the top margin.
// NOTE(review): 14.4 equals 1.2 x the previous FONT_SIZE of 12 and was not changed when
// FONT_SIZE became 14 -- confirm this line height is still intended.
const LINE_HEIGHT = 14.4 ;
10
- const ASCENDER_HEIGHT = 9.6 ;
11
- const DESCENDER_HEIGHT = 2.4 ;
11
// Space kept clear at the top and bottom of each page; used for the starting Y and the page-overflow check.
+ const MARGIN_VERTICAL = 30 ;
12
// Space kept clear at the left and right of each page; used for the starting X and the available line width.
+ const MARGIN_HORIZONTAL = 20 ;
12
13
13
14
/**
 * Cached opentype font object; starts as `null` and is assigned from
 * `FontCont.getFont({ font: FONT_FAMILY })` inside `convertPageText`.
 * @type {?opentype.Font }
 */
14
15
let fontOpentype = null ;
@@ -19,15 +20,24 @@ let fontOpentype = null;
19
20
* @param {number } size
20
21
* @param {opentype.Font } font
21
22
*/
22
function getTextWidth(text, size, font) {
  // Returns the visible width of `text` at font size `size`: the summed glyph
  // advances and kerning, minus the empty space before the first glyph and
  // after the last glyph, scaled by `size / font.unitsPerEm`, plus
  // `CHAR_SPACING` for every gap between adjacent characters.

  // An empty string has no glyphs to measure. Without this guard,
  // `text.at(-1)` is `undefined`, the last-advance lookup yields `NaN`,
  // and the spacing term evaluates to `-CHAR_SPACING`.
  if (text.length === 0) return 0;

  const { advanceArr, kerningArr } = calcWordCharMetrics(text, font);

  const advanceTotal = advanceArr.reduce((a, b) => a + b, 0);
  const kerningTotal = kerningArr.reduce((a, b) => a + b, 0);

  const wordLastGlyphMetrics = font.charToGlyph(text.at(-1)).getMetrics();
  const wordFirstGlyphMetrics = font.charToGlyph(text[0]).getMetrics();

  // The `leftSideBearing`/`rightSideBearing` numbers reported by Opentype.js are not accurate for mono-spaced fonts, so `xMin`/`xMax` are used instead.
  const wordLeftBearing = wordFirstGlyphMetrics.xMin || 0;
  const lastGlyphMax = wordLastGlyphMetrics.xMax || 0;
  // Right bearing = the last glyph's advance minus the right edge of its outline.
  const wordRightBearing = advanceArr.at(-1) - lastGlyphMax;

  // Width in font units, with the leading and trailing bearings removed.
  const wordWidth1 = advanceTotal + kerningTotal - wordLeftBearing - wordRightBearing;
  const wordWidth1Px = wordWidth1 * (size / font.unitsPerEm);

  // Spacing applies between characters only, hence `length - 1` gaps.
  const spacingTotalPx = (text.length - 1) * CHAR_SPACING;
  const wordWidth = wordWidth1Px + spacingTotalPx;

  return wordWidth;
}
@@ -76,6 +86,9 @@ export async function convertPageText({ textStr, pageDims = null }) {
76
86
fontOpentype = ( await FontCont . getFont ( { font : FONT_FAMILY } ) ) . opentype ;
77
87
}
78
88
89
+ const ASCENDER_HEIGHT = fontOpentype . ascender * ( FONT_SIZE / fontOpentype . unitsPerEm ) ;
90
+ const DESCENDER_HEIGHT = fontOpentype . descender * ( FONT_SIZE / fontOpentype . unitsPerEm ) ;
91
+
79
92
const lines = textStr . split ( / \r ? \n / ) ;
80
93
81
94
if ( ! pageDims ) {
@@ -97,39 +110,38 @@ export async function convertPageText({ textStr, pageDims = null }) {
97
110
98
111
let tablesPage = new LayoutDataTablePage ( 0 ) ;
99
112
const pagesOut = [ { pageObj, dataTables : tablesPage } ] ;
100
- const margin = 20 ;
101
- const availableWidth = pageDims . width - margin * 2 ;
113
+ const availableWidth = pageDims . width - MARGIN_HORIZONTAL * 2 ;
102
114
103
- let currentY = margin + ASCENDER_HEIGHT ;
115
+ let currentY = MARGIN_VERTICAL + LINE_HEIGHT / 2 ;
104
116
105
117
for ( let lineIndex = 0 ; lineIndex < lines . length ; lineIndex ++ ) {
106
118
const lineText = lines [ lineIndex ] ;
107
119
108
120
if ( lineText . length === 0 || lineText . trim ( ) . length === 0 ) {
109
121
currentY += LINE_HEIGHT ;
110
- if ( currentY + DESCENDER_HEIGHT > pageDims . height - margin ) {
122
+ if ( currentY + FONT_SIZE > pageDims . height - MARGIN_VERTICAL ) {
111
123
pageIndex ++ ;
112
124
const newPage = new ocr . OcrPage ( pageIndex , pageDims ) ;
113
125
newPage . textSource = 'text' ;
114
126
const newTables = new LayoutDataTablePage ( 0 ) ;
115
127
pagesOut . push ( { pageObj : newPage , dataTables : newTables } ) ;
116
128
pageObj = newPage ;
117
129
tablesPage = newTables ;
118
- currentY = margin + ASCENDER_HEIGHT ;
130
+ currentY = MARGIN_VERTICAL + LINE_HEIGHT / 2 ;
119
131
}
120
132
continue ;
121
133
}
122
134
123
135
const wordTokens = splitIntoWords ( lineText ) ;
124
136
125
137
const parLines = [ ] ;
126
- let parRight = margin ;
138
+ let parRight = MARGIN_HORIZONTAL ;
127
139
128
140
for ( let idx = 0 ; idx < wordTokens . length ; ) {
129
- if ( currentY + DESCENDER_HEIGHT > pageDims . height - margin ) {
141
+ if ( currentY + FONT_SIZE > pageDims . height - MARGIN_VERTICAL ) {
130
142
if ( parLines . length > 0 ) {
131
143
const parBbox = {
132
- left : margin ,
144
+ left : MARGIN_HORIZONTAL ,
133
145
top : parLines [ 0 ] . bbox . top ,
134
146
right : parRight ,
135
147
bottom : parLines [ parLines . length - 1 ] . bbox . bottom ,
@@ -139,7 +151,7 @@ export async function convertPageText({ textStr, pageDims = null }) {
139
151
for ( const ln of parLines ) ln . par = parObj ;
140
152
pageObj . pars . push ( parObj ) ;
141
153
parLines . length = 0 ;
142
- parRight = margin ;
154
+ parRight = MARGIN_HORIZONTAL ;
143
155
}
144
156
pageIndex ++ ;
145
157
const newPage = new ocr . OcrPage ( pageIndex , pageDims ) ;
@@ -148,34 +160,35 @@ export async function convertPageText({ textStr, pageDims = null }) {
148
160
pagesOut . push ( { pageObj : newPage , dataTables : newTables } ) ;
149
161
pageObj = newPage ;
150
162
tablesPage = newTables ;
151
- currentY = margin + ASCENDER_HEIGHT ;
163
+ currentY = MARGIN_VERTICAL + LINE_HEIGHT / 2 ;
152
164
}
153
165
154
166
const baseline = [ 0 , DESCENDER_HEIGHT ] ;
155
167
const lineTop = Math . round ( currentY - ASCENDER_HEIGHT ) ;
156
168
const lineBottom = Math . round ( currentY + DESCENDER_HEIGHT ) ;
157
169
158
- let currentX = margin ;
170
+ let currentX = MARGIN_HORIZONTAL ;
159
171
let widthSoFar = 0 ;
160
172
161
173
const lineBbox = {
162
- left : margin ,
174
+ left : MARGIN_HORIZONTAL ,
163
175
top : lineTop ,
164
- right : margin ,
176
+ right : MARGIN_HORIZONTAL ,
165
177
bottom : lineBottom ,
166
178
} ;
167
179
const lineObj = new ocr . OcrLine (
168
180
pageObj ,
169
181
lineBbox ,
170
182
baseline ,
171
183
ASCENDER_HEIGHT ,
172
- ASCENDER_HEIGHT - DESCENDER_HEIGHT ,
184
+ null ,
173
185
) ;
174
186
175
187
let lastConsumed = idx ;
176
188
for ( let j = idx ; j < wordTokens . length ; j ++ ) {
177
189
const tok = wordTokens [ j ] ;
178
- const tokWidth = getTextAdvance ( tok . text , FONT_SIZE , fontOpentype ) ;
190
+ let tokWidth = getTextWidth ( tok . text , FONT_SIZE , fontOpentype ) ;
191
+ if ( tok . isWhitespace ) tokWidth += WORD_SPACING ;
179
192
180
193
if ( tok . isWhitespace ) {
181
194
if ( lineObj . words . length === 0 ) {
@@ -218,7 +231,7 @@ export async function convertPageText({ textStr, pageDims = null }) {
218
231
if ( lineObj . words . length === 0 ) {
219
232
const nextTok = wordTokens [ idx ] ;
220
233
if ( nextTok && ! nextTok . isWhitespace ) {
221
- const tokWidth = getTextAdvance ( nextTok . text , FONT_SIZE , fontOpentype ) ;
234
+ const tokWidth = getTextWidth ( nextTok . text , FONT_SIZE , fontOpentype ) ;
222
235
const wordBbox = {
223
236
left : Math . round ( currentX ) ,
224
237
top : lineTop ,
@@ -258,7 +271,7 @@ export async function convertPageText({ textStr, pageDims = null }) {
258
271
259
272
if ( parLines . length > 0 ) {
260
273
const parBbox = {
261
- left : margin ,
274
+ left : MARGIN_HORIZONTAL ,
262
275
top : parLines [ 0 ] . bbox . top ,
263
276
right : parRight ,
264
277
bottom : parLines [ parLines . length - 1 ] . bbox . bottom ,
0 commit comments