|
| 1 | +import ocr from '../objects/ocrObjects.js'; |
| 2 | +import { LayoutDataTablePage } from '../objects/layoutObjects.js'; |
| 3 | +import { calcWordCharMetrics } from '../utils/fontUtils.js'; |
| 4 | +import { FontCont } from '../containers/fontContainer.js'; |
| 5 | + |
// Font family requested from FontCont for metrics and recorded as the style of every generated word.
const FONT_FAMILY = 'Times New Roman';
// Font size handed to getTextAdvance when measuring tokens.
const FONT_SIZE = 12;
// Extra spacing added between each pair of adjacent characters when measuring a string.
const CHAR_SPACING = 1;
// Vertical distance between consecutive baselines (1.2 x FONT_SIZE).
const LINE_HEIGHT = 14.4;
// Distance from the baseline up to the top of a line box (0.8 x FONT_SIZE).
const ASCENDER_HEIGHT = 9.6;
// Distance from the baseline down to the bottom of a line box (0.2 x FONT_SIZE).
const DESCENDER_HEIGHT = 2.4;

/**
 * Font object cached at module level; convertPageText loads it on first use and reuses it afterwards.
 * @type {?opentype.Font}
 */
let fontOpentype = null;
| 15 | + |
/**
 * Measures the horizontal advance of a string: the per-character advances plus kerning
 * reported by calcWordCharMetrics, scaled from font units to `size`, plus CHAR_SPACING
 * for every gap between adjacent characters.
 * @param {string} text - String to measure.
 * @param {number} size - Font size the font-unit metrics are scaled to.
 * @param {opentype.Font} font - Font supplying the metrics; `unitsPerEm` is read for scaling.
 * @returns {number} Total advance in the units of `size`; 0 for an empty string.
 */
function getTextAdvance(text, size, font) {
  // An empty string has no glyphs and no gaps. Without this guard the spacing term below
  // would evaluate to -CHAR_SPACING and produce a negative width.
  if (text.length === 0) return 0;

  const { advanceArr, kerningArr } = calcWordCharMetrics(text, font);
  const sum = (values) => values.reduce((total, value) => total + value, 0);

  const glyphWidth = (sum(advanceArr) + sum(kerningArr)) * (size / font.unitsPerEm);
  // One gap between each pair of adjacent characters.
  const spacingWidth = (text.length - 1) * CHAR_SPACING;

  return glyphWidth + spacingWidth;
}
| 34 | + |
/**
 * Breaks a line into consecutive runs of whitespace and non-whitespace characters.
 * Every character of the input lands in exactly one run, and runs keep their source order.
 * @param {string} line - The line of text
 * @returns {Array<{text: string, isWhitespace: boolean}>} Runs in order, each flagged as whitespace or not
 */
function splitIntoWords(line) {
  // Each match is a maximal run of one character class, so neighbouring runs always alternate.
  const runs = line.match(/\s+|\S+/g);
  if (runs === null) return [];

  // A run is homogeneous, so its first character decides the flag.
  return runs.map((text) => ({ text, isWhitespace: /\s/.test(text[0]) }));
}
| 65 | + |
/**
 * Creates an empty page marked as text-derived, paired with its table-layout container.
 * @param {number} pageIndex - Zero-based index of the page being created.
 * @param {{width: number, height: number}} pageDims - Dimensions shared by every generated page.
 */
function createTextPage(pageIndex, pageDims) {
  const pageObj = new ocr.OcrPage(pageIndex, pageDims);
  pageObj.textSource = 'text';
  // NOTE(review): the previous code passed 0 here for every page. This assumes the
  // constructor argument is the page index, matching OcrPage -- confirm in layoutObjects.js.
  const dataTables = new LayoutDataTablePage(pageIndex);
  return { pageObj, dataTables };
}

/**
 * Wraps `parLines` in a new paragraph on `pageObj`. The paragraph keeps the array it is
 * given, so the caller must start a fresh array afterwards instead of truncating this one.
 * @param {Object} pageObj - Page that receives the paragraph.
 * @param {Array<Object>} parLines - Non-empty list of line objects already pushed to the page.
 * @param {number} left - Left edge of the paragraph box.
 * @param {number} right - Right edge of the paragraph box.
 */
function addParagraph(pageObj, parLines, left, right) {
  const parBbox = {
    left,
    top: parLines[0].bbox.top,
    right,
    bottom: parLines[parLines.length - 1].bbox.bottom,
  };
  const parObj = new ocr.OcrPar(pageObj, parBbox);
  parObj.lines = parLines;
  for (const ln of parLines) ln.par = parObj;
  pageObj.pars.push(parObj);
}

/**
 * Lays raw text out onto one or more pages in the internal OCR format.
 *
 * The text is split on LF / CRLF. Each non-blank input line becomes a paragraph: it is
 * tokenized with splitIntoWords, wrapped to the page width inside a fixed margin, and
 * continued on a new page once the next baseline would fall below the bottom margin.
 * Blank lines only advance the vertical position.
 *
 * @param {Object} params
 * @param {string} params.textStr - Raw text content
 * @param {?{width: number, height: number}} [params.pageDims] - Page dimensions (612 x 792 is used if not provided)
 * @returns {Promise<Object|Array<Object>>} When every line is blank: a single object
 *   `{ pageObj, charMetricsObj, dataTables, warn }` with `warn.char === 'char_error'`.
 *   Otherwise: an array holding one `{ pageObj, dataTables }` entry per generated page.
 */
export async function convertPageText({ textStr, pageDims = null }) {
  if (!fontOpentype) {
    fontOpentype = (await FontCont.getFont({ font: FONT_FAMILY })).opentype;
  }

  const lines = textStr.split(/\r?\n/);

  // Default to letter size (8.5 x 11 inches)
  const dims = pageDims || { width: 612, height: 792 };

  let pageIndex = 0;
  const firstPage = createTextPage(pageIndex, dims);
  const pagesOut = [firstPage];
  let { pageObj } = firstPage;

  if (lines.every((line) => line.trim() === '')) {
    return {
      pageObj,
      charMetricsObj: {},
      dataTables: firstPage.dataTables,
      warn: { char: 'char_error' },
    };
  }

  const margin = 20;
  const availableWidth = dims.width - margin * 2;
  const pageBottom = dims.height - margin;

  // Baseline of the next line to be placed.
  let currentY = margin + ASCENDER_HEIGHT;

  const startNewPage = () => {
    pageIndex++;
    const nextPage = createTextPage(pageIndex, dims);
    pagesOut.push(nextPage);
    pageObj = nextPage.pageObj;
    currentY = margin + ASCENDER_HEIGHT;
  };

  for (const lineText of lines) {
    if (lineText.trim().length === 0) {
      currentY += LINE_HEIGHT;
      if (currentY + DESCENDER_HEIGHT > pageBottom) startNewPage();
      continue;
    }

    const wordTokens = splitIntoWords(lineText);

    let parLines = [];
    let parRight = margin;

    let idx = 0;
    while (idx < wordTokens.length) {
      if (currentY + DESCENDER_HEIGHT > pageBottom) {
        // Close the part of the paragraph that fits on this page. A fresh array is started
        // afterwards so the paragraph just created keeps its own lines.
        if (parLines.length > 0) {
          addParagraph(pageObj, parLines, margin, parRight);
          parLines = [];
        }
        parRight = margin;
        startNewPage();
      }

      const lineTop = Math.round(currentY - ASCENDER_HEIGHT);
      const lineBottom = Math.round(currentY + DESCENDER_HEIGHT);

      const lineObj = new ocr.OcrLine(
        pageObj,
        {
          left: margin, top: lineTop, right: margin, bottom: lineBottom,
        },
        [0, DESCENDER_HEIGHT],
        ASCENDER_HEIGHT,
        ASCENDER_HEIGHT - DESCENDER_HEIGHT,
      );

      let currentX = margin;
      let widthSoFar = 0;
      let lastConsumed = idx;

      for (let j = idx; j < wordTokens.length; j++) {
        const tok = wordTokens[j];
        const tokWidth = getTextAdvance(tok.text, FONT_SIZE, fontOpentype);
        const hasWords = lineObj.words.length > 0;

        // Leading whitespace must fit on the line, and a word wraps only when the line
        // already holds one (so an over-long first word is still placed). Whitespace that
        // follows a word always stays on the line, even past the right margin.
        const mustFit = tok.isWhitespace ? !hasWords : hasWords;
        if (mustFit && widthSoFar + tokWidth > availableWidth) break;

        if (!tok.isWhitespace) {
          const wordBbox = {
            left: Math.round(currentX),
            top: lineTop,
            right: Math.round(currentX + tokWidth),
            bottom: lineBottom,
          };
          const wordId = `word_${pageIndex + 1}_${pageObj.lines.length + 1}_${lineObj.words.length + 1}`;
          const wordObj = new ocr.OcrWord(lineObj, tok.text, wordBbox, wordId);
          wordObj.conf = 100;
          wordObj.style.font = FONT_FAMILY;
          lineObj.words.push(wordObj);
        }

        currentX += tokWidth;
        widthSoFar += tokWidth;
        lastConsumed = j + 1;
      }

      // A line ends up without words only when its leading whitespace alone is wider than
      // the text area. As before, the rest of this input line is then left unplaced.
      if (lineObj.words.length === 0) break;

      lineObj.bbox = {
        left: lineObj.words[0].bbox.left,
        top: lineTop,
        right: Math.round(currentX),
        bottom: lineBottom,
      };

      pageObj.lines.push(lineObj);
      parLines.push(lineObj);
      parRight = Math.max(parRight, lineObj.bbox.right);

      currentY += LINE_HEIGHT;
      idx = lastConsumed;
    }

    if (parLines.length > 0) {
      addParagraph(pageObj, parLines, margin, parRight);
    }
  }

  // Mark every generated page as unrotated, not just the last one.
  for (const page of pagesOut) page.pageObj.angle = 0;

  return pagesOut;
}
0 commit comments