Skip to content

Commit b193d9f

Browse files
authored
fix: #355, #361, #319: calculate text block gap and spacewidth from fontMatrix to preserve spaces in both content.txt and json output (#411)
* Uses actual glyph-based width calculation instead of estimates * Correctly handles coordinate systems (scaled positions, unscaled widths) * Applies textHScale for compressed/expanded text * Dynamic Y-tolerance based on font size (fontSize × 0.15)
1 parent 7b05aa9 commit b193d9f

File tree

2 files changed

+85
-7
lines changed

2 files changed

+85
-7
lines changed

base/core/evaluator.js

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -987,6 +987,38 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
987987
bidiText.x += renderParams.vScale / 2;
988988
bidiText.y -= renderParams.vScale;
989989
}
990+
991+
// MQZ: Add font metrics for accurate spacing calculation
992+
bidiText.fontName = font.loadedName || font.name;
993+
bidiText.fontSize = textState.fontSize;
994+
995+
// Get fontMatrix once (used for both spaceWidth and textWidth calculations)
996+
var fontMatrix = font.fontMatrix || FONT_IDENTITY_MATRIX;
997+
var fontDirection = textState.fontDirection || 1;
998+
999+
// Scale spaceWidth to PDF coordinates using fontMatrix (NO textHScale)
1000+
// Must match canvas.js canvasWidth calculation (line 1258 - no textHScale)
1001+
bidiText.spaceWidth = font.spaceWidth * textState.fontSize * fontMatrix[0];
1002+
bidiText.charSpace = charSpace;
1003+
bidiText.wordSpace = wordSpace;
1004+
bidiText.textHScale = textState.textHScale;
1005+
1006+
// Calculate actual text width using font glyph widths
1007+
// Match canvas.js calculation exactly (lines 1210-1211, 1258, canvasWidth does NOT include textHScale)
1008+
var textWidth = 0;
1009+
var glyphs = font.charsToGlyphs(chunk);
1010+
for (var i = 0, ii = glyphs.length; i < ii; i++) {
1011+
var glyph = glyphs[i];
1012+
// Use glyph.width if available, otherwise font.defaultWidth (like canvas.js does)
1013+
var glyphWidth = (glyph && glyph.width) || font.defaultWidth || 0;
1014+
// Match canvas.js line 1210-1211: width * fontSize * fontMatrix[0] + charSpacing * fontDirection
1015+
var charWidth = glyphWidth * textState.fontSize * fontMatrix[0] + charSpace * fontDirection;
1016+
textWidth += charWidth;
1017+
}
1018+
// DO NOT apply textHScale - canvasWidth is in unscaled coordinates
1019+
// (bidiText.x is scaled, but bidiText.width matches JSON w property which is unscaled)
1020+
bidiText.width = textWidth;
1021+
9901022
bidiTexts.push(bidiText);
9911023

9921024
chunk = '';

lib/pdf.js

Lines changed: 53 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -327,18 +327,64 @@ export default class PDFJSClass extends EventEmitter {
327327

328328
this.rawTextContents.forEach((textContent, index) => {
329329
let prevText = null;
330+
330331
textContent.bidiTexts.forEach((textObj, idx) => {
331-
if (prevText) {
332-
if (Math.abs(textObj.y - prevText.y) <= 9) {
333-
prevText.str += textObj.str;
334-
} else {
335-
retVal += `${prevText.str}\r\n`;
336-
prevText = textObj;
332+
// Check if on same line
333+
// Use a tolerance relative to font size for better accuracy
334+
// Typical line spacing is 120% of font size, so 10-15% tolerance is reasonable
335+
const tolerance = prevText ? (prevText.fontSize || 12) * 0.15 : 2;
336+
const sameLine = prevText && Math.abs(textObj.y - prevText.y) <= tolerance;
337+
338+
if (sameLine) {
339+
// spaceWidth is in unscaled coordinates (no textHScale, matching JSON w property)
340+
const { spaceWidth, startX, width, textHScale } = prevText;
341+
342+
// Use actual calculated text width (from glyph widths)
343+
// width is in unscaled coordinates, but startX is in scaled coordinates
344+
// So we must apply textHScale to width before adding to startX
345+
// This matches canvas.js: current.x += x * textHScale (line 1267)
346+
const prevTextEndX = startX + (width * textHScale);
347+
348+
// Calculate gap between end of previous text and start of current text
349+
// gap is in SCALED coordinates (both textObj.x and prevTextEndX are scaled)
350+
const gap = textObj.x - prevTextEndX;
351+
352+
// Scale spaceWidth to match gap's coordinate system
353+
const scaledSpaceWidth = spaceWidth * textHScale;
354+
355+
// Add spaces if gap is positive and significant (> 30% of scaled space width)
356+
if (gap > scaledSpaceWidth * 0.3) {
357+
const numSpaces = Math.round(gap / scaledSpaceWidth);
358+
prevText.str += ' '.repeat(Math.max(1, numSpaces));
337359
}
360+
361+
// Append current text
362+
prevText.str += textObj.str;
363+
364+
// Update prevText to track current text for next iteration
365+
prevText.startX = textObj.x;
366+
prevText.width = textObj.width;
367+
prevText.spaceWidth = textObj.spaceWidth;
368+
prevText.textHScale = textObj.textHScale;
338369
} else {
339-
prevText = textObj;
370+
// Different line or first text
371+
if (prevText) {
372+
retVal += `${prevText.str}\r\n`;
373+
}
374+
375+
// Initialize new text object with font metrics
376+
prevText = {
377+
str: textObj.str,
378+
y: textObj.y,
379+
startX: textObj.x,
380+
width: textObj.width,
381+
spaceWidth: textObj.spaceWidth,
382+
textHScale: textObj.textHScale,
383+
fontSize: textObj.fontSize // Keep for tolerance calculation
384+
};
340385
}
341386
});
387+
342388
if (prevText) {
343389
retVal += prevText.str;
344390
}

0 commit comments

Comments
 (0)