Skip to content

Commit 5569bf7

Browse files
authored
fix: #408: fix text block coordinates, add tests (#409)
1 parent 1faf820 commit 5569bf7

File tree

3 files changed

+46
-6
lines changed

3 files changed

+46
-6
lines changed

base/display/canvas.js

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1269,10 +1269,6 @@ var CanvasGraphics = (function CanvasGraphicsClosure() {
12691269
// info(nodeUtil.inspect(glyphs));
12701270
// }
12711271

1272-
1273-
ctx.restore();
1274-
}
1275-
12761272
// Text rendering for regular fonts (Type3 fonts are handled in their own context above)
12771273
if (str && !font.disableFontFace && !font.coded) {
12781274
var curFontSize = fontSize * scale * textHScale + 3;
@@ -1293,6 +1289,9 @@ var CanvasGraphics = (function CanvasGraphicsClosure() {
12931289
}
12941290
}
12951291

1292+
ctx.restore();
1293+
}
1294+
12961295
return canvasWidth;
12971296
},
12981297
showSpacedText: function CanvasGraphics_showSpacedText(arr) {

package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "pdf2json",
3-
"version": "3.2.2",
3+
"version": "3.2.3",
44
"description": "PDF file parser that converts PDF binaries to JSON and text, powered by porting a fork of PDF.JS to Node.js",
55
"keywords": [
66
"pdf",

test/_test_.cjs

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,46 @@ function checkResult_pageContent(Pages, fileName) {
163163
});
164164
}
165165

166+
/**
 * Regression check for issue #408: verifies that the text blocks of every
 * parsed page carry usable coordinates.
 *
 * For each page that has text it asserts that:
 *   1. when the page has more than 5 text elements, they do not all share a
 *      single (x, y) position, and
 *   2. every text element's `x` and `y` is a finite number (rejects
 *      non-numbers, NaN and +/-Infinity).
 *
 * @param {Array<{Texts?: Array<{x: number, y: number}>}>} Pages - parsed pages;
 *   a page without a `Texts` array is treated as having no text.
 * @param {string} fileName - PDF name, used only in assertion messages.
 * @throws {AssertionError} when any of the checks above fails.
 */
function checkResult_textCoordinates(Pages, fileName) {
  Pages.forEach((page, pageIndex) => {
    const texts = page.Texts || [];
    if (texts.length === 0) return; // Skip pages with no text

    // Distinct "x,y" keys present on this page.
    const uniqueCoords = new Set(texts.map((t) => `${t.x},${t.y}`));

    // Only pages with more than 5 text elements are checked for uniqueness:
    // a handful of elements could legitimately overlap, but a whole page
    // collapsing onto one point is the #408 regression (every element was
    // reported at the same coordinates, e.g. (-0.25, 48.75)).
    if (texts.length > 5) {
      assert(
        uniqueCoords.size > 1,
        `${fileName} page ${pageIndex} : all ${texts.length}` +
          " text elements have identical coordinates. " +
          `This is a regression of issue #408. Found only ${uniqueCoords.size}` +
          ` unique coordinate(s): ${Array.from(uniqueCoords).slice(0, 3).join(", ")}`
      );
    }

    // Every coordinate must be a finite number. Number.isFinite does not
    // coerce, so strings/undefined/null fail, as do NaN and +/-Infinity.
    texts.forEach((text, textIndex) => {
      assert(
        Number.isFinite(text.x),
        `${fileName} page ${pageIndex} text ${textIndex}` +
          ` : has invalid x coordinate: ${text.x}`
      );
      assert(
        Number.isFinite(text.y),
        `${fileName} page ${pageIndex} text ${textIndex}` +
          ` : has invalid y coordinate: ${text.y}`
      );
    });
  });
}
205+
166206
async function parseAndVerifyOnePDF(fileName, fromBuffer, pageCount) {
167207
let timeoutId;
168208
let pdfParser = null;
@@ -203,12 +243,13 @@ async function parseAndVerifyOnePDF(fileName, fromBuffer, pageCount) {
203243
});
204244

205245
const evtData = await pdfParserDataReady;
206-
246+
207247
expect(evtData).toBeDefined();
208248
checkResult_parseStatus(null, evtData, fileName);
209249
checkResult_mainFields(evtData, fileName);
210250
checkResult_pageCount(evtData.Pages, pageCount, fileName);
211251
checkResult_pageContent(evtData.Pages, fileName);
252+
checkResult_textCoordinates(evtData.Pages, fileName);
212253
} catch (error) {
213254
console.error(`Error parsing PDF ${fileName}: `, error);
214255
throw error; // Re-throw to ensure Jest knows the test failed

0 commit comments

Comments
 (0)