Skip to content

Commit 5c2d6b3

Browse files
authored
fix: handle off-by-one xref subsection start in malformed PDFs (#38)
Some PDFs have a malformed xref table in which the subsection header reports the starting object number as 1 instead of 0, even though the entries clearly start at object 0 (the free-list head, with generation 65535). This shifted every object resolution by one, causing a wrong page count and infinite loops in extractPages.

- Detect and correct the off-by-one in xref subsection parsing
- Add cycle detection to ObjectCopier.getInheritedAttribute as defense-in-depth
- Add a unit test for the xref correction and an integration test with fixtures
1 parent 035b482 commit 5c2d6b3

File tree

5 files changed

+92
-3
lines changed

5 files changed

+92
-3
lines changed
1.08 KB
Binary file not shown.

src/api/pdf.test.ts

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -702,6 +702,23 @@ describe("PDF", () => {
702702

703703
expect(extracted.getPageCount()).toBe(0);
704704
});
705+
706+
it("handles PDF with off-by-one xref subsection start", async () => {
707+
// Some malformed PDFs have the xref subsection header saying "1 N"
708+
// instead of "0 N", shifting all object numbers by one. This caused
709+
// wrong page count and infinite loop in extractPages due to objects
710+
// resolving to wrong offsets (e.g., Pages root resolving as a Page
711+
// with a self-referencing Parent).
712+
const bytes = await loadFixture("malformed", "xref-off-by-one.pdf");
713+
const pdf = await PDF.load(bytes);
714+
715+
expect(pdf.getPageCount()).toBe(3);
716+
expect(pdf.getPages()[2].width).toBe(300);
717+
expect(pdf.getPages()[2].height).toBe(400);
718+
719+
const extracted = await pdf.extractPages([0, 1, 2]);
720+
expect(extracted.getPageCount()).toBe(3);
721+
});
705722
});
706723

707724
describe("embedPage and drawPage", () => {

src/document/object-copier.ts

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -469,6 +469,7 @@ export class ObjectCopier {
469469
*/
470470
private getInheritedAttribute(page: PdfDict, key: string): PdfObject | null {
471471
let current: PdfDict | null = page;
472+
const visited = new Set<string>();
472473

473474
while (current) {
474475
const value = current.get(key);
@@ -483,6 +484,14 @@ export class ObjectCopier {
483484
break;
484485
}
485486

487+
const refKey = `${parentRef.objectNumber}:${parentRef.generation}`;
488+
489+
if (visited.has(refKey)) {
490+
break;
491+
}
492+
493+
visited.add(refKey);
494+
486495
const parent = this.source.getObject(parentRef);
487496
current = parent instanceof PdfDict ? parent : null;
488497
}

src/parser/xref-parser.test.ts

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,47 @@ trailer
180180

181181
expect(result.entries.size).toBe(2);
182182
});
183+
184+
it("corrects off-by-one subsection start when free list head is at wrong position", () => {
185+
// Some malformed PDFs report firstObjNum=1 when entries actually start at 0.
186+
// The free list head (gen 65535, type f) is always object 0.
187+
const p = parser(`xref
188+
1 4
189+
0000000000 65535 f
190+
0000000015 00000 n
191+
0000000074 00000 n
192+
0000000120 00000 n
193+
trailer
194+
<< /Size 4 /Root 1 0 R >>
195+
`);
196+
const result = p.parseTable();
197+
198+
expect(result.entries.size).toBe(4);
199+
200+
// Entry should be corrected to object 0 (not 1)
201+
const entry0 = result.entries.get(0);
202+
expect(entry0).toBeDefined();
203+
expect(entry0!.type).toBe("free");
204+
if (entry0!.type === "free") {
205+
expect(entry0!.generation).toBe(65535);
206+
}
207+
208+
// Object 1 should be at offset 15
209+
const entry1 = result.entries.get(1);
210+
expect(entry1).toBeDefined();
211+
expect(entry1!.type).toBe("uncompressed");
212+
if (entry1!.type === "uncompressed") {
213+
expect(entry1!.offset).toBe(15);
214+
}
215+
216+
// Object 3 should be at offset 120
217+
const entry3 = result.entries.get(3);
218+
expect(entry3).toBeDefined();
219+
expect(entry3!.type).toBe("uncompressed");
220+
if (entry3!.type === "uncompressed") {
221+
expect(entry3!.offset).toBe(120);
222+
}
223+
});
183224
});
184225

185226
describe("trailer parsing", () => {

src/parser/xref-parser.ts

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -353,10 +353,32 @@ export class XRefParser {
353353
this.skipWhitespaceFromCurrent();
354354

355355
// Read entries
356+
const parsedEntries: XRefEntry[] = [];
357+
356358
for (let i = 0; i < count; i++) {
357-
const objNum = firstObjNum + i;
358-
const entry = this.parseEntry();
359-
entries.set(objNum, entry);
359+
parsedEntries.push(this.parseEntry());
360+
}
361+
362+
// Detect off-by-one in subsection start: some malformed PDFs report
363+
// firstObjNum=1 when the entries actually start at object 0.
364+
// The free list head (generation 65535, type free) is always object 0,
365+
// so if we see it at position 1, correct it. (Same fix as pdf.js #3248/#7229)
366+
let correctedFirstObjNum = firstObjNum;
367+
368+
if (
369+
firstObjNum === 1 &&
370+
parsedEntries.length > 0 &&
371+
parsedEntries[0].type === "free" &&
372+
parsedEntries[0].generation === 65535
373+
) {
374+
correctedFirstObjNum = 0;
375+
console.warn(
376+
"XRef: corrected subsection start from 1 to 0 (free list head at wrong position)",
377+
);
378+
}
379+
380+
for (let i = 0; i < parsedEntries.length; i++) {
381+
entries.set(correctedFirstObjNum + i, parsedEntries[i]);
360382
}
361383
}
362384

0 commit comments

Comments
 (0)