Skip to content

Commit 2e0c7f2

Browse files
committed
fix(parser): resolve stream /Length from object streams
Parsing a stream whose /Length is an indirect reference to an object stored inside an object stream used to fail with 'Could not resolve /Length reference'. The lengthResolver now handles such compressed objects by loading them from the containing object stream, reusing the existing objectStreamCache for efficiency.
1 parent 97d3c66 commit 2e0c7f2

File tree

3 files changed

+78
-20
lines changed

3 files changed

+78
-20
lines changed
675 Bytes
Binary file not shown.

src/parser/document-parser.test.ts

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import { Scanner } from "#src/io/scanner";
22
import { PdfDict } from "#src/objects/pdf-dict.ts";
33
import { PdfRef } from "#src/objects/pdf-ref";
4+
import { PdfStream } from "#src/objects/pdf-stream";
45
import { loadFixture } from "#src/test-utils";
56
import { describe, expect, it } from "vitest";
67

@@ -1033,5 +1034,37 @@ describe("DocumentParser", () => {
10331034
// PDFBox: assertEquals(1, doc.getNumberOfPages())
10341035
expect(doc.getPageCount()).toBe(1);
10351036
});
1037+
1038+
// Stream /Length as indirect reference to object in object stream.
1039+
// The stream dict has /Length 6 0 R, but object 6 is stored compressed
1040+
// in object stream 10, not as a standalone object. This tests that the
1041+
// lengthResolver can handle compressed objects.
1042+
// Note: This is valid PDF per spec - only object stream's own /Length
1043+
// cannot be in an object stream (section 7.5.7).
1044+
it("resolves stream /Length from object inside object stream", async () => {
1045+
const bytes = await loadFixture("xref", "length-in-object-stream.pdf");
1046+
const scanner = new Scanner(bytes);
1047+
const parser = new DocumentParser(scanner);
1048+
1049+
const doc = parser.parse();
1050+
1051+
expect(doc.version).toBe("1.7");
1052+
expect(doc.warnings).toHaveLength(0); // Should parse cleanly, no brute-force
1053+
1054+
const catalog = doc.getCatalog();
1055+
expect(catalog).not.toBeNull();
1056+
1057+
expect(doc.getPageCount()).toBe(1);
1058+
1059+
// Load the content stream (object 5) which has /Length 6 0 R,
1060+
// where object 6 is compressed in object stream 10
1061+
const contentsStream = doc.getObject(PdfRef.of(5, 0));
1062+
expect(contentsStream).not.toBeNull();
1063+
expect(contentsStream).toBeInstanceOf(PdfStream);
1064+
1065+
// Verify stream data was correctly read using the resolved length
1066+
const stream = contentsStream as PdfStream;
1067+
expect(stream.data.length).toBe(43); // "BT /F1 12 Tf 100 700 Td (Hello World) Tj ET"
1068+
});
10361069
});
10371070
});

src/parser/document-parser.ts

Lines changed: 45 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -466,40 +466,65 @@ export class DocumentParser {
466466

467467
// Create length resolver for stream objects with indirect /Length
468468
const lengthResolver: LengthResolver = (ref: PdfRef) => {
469-
// Synchronous lookup in cache only - can't do async here
470-
const key = `${ref.objectNumber} ${ref.generation}`;
471-
472-
const cached = cache.get(key);
469+
// Check object cache first
470+
const cacheKey = `${ref.objectNumber} ${ref.generation}`;
471+
const cached = cache.get(cacheKey);
473472

474-
if (cached && cached.type === "number") {
473+
if (cached instanceof PdfNumber) {
475474
return cached.value;
476475
}
477476

478-
// Try to parse synchronously if it's a simple uncompressed object
479477
const entry = xref.get(ref.objectNumber);
480478

481-
if (entry?.type === "uncompressed") {
482-
// Save scanner position - we must restore it after parsing the length
483-
// because we're in the middle of parsing a stream
484-
const savedPosition = this.scanner.position;
479+
if (!entry || entry.type === "free") {
480+
return null;
481+
}
482+
483+
// Save scanner position - we must restore it after parsing
484+
// because we're in the middle of parsing a stream
485+
const savedPosition = this.scanner.position;
485486

486-
try {
487+
try {
488+
let lengthObj: PdfObject | null = null;
489+
490+
if (entry.type === "uncompressed") {
487491
const parser = new IndirectObjectParser(this.scanner);
492+
lengthObj = parser.parseObjectAt(entry.offset).value;
493+
} else {
494+
// Compressed: load from object stream
495+
// Object streams themselves must be uncompressed (per PDF spec 7.5.7)
496+
const streamEntry = xref.get(entry.streamObjNum);
488497

489-
const result = parser.parseObjectAt(entry.offset);
498+
if (streamEntry?.type === "uncompressed") {
499+
// Use cached object stream parser if available
500+
let objStreamParser = objectStreamCache.get(entry.streamObjNum);
490501

491-
if (result.value.type === "number") {
492-
cache.set(key, result.value);
502+
if (!objStreamParser) {
503+
const parser = new IndirectObjectParser(this.scanner);
504+
const streamResult = parser.parseObjectAt(streamEntry.offset);
493505

494-
// Restore scanner position before returning
495-
this.scanner.moveTo(savedPosition);
506+
if (streamResult.value instanceof PdfStream) {
507+
objStreamParser = new ObjectStreamParser(streamResult.value);
496508

497-
return result.value.value;
509+
objectStreamCache.set(entry.streamObjNum, objStreamParser);
510+
}
511+
}
512+
513+
if (objStreamParser) {
514+
lengthObj = objStreamParser.getObject(entry.indexInStream);
515+
}
498516
}
499-
} catch {
500-
// Restore scanner position and fall through to return null
501-
this.scanner.moveTo(savedPosition);
502517
}
518+
519+
this.scanner.moveTo(savedPosition);
520+
521+
if (lengthObj instanceof PdfNumber) {
522+
cache.set(cacheKey, lengthObj);
523+
524+
return lengthObj.value;
525+
}
526+
} catch {
527+
this.scanner.moveTo(savedPosition);
503528
}
504529

505530
return null;

0 commit comments

Comments
 (0)