Skip to content

Commit 61c9bca

Browse files
k00niCopilot
andauthored
Fix memory exhaustion caused by circular references in PDF xref chains (#787)
* Fix memory exhaustion caused by circular references in PDF xref chains (#2) * Initial plan * Initial setup: Add crash file and reproducer script Co-authored-by: k00ni <[email protected]> * Fix infinite loop caused by circular xref references Co-authored-by: k00ni <[email protected]> * Add test case for circular xref reference fix Co-authored-by: k00ni <[email protected]> * Address review feedback: Add array<int> type hints and remove test - Added array<int> type hints to $visitedOffsets parameters in PHPDoc comments - Removed testCircularXrefReference test as requested by reviewer - Removed reproducer.php file as requested - Crash file memory-exhaustion-crash-f5d89649.txt remains permanently in samples/bugs/ Co-authored-by: k00ni <[email protected]> * Remove crash file and add comprehensive tests for circular reference fix - Removed memory-exhaustion-crash-f5d89649.txt as requested - Added helper methods to expose protected functions for testing - Added testGetXrefDataPreventsCircularReferences: Tests that visitedOffsets prevents infinite loops - Added testDecodeXrefPassesVisitedOffsets: Tests that decodeXref correctly passes visitedOffsets - Added testGetXrefDataTracksVisitedOffsets: Tests that offsets are tracked properly - All tests use minimal PDF structures and don't rely on external crash file Co-authored-by: k00ni <[email protected]> --------- Co-authored-by: copilot-swe-agent[bot] <[email protected]> Co-authored-by: k00ni <[email protected]> * Removed PHP-CS-Fixer issues --------- Co-authored-by: Copilot <[email protected]> Co-authored-by: k00ni <[email protected]>
1 parent 6b52c6b commit 61c9bca

File tree

2 files changed

+128
-15
lines changed

2 files changed

+128
-15
lines changed

src/Smalot/PdfParser/RawData/RawDataParser.php

Lines changed: 26 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -152,15 +152,16 @@ protected function decodeStream(string $pdfData, array $xref, array $sdic, strin
152152
/**
153153
* Decode the Cross-Reference section
154154
*
155-
* @param string $pdfData PDF data
156-
* @param int $startxref Offset at which the xref section starts (position of the 'xref' keyword)
157-
* @param array $xref Previous xref array (if any)
155+
* @param string $pdfData PDF data
156+
* @param int $startxref Offset at which the xref section starts (position of the 'xref' keyword)
157+
* @param array $xref Previous xref array (if any)
158+
* @param array<int> $visitedOffsets Array of visited offsets to prevent infinite loops
158159
*
159160
* @return array containing xref and trailer data
160161
*
161162
* @throws \Exception
162163
*/
163-
protected function decodeXref(string $pdfData, int $startxref, array $xref = []): array
164+
protected function decodeXref(string $pdfData, int $startxref, array $xref = [], array $visitedOffsets = []): array
164165
{
165166
$startxref += 4; // 4 is the length of the word 'xref'
166167
// skip initial white space chars
@@ -219,7 +220,7 @@ protected function decodeXref(string $pdfData, int $startxref, array $xref = [])
219220
$offset = (int) $matches[1];
220221
if (0 != $offset) {
221222
// get previous xref
222-
$xref = $this->getXrefData($pdfData, $offset, $xref);
223+
$xref = $this->getXrefData($pdfData, $offset, $xref, $visitedOffsets);
223224
}
224225
}
225226
} else {
@@ -232,15 +233,16 @@ protected function decodeXref(string $pdfData, int $startxref, array $xref = [])
232233
/**
233234
* Decode the Cross-Reference Stream section
234235
*
235-
* @param string $pdfData PDF data
236-
* @param int $startxref Offset at which the xref section starts
237-
* @param array $xref Previous xref array (if any)
236+
* @param string $pdfData PDF data
237+
* @param int $startxref Offset at which the xref section starts
238+
* @param array $xref Previous xref array (if any)
239+
* @param array<int> $visitedOffsets Array of visited offsets to prevent infinite loops
238240
*
239241
* @return array containing xref and trailer data
240242
*
241243
* @throws \Exception if unknown PNG predictor detected
242244
*/
243-
protected function decodeXrefStream(string $pdfData, int $startxref, array $xref = []): array
245+
protected function decodeXrefStream(string $pdfData, int $startxref, array $xref = [], array $visitedOffsets = []): array
244246
{
245247
// try to read Cross-Reference Stream
246248
$xrefobj = $this->getRawObject($pdfData, $startxref);
@@ -502,7 +504,7 @@ protected function decodeXrefStream(string $pdfData, int $startxref, array $xref
502504
} // end decoding data
503505
if (isset($prevxref)) {
504506
// get previous xref
505-
$xref = $this->getXrefData($pdfData, $prevxref, $xref);
507+
$xref = $this->getXrefData($pdfData, $prevxref, $xref, $visitedOffsets);
506508
}
507509

508510
return $xref;
@@ -862,16 +864,25 @@ private function getHeaderValue(?array $headerDic, string $key, string $type, $d
862864
/**
863865
* Get Cross-Reference (xref) table and trailer data from PDF document data.
864866
*
865-
* @param int $offset xref offset (if known)
866-
* @param array $xref previous xref array (if any)
867+
* @param int $offset xref offset (if known)
868+
* @param array $xref previous xref array (if any)
869+
* @param array<int> $visitedOffsets array of visited offsets to prevent infinite loops
867870
*
868871
* @return array containing xref and trailer data
869872
*
870873
* @throws \Exception if it was unable to find startxref
871874
* @throws \Exception if it was unable to find xref
872875
*/
873-
protected function getXrefData(string $pdfData, int $offset = 0, array $xref = []): array
876+
protected function getXrefData(string $pdfData, int $offset = 0, array $xref = [], array $visitedOffsets = []): array
874877
{
878+
// Check for circular references to prevent infinite loops
879+
if (\in_array($offset, $visitedOffsets, true)) {
880+
// We've already processed this offset, skip to avoid infinite loop
881+
return $xref;
882+
}
883+
884+
// Track this offset as visited
885+
$visitedOffsets[] = $offset;
875886
// If the $offset is currently pointed at whitespace, bump it
876887
// forward until it isn't; affects loosely targetted offsets
877888
// for the 'xref' keyword
@@ -914,7 +925,7 @@ protected function getXrefData(string $pdfData, int $offset = 0, array $xref = [
914925
// check xref position
915926
if (strpos($pdfData, 'xref', $startxref) == $startxref) {
916927
// Cross-Reference
917-
$xref = $this->decodeXref($pdfData, $startxref, $xref);
928+
$xref = $this->decodeXref($pdfData, $startxref, $xref, $visitedOffsets);
918929
} else {
919930
// Check if the $pdfData might have the wrong line-endings
920931
$pdfDataUnix = str_replace("\r\n", "\n", $pdfData);
@@ -923,7 +934,7 @@ protected function getXrefData(string $pdfData, int $offset = 0, array $xref = [
923934
$xref = ['Unix' => true];
924935
} else {
925936
// Cross-Reference Stream
926-
$xref = $this->decodeXrefStream($pdfData, $startxref, $xref);
937+
$xref = $this->decodeXrefStream($pdfData, $startxref, $xref, $visitedOffsets);
927938
}
928939
}
929940
if (empty($xref)) {

tests/PHPUnit/Integration/RawData/RawDataParserTest.php

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,30 @@ public function exposeGetRawObject($pdfData, $offset = 0)
4848
{
4949
return $this->getRawObject($pdfData, $offset);
5050
}
51+
52+
/**
53+
* Expose protected function "getXrefData".
54+
*/
55+
public function exposeGetXrefData(string $pdfData, int $offset = 0, array $xref = [], array $visitedOffsets = []): array
56+
{
57+
return $this->getXrefData($pdfData, $offset, $xref, $visitedOffsets);
58+
}
59+
60+
/**
61+
* Expose protected function "decodeXref".
62+
*/
63+
public function exposeDecodeXref(string $pdfData, int $startxref, array $xref = [], array $visitedOffsets = []): array
64+
{
65+
return $this->decodeXref($pdfData, $startxref, $xref, $visitedOffsets);
66+
}
67+
68+
/**
69+
* Expose protected function "decodeXrefStream".
70+
*/
71+
public function exposeDecodeXrefStream(string $pdfData, int $startxref, array $xref = [], array $visitedOffsets = []): array
72+
{
73+
return $this->decodeXrefStream($pdfData, $startxref, $xref, $visitedOffsets);
74+
}
5175
}
5276

5377
class RawDataParserTest extends TestCase
@@ -213,4 +237,82 @@ public function testDecodeXrefIssue727(): void
213237

214238
self::assertStringContainsString('', $text);
215239
}
240+
241+
/**
242+
* Test that getXrefData prevents circular references
243+
*
244+
* When a PDF has circular references in xref chain (e.g., Prev pointing to already visited offset),
245+
* the parser should detect this and stop recursion to prevent infinite loops.
246+
*/
247+
public function testGetXrefDataPreventsCircularReferences(): void
248+
{
249+
// Create a minimal PDF structure with xref that would create a circular reference
250+
$pdfData = "%PDF-1.5\n";
251+
$pdfData .= "xref\n";
252+
$pdfData .= "0 1\n";
253+
$pdfData .= "0000000000 65535 f \n";
254+
$pdfData .= "trailer\n";
255+
$pdfData .= "<</Size 1/Prev 7>>\n"; // Prev points back to offset 7 (the xref keyword)
256+
$pdfData .= "startxref\n";
257+
$pdfData .= "7\n";
258+
$pdfData .= "%%EOF\n";
259+
260+
// Test with visitedOffsets containing the offset we're trying to visit
261+
$result = $this->fixture->exposeGetXrefData($pdfData, 7, [], [7]);
262+
263+
// Should return empty xref array without recursing
264+
$this->assertIsArray($result);
265+
$this->assertEmpty($result);
266+
}
267+
268+
/**
269+
* Test that decodeXref passes visitedOffsets correctly when handling Prev
270+
*
271+
* This ensures that circular reference detection works when decodeXref
272+
* calls getXrefData for a Prev pointer.
273+
*/
274+
public function testDecodeXrefPassesVisitedOffsets(): void
275+
{
276+
// Create a minimal xref structure with Prev
277+
$pdfData = "xref\n";
278+
$pdfData .= "0 1\n";
279+
$pdfData .= "0000000000 65535 f \n";
280+
$pdfData .= "trailer\n";
281+
$pdfData .= "<</Size 1/Prev 100>>\n";
282+
283+
// Call decodeXref with visitedOffsets that includes the Prev offset
284+
// This should not cause infinite recursion
285+
$result = $this->fixture->exposeDecodeXref($pdfData, 0, [], [100]);
286+
287+
// Should complete without error and return an array
288+
$this->assertIsArray($result);
289+
$this->assertArrayHasKey('trailer', $result);
290+
}
291+
292+
/**
293+
* Test that getXrefData tracks visited offsets correctly
294+
*
295+
* Ensures that offsets are added to visitedOffsets array to prevent
296+
* circular references in subsequent calls.
297+
*/
298+
public function testGetXrefDataTracksVisitedOffsets(): void
299+
{
300+
// Test that calling with an already-visited offset returns immediately
301+
$pdfData = "%PDF-1.5\n";
302+
$pdfData .= "xref\n";
303+
$pdfData .= "0 1\n";
304+
$pdfData .= "0000000000 65535 f \n";
305+
$pdfData .= "trailer\n";
306+
$pdfData .= "<</Size 1>>\n";
307+
$pdfData .= "startxref\n";
308+
$pdfData .= "7\n";
309+
$pdfData .= "%%EOF\n";
310+
311+
// Call with offset 50 already in visitedOffsets - should return immediately
312+
$result = $this->fixture->exposeGetXrefData($pdfData, 50, [], [50]);
313+
314+
// Should return empty array without processing
315+
$this->assertIsArray($result);
316+
$this->assertEmpty($result);
317+
}
216318
}

0 commit comments

Comments
 (0)