Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 26 additions & 15 deletions src/Smalot/PdfParser/RawData/RawDataParser.php
Original file line number Diff line number Diff line change
Expand Up @@ -152,15 +152,16 @@ protected function decodeStream(string $pdfData, array $xref, array $sdic, strin
/**
* Decode the Cross-Reference section
*
* @param string $pdfData PDF data
* @param int $startxref Offset at which the xref section starts (position of the 'xref' keyword)
* @param array $xref Previous xref array (if any)
* @param string $pdfData PDF data
* @param int $startxref Offset at which the xref section starts (position of the 'xref' keyword)
* @param array $xref Previous xref array (if any)
* @param array<int> $visitedOffsets Array of visited offsets to prevent infinite loops
*
* @return array containing xref and trailer data
*
* @throws \Exception
*/
protected function decodeXref(string $pdfData, int $startxref, array $xref = []): array
protected function decodeXref(string $pdfData, int $startxref, array $xref = [], array $visitedOffsets = []): array
{
$startxref += 4; // 4 is the length of the word 'xref'
// skip initial white space chars
Expand Down Expand Up @@ -219,7 +220,7 @@ protected function decodeXref(string $pdfData, int $startxref, array $xref = [])
$offset = (int) $matches[1];
if (0 != $offset) {
// get previous xref
$xref = $this->getXrefData($pdfData, $offset, $xref);
$xref = $this->getXrefData($pdfData, $offset, $xref, $visitedOffsets);
}
}
} else {
Expand All @@ -232,15 +233,16 @@ protected function decodeXref(string $pdfData, int $startxref, array $xref = [])
/**
* Decode the Cross-Reference Stream section
*
* @param string $pdfData PDF data
* @param int $startxref Offset at which the xref section starts
* @param array $xref Previous xref array (if any)
* @param string $pdfData PDF data
* @param int $startxref Offset at which the xref section starts
* @param array $xref Previous xref array (if any)
* @param array<int> $visitedOffsets Array of visited offsets to prevent infinite loops
*
* @return array containing xref and trailer data
*
* @throws \Exception if unknown PNG predictor detected
*/
protected function decodeXrefStream(string $pdfData, int $startxref, array $xref = []): array
protected function decodeXrefStream(string $pdfData, int $startxref, array $xref = [], array $visitedOffsets = []): array
{
// try to read Cross-Reference Stream
$xrefobj = $this->getRawObject($pdfData, $startxref);
Expand Down Expand Up @@ -502,7 +504,7 @@ protected function decodeXrefStream(string $pdfData, int $startxref, array $xref
} // end decoding data
if (isset($prevxref)) {
// get previous xref
$xref = $this->getXrefData($pdfData, $prevxref, $xref);
$xref = $this->getXrefData($pdfData, $prevxref, $xref, $visitedOffsets);
}

return $xref;
Expand Down Expand Up @@ -862,16 +864,25 @@ private function getHeaderValue(?array $headerDic, string $key, string $type, $d
/**
* Get Cross-Reference (xref) table and trailer data from PDF document data.
*
* @param int $offset xref offset (if known)
* @param array $xref previous xref array (if any)
* @param int $offset xref offset (if known)
* @param array $xref previous xref array (if any)
* @param array<int> $visitedOffsets array of visited offsets to prevent infinite loops
*
* @return array containing xref and trailer data
*
* @throws \Exception if it was unable to find startxref
* @throws \Exception if it was unable to find xref
*/
protected function getXrefData(string $pdfData, int $offset = 0, array $xref = []): array
protected function getXrefData(string $pdfData, int $offset = 0, array $xref = [], array $visitedOffsets = []): array
{
// Check for circular references to prevent infinite loops
if (\in_array($offset, $visitedOffsets, true)) {
// We've already processed this offset, skip to avoid infinite loop
return $xref;
}

// Track this offset as visited
$visitedOffsets[] = $offset;
// If the $offset is currently pointed at whitespace, bump it
// forward until it isn't; affects loosely targeted offsets
// for the 'xref' keyword
Expand Down Expand Up @@ -914,7 +925,7 @@ protected function getXrefData(string $pdfData, int $offset = 0, array $xref = [
// check xref position
if (strpos($pdfData, 'xref', $startxref) == $startxref) {
// Cross-Reference
$xref = $this->decodeXref($pdfData, $startxref, $xref);
$xref = $this->decodeXref($pdfData, $startxref, $xref, $visitedOffsets);
} else {
// Check if the $pdfData might have the wrong line-endings
$pdfDataUnix = str_replace("\r\n", "\n", $pdfData);
Expand All @@ -923,7 +934,7 @@ protected function getXrefData(string $pdfData, int $offset = 0, array $xref = [
$xref = ['Unix' => true];
} else {
// Cross-Reference Stream
$xref = $this->decodeXrefStream($pdfData, $startxref, $xref);
$xref = $this->decodeXrefStream($pdfData, $startxref, $xref, $visitedOffsets);
}
}
if (empty($xref)) {
Expand Down
102 changes: 102 additions & 0 deletions tests/PHPUnit/Integration/RawData/RawDataParserTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,30 @@ public function exposeGetRawObject($pdfData, $offset = 0)
{
return $this->getRawObject($pdfData, $offset);
}

/**
* Expose protected function "getXrefData".
*
* Test-only pass-through: forwards all four arguments unchanged to the protected
* getXrefData() and returns its result, so tests can call it directly.
*
* @param string $pdfData PDF data
* @param int $offset xref offset (if known)
* @param array $xref previous xref array (if any)
* @param array<int> $visitedOffsets offsets already visited; an offset listed here makes
* getXrefData() return $xref immediately
*
* @return array whatever getXrefData() returns (xref and trailer data)
*
* @throws \Exception propagated from getXrefData()
*/
public function exposeGetXrefData(string $pdfData, int $offset = 0, array $xref = [], array $visitedOffsets = []): array
{
return $this->getXrefData($pdfData, $offset, $xref, $visitedOffsets);
}

/**
* Expose protected function "decodeXref".
*
* Test-only pass-through: forwards all four arguments unchanged to the protected
* decodeXref() and returns its result, so tests can call it directly.
*
* @param string $pdfData PDF data
* @param int $startxref offset at which the xref section starts (position of the 'xref' keyword)
* @param array $xref previous xref array (if any)
* @param array<int> $visitedOffsets offsets already visited; handed on when a Prev pointer is followed
*
* @return array whatever decodeXref() returns (xref and trailer data)
*
* @throws \Exception propagated from decodeXref()
*/
public function exposeDecodeXref(string $pdfData, int $startxref, array $xref = [], array $visitedOffsets = []): array
{
return $this->decodeXref($pdfData, $startxref, $xref, $visitedOffsets);
}

/**
* Expose protected function "decodeXrefStream".
*
* Test-only pass-through: forwards all four arguments unchanged to the protected
* decodeXrefStream() and returns its result, so tests can call it directly.
*
* @param string $pdfData PDF data
* @param int $startxref offset at which the xref section starts
* @param array $xref previous xref array (if any)
* @param array<int> $visitedOffsets offsets already visited; handed on when a previous xref is followed
*
* @return array whatever decodeXrefStream() returns (xref and trailer data)
*
* @throws \Exception propagated from decodeXrefStream()
*/
public function exposeDecodeXrefStream(string $pdfData, int $startxref, array $xref = [], array $visitedOffsets = []): array
{
return $this->decodeXrefStream($pdfData, $startxref, $xref, $visitedOffsets);
}
}

class RawDataParserTest extends TestCase
Expand Down Expand Up @@ -213,4 +237,82 @@ public function testDecodeXrefIssue727(): void

self::assertStringContainsString('', $text);
}

/**
 * Test that getXrefData refuses to re-enter an offset it has already visited.
 *
 * When a PDF has a circular xref chain (e.g. a trailer whose Prev points at an offset that
 * was already processed), the parser must stop instead of recursing forever. This test pins
 * the guard itself: the requested offset is supplied as already visited, so the call has to
 * return the incoming (empty) xref array without parsing anything.
 */
public function testGetXrefDataPreventsCircularReferences(): void
{
    // The 'xref' keyword starts directly after the header line. "%PDF-1.5\n" is 9 bytes
    // long, so the offset is derived instead of hard-coded to keep the fixture honest.
    $header = "%PDF-1.5\n";
    $xrefOffset = \strlen($header);

    // Minimal PDF whose trailer Prev entry points back at its own xref section.
    $pdfData = $header;
    $pdfData .= "xref\n";
    $pdfData .= "0 1\n";
    $pdfData .= "0000000000 65535 f \n";
    $pdfData .= "trailer\n";
    $pdfData .= "<</Size 1/Prev {$xrefOffset}>>\n"; // Prev points back to the xref keyword
    $pdfData .= "startxref\n";
    $pdfData .= "{$xrefOffset}\n";
    $pdfData .= "%%EOF\n";

    // The offset we ask for is already marked as visited.
    $result = $this->fixture->exposeGetXrefData($pdfData, $xrefOffset, [], [$xrefOffset]);

    // The guard returns the xref array it was given, which is empty here.
    self::assertSame([], $result);
}

/**
 * Test that decodeXref hands its visitedOffsets on when it follows a Prev pointer.
 *
 * The trailer below references Prev offset 100, and 100 is supplied as already visited.
 * The nested getXrefData call must therefore return straight away, leaving the xref data
 * collected so far (including the trailer) intact.
 */
public function testDecodeXrefPassesVisitedOffsets(): void
{
    // Minimal xref section whose trailer carries a Prev entry.
    $pdfData = "xref\n"
        ."0 1\n"
        ."0000000000 65535 f \n"
        ."trailer\n"
        ."<</Size 1/Prev 100>>\n";

    // Mark the Prev target as visited so following it cannot recurse.
    $alreadyVisited = [100];

    $result = $this->fixture->exposeDecodeXref($pdfData, 0, [], $alreadyVisited);

    // Decoding completes and still reports the trailer it parsed.
    self::assertIsArray($result);
    self::assertArrayHasKey('trailer', $result);
}

/**
 * Test that getXrefData honours the visitedOffsets list it receives.
 *
 * An offset that is already listed as visited must short-circuit the call: nothing is
 * parsed and the xref array that was passed in comes back unchanged, whether it is empty
 * or already holds data from an earlier section.
 *
 * Note: visitedOffsets is passed by value, so this test cannot observe the offset being
 * appended inside getXrefData; it only pins the early-return behaviour.
 */
public function testGetXrefDataTracksVisitedOffsets(): void
{
    // A small, well-formed document. The 'xref' keyword sits right after the 9-byte
    // header line, so startxref is derived from the header length.
    $header = "%PDF-1.5\n";
    $xrefOffset = \strlen($header);

    $pdfData = $header;
    $pdfData .= "xref\n";
    $pdfData .= "0 1\n";
    $pdfData .= "0000000000 65535 f \n";
    $pdfData .= "trailer\n";
    $pdfData .= "<</Size 1>>\n";
    $pdfData .= "startxref\n";
    $pdfData .= "{$xrefOffset}\n";
    $pdfData .= "%%EOF\n";

    // Offset 50 is already in visitedOffsets, so the call returns immediately.
    $result = $this->fixture->exposeGetXrefData($pdfData, 50, [], [50]);
    self::assertSame([], $result);

    // The guard must hand back the incoming xref untouched, also when the visited
    // offset is only one of several.
    $previousXref = ['trailer' => ['size' => 1]];
    $result = $this->fixture->exposeGetXrefData($pdfData, 50, $previousXref, [$xrefOffset, 50]);
    self::assertSame($previousXref, $result);
}
}