Skip to content

Commit 6b52c6b

Browse files
authored
Ignore Form as well as Image XObjects when assembling the text array for a PDFObject. (#783)
* Ignore Form as well as Image XObjects when assembling the text array for a PDFObject.
* Add test coverage for the change.
1 parent 370b7e9 commit 6b52c6b

File tree

2 files changed

+22
-6
lines changed

2 files changed

+22
-6
lines changed

src/Smalot/PdfParser/PDFObject.php

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -788,8 +788,9 @@ public function getTextArray(?Page $page = null): array
788788
break;
789789
}
790790

791-
// If the PDFObject is an image, do nothing, as images aren't text.
792-
if ($xobject instanceof Image) {
791+
// If the PDFObject is an Image or a Form, do nothing as
792+
// neither of these XObject types is text.
793+
if ($xobject instanceof Image || $xobject instanceof Form) {
793794
break;
794795
}
795796

tests/PHPUnit/Unit/PDFObjectTest.php

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
use Smalot\PdfParser\Header;
1212
use Smalot\PdfParser\Page;
1313
use Smalot\PdfParser\PDFObject;
14+
use Smalot\PdfParser\XObject\Form;
1415
use Smalot\PdfParser\XObject\Image;
1516

1617
class PDFObjectTest extends TestCase
@@ -33,6 +34,7 @@ public function testTextArrayObjects(): void
3334
$document->init();
3435

3536
$image = new Image($document);
37+
$form = new Form($document);
3638
$xObject = new PDFObject($document);
3739

3840
$header1 = new Header([
@@ -41,24 +43,37 @@ public function testTextArrayObjects(): void
4143
'Im0' => $image,
4244
])
4345
]),
44-
'Contents' => new ElementArray([new Element('/Imo Do', $document)], $document),
46+
'Contents' => new ElementArray([new Element('/Im0 Do', $document)], $document),
4547
]);
4648
$page1 = new Page($document, $header1);
4749

4850
$header2 = new Header([
51+
'Resources' => new Header([
52+
'XObject' => new Header([
53+
'Fr0' => $form,
54+
])
55+
]),
56+
'Contents' => new ElementArray([new Element('/Fr0 Do', $document)], $document),
57+
]);
58+
$page2 = new Page($document, $header2);
59+
60+
$header3 = new Header([
4961
'Resources' => new Header([
5062
'XObject' => new Header([
5163
'Ps0' => $xObject,
5264
])
5365
]),
5466
'Contents' => new ElementArray([new Element('/Ps0 Do', $document)], $document),
5567
]);
56-
$page2 = new Page($document, $header2);
68+
$page3 = new Page($document, $header3);
5769

5870
// Page 1 contains an image, which should not appear in the text array.
5971
self::assertSame([], $page1->getTextArray());
6072

61-
// Page 2 contains a non-image object, which should appear in the text array.
62-
self::assertSame([' '], $page2->getTextArray());
73+
// Page 2 contains a form, which should not appear in the text array.
74+
self::assertSame([], $page2->getTextArray());
75+
76+
// Page 3 contains a generic PDFObject (neither an Image nor a Form), which should appear in the text array.
77+
self::assertSame([' '], $page3->getTextArray());
6378
}
6479
}

0 commit comments

Comments
 (0)