Skip to content

Commit a3d47d6

Browse files
authored
Implement missing features for literal string escape sequences (#104)
1 parent 7a2d449 commit a3d47d6

File tree

3 files changed

+123
-4
lines changed

3 files changed

+123
-4
lines changed

src/Document/ContentStream/PositionedText/PositionedTextElement.php

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
namespace PrinsFrank\PdfParser\Document\ContentStream\PositionedText;
44

5+
use PrinsFrank\PdfParser\Document\Generic\Character\LiteralStringEscapeCharacter;
56
use PrinsFrank\PdfParser\Document\Object\Decorator\Font;
67
use PrinsFrank\PdfParser\Exception\ParseFailureException;
78

@@ -23,9 +24,7 @@ public function getText(?Font $font): string {
2324
$string = '';
2425
foreach ($matches as $match) {
2526
if (str_starts_with($match['chars'], '(') && str_ends_with($match['chars'], ')')) {
26-
$chars = str_replace(['\(', '\)', '\n', '\r'], ['(', ')', "\n", "\r"], substr($match['chars'], 1, -1));
27-
$chars = preg_replace_callback('/\\\\([0-7]{3})/', fn (array $matches) => mb_chr((int) octdec($matches[1])), $chars)
28-
?? throw new ParseFailureException();
27+
$chars = LiteralStringEscapeCharacter::unescapeCharacters(substr($match['chars'], 1, -1));
2928
if ($font !== null && ($encoding = $font->getEncoding()) !== null) {
3029
$chars = $encoding->decodeString($chars);
3130
}

src/Document/Generic/Character/LiteralStringEscapeCharacter.php

Lines changed: 45 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33

44
namespace PrinsFrank\PdfParser\Document\Generic\Character;
55

6+
use PrinsFrank\PdfParser\Exception\ParseFailureException;
7+
68
/**
79
* @internal
810
*
@@ -17,5 +19,47 @@ enum LiteralStringEscapeCharacter: string {
1719
case LEFT_PARENTHESIS = '\(';
1820
case RIGHT_PARENTHESIS = '\)';
1921
case REVERSE_SOLIDUS = '\\';
20-
case CHARACTER_CODE = '\ddd';
22+
23+
/**
 * Returns the raw character that this escape sequence stands for inside a literal string.
 *
 * The match is exhaustive over the enum cases, so a newly added case without an arm
 * fails loudly with an UnhandledMatchError instead of silently returning nothing.
 */
public function getActualCharacter(): string {
    $character = match ($this) {
        // Delimiters that have to be escaped to appear inside a literal string
        self::LEFT_PARENTHESIS => '(',
        self::RIGHT_PARENTHESIS => ')',
        self::REVERSE_SOLIDUS => '\\',
        // Whitespace and control characters
        self::HORIZONTAL_TAB => "\t",
        self::LINE_FEED => "\n",
        self::CARRIAGE_RETURN => "\r",
        self::FORM_FEED => "\x0C",
        self::BACKSPACE => "\x08",
    };

    return $character;
}
35+
36+
/**
 * Builds two parallel lists: the escaped notation of every case and, at the same index,
 * the character that notation represents.
 *
 * @return array{0: list<string>, 1: list<string>}
 */
private static function getReplacementSet(): array {
    $cases = self::cases();

    return [
        array_map(static fn (self $case): string => $case->value, $cases),
        array_map(static fn (self $case): string => $case->getActualCharacter(), $cases),
    ];
}
46+
47+
/**
 * Resolves the escape sequences of a PDF literal string (ISO 32000-1, 7.3.4.2) in one left-to-right pass.
 *
 * Handled sequences:
 * - a reverse solidus followed by an end-of-line marker (LF, CR or CRLF) is a line continuation and is dropped
 * - a reverse solidus followed by one to three octal digits becomes the character with that code
 * - the single character escapes known to this enum become the character they represent
 *
 * Everything is resolved in a single scan on purpose: chaining independent replacements re-reads
 * characters that were produced by an earlier replacement, which turns an escaped reverse solidus
 * followed by "n" or by digits into a line feed or an octal character.
 *
 * A reverse solidus followed by any other character, or a trailing lone reverse solidus, is left untouched.
 *
 * @throws ParseFailureException when an octal escape is above 255 or the regular expression engine fails
 */
public static function unescapeCharacters(string $string): string {
    [$find, $replace] = self::getReplacementSet();
    // Keyed by the escaped notation. Assumes those notations are a reverse solidus plus one character.
    $characterByEscape = array_combine($find, $replace);

    return preg_replace_callback(
        '/\\\\(?:(?<octal>[0-7]{1,3})|(?<eol>\r\n|\r|\n)|(?<char>.))/s',
        static function (array $matches) use ($characterByEscape): string {
            // Trailing groups that did not participate are absent from $matches, hence the null coalescing.
            $octal = $matches['octal'] ?? '';
            if ($octal !== '') { // Not a truthiness check: "0" is a valid octal escape
                $decimal = octdec($octal);
                if (!is_int($decimal) || $decimal < 0 || $decimal > 255) {
                    throw new ParseFailureException(sprintf('Invalid octal value "%s"', $octal));
                }

                $character = mb_chr($decimal);
                if ($character === false) {
                    throw new ParseFailureException(sprintf('Invalid octal value "%s"', $octal));
                }

                return $character;
            }

            if (($matches['eol'] ?? '') !== '') {
                return ''; // Example 2, 7.3.4.2: the string continues on the next line
            }

            if (($matches['char'] ?? '') === '\\') {
                return '\\'; // Consumed as a pair so the second reverse solidus cannot start a new escape
            }

            return $characterByEscape[$matches[0]] ?? $matches[0];
        },
        $string
    ) ?? throw new ParseFailureException();
}
2165
}
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
<?php declare(strict_types=1);
2+
3+
namespace PrinsFrank\PdfParser\Tests\Unit\Document\Generic\Character;
4+
5+
use PHPUnit\Framework\Attributes\CoversClass;
6+
use PHPUnit\Framework\TestCase;
7+
use PrinsFrank\PdfParser\Document\Generic\Character\LiteralStringEscapeCharacter;
8+
9+
#[CoversClass(LiteralStringEscapeCharacter::class)]
class LiteralStringEscapeCharacterTest extends TestCase {
    /**
     * Example 2 of ISO 32000-1, 7.3.4.2: a reverse solidus at the end of a line continues the string.
     * The fixture spells the line feeds out so it does not depend on this file's line endings.
     */
    public function testUnescapeCharactersMultilineExample2(): void {
        static::assertSame(
            'These two strings are the same.',
            LiteralStringEscapeCharacter::unescapeCharacters("These \\\ntwo strings \\\nare the same."),
        );
    }

    /**
     * Example 3 of ISO 32000-1, 7.3.4.2: an end-of-line can be written as a real line feed or as the
     * two character escape sequence.
     */
    public function testUnescapeNewlinesExample3(): void {
        // A real line feed is passed through untouched
        static::assertSame(
            "This string has an end-of-line at the end of it.\n",
            LiteralStringEscapeCharacter::unescapeCharacters("This string has an end-of-line at the end of it.\n"),
        );

        // Single quoted on purpose: the input has to contain a reverse solidus followed by "n",
        // a double quoted string or heredoc would already turn that into a line feed
        static::assertSame(
            "So does this one.\n",
            LiteralStringEscapeCharacter::unescapeCharacters('So does this one.\n'),
        );
    }

    /** Example 4 of ISO 32000-1, 7.3.4.2: three digit octal character codes. */
    public function testUnescapeOctalCharactersExample4(): void {
        static::assertSame(
            'This string contains ¥two octal charactersÇ.',
            LiteralStringEscapeCharacter::unescapeCharacters('This string contains \245two octal characters\307.'),
        );
    }

    /** Example 5 of ISO 32000-1, 7.3.4.2: octal codes may be shorter than three digits. */
    public function testUnescapeOctalCharactersExample5(): void {
        static::assertSame('+', LiteralStringEscapeCharacter::unescapeCharacters('\053'));
        static::assertSame('+', LiteralStringEscapeCharacter::unescapeCharacters('\53'));
        // At most three digits belong to the escape, the fourth one is a regular character
        static::assertSame("\005" . '3', LiteralStringEscapeCharacter::unescapeCharacters('\0053'));
    }

    public function testUnescapeCharacters(): void {
        static::assertSame("\n", LiteralStringEscapeCharacter::unescapeCharacters('\n'));
        static::assertSame("\r", LiteralStringEscapeCharacter::unescapeCharacters('\r'));
        static::assertSame("\t", LiteralStringEscapeCharacter::unescapeCharacters('\t'));
        static::assertSame("\x08", LiteralStringEscapeCharacter::unescapeCharacters('\b'));
        static::assertSame("\x0C", LiteralStringEscapeCharacter::unescapeCharacters('\f'));
        static::assertSame("(", LiteralStringEscapeCharacter::unescapeCharacters('\('));
        static::assertSame(")", LiteralStringEscapeCharacter::unescapeCharacters('\)'));
        // Both literals are a single reverse solidus: a lone one is passed through unchanged
        static::assertSame("\\", LiteralStringEscapeCharacter::unescapeCharacters('\\'));
        static::assertSame("\000", LiteralStringEscapeCharacter::unescapeCharacters('\0'));
        static::assertSame("\005" . '35', LiteralStringEscapeCharacter::unescapeCharacters('\00535'));
        static::assertSame("\005" . '353', LiteralStringEscapeCharacter::unescapeCharacters('\005353'));
    }
}

0 commit comments

Comments
 (0)