Skip to content

Commit 38e2a46

Browse files
authored
Implement toUnicodeCMap unicode mappings for string literals (#106)
1 parent a3d47d6 commit 38e2a46

File tree

2 files changed

+31
-31
lines changed

2 files changed

+31
-31
lines changed

src/Document/ContentStream/PositionedText/PositionedTextElement.php

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,10 @@ public function getText(?Font $font): string {
2525
foreach ($matches as $match) {
2626
if (str_starts_with($match['chars'], '(') && str_ends_with($match['chars'], ')')) {
2727
$chars = LiteralStringEscapeCharacter::unescapeCharacters(substr($match['chars'], 1, -1));
28-
if ($font !== null && ($encoding = $font->getEncoding()) !== null) {
28+
if (($encoding = $font?->getEncoding()) !== null) {
2929
$chars = $encoding->decodeString($chars);
30+
} elseif (($toUnicodeCMap = $font?->getToUnicodeCMap() ?? $font?->getToUnicodeCMapDescendantFont()) !== null) {
31+
$chars = $toUnicodeCMap->textToUnicode(bin2hex($chars));
3032
}
3133

3234
$string .= $chars;
@@ -35,7 +37,14 @@ public function getText(?Font $font): string {
3537
throw new ParseFailureException('No font available');
3638
}
3739

38-
$string .= $font->toUnicode(substr($match['chars'], 1, -1));
40+
$chars = substr($match['chars'], 1, -1);
41+
if (($toUnicodeCMap = $font->getToUnicodeCMap() ?? $font->getToUnicodeCMapDescendantFont()) !== null) {
42+
$string .= $toUnicodeCMap->textToUnicode($chars);
43+
} elseif (($encoding = $font->getEncoding()) !== null) {
44+
$string .= $encoding->decodeString(implode('', array_map(fn (string $character) => mb_chr((int) hexdec($character)), str_split($chars, 2))));
45+
} else {
46+
throw new ParseFailureException('Unable to use CMap or decode string to retrieve characters for text object');
47+
}
3948
} else {
4049
throw new ParseFailureException(sprintf('Unrecognized character group format "%s"', $match['chars']));
4150
}

src/Document/Object/Decorator/Font.php

Lines changed: 20 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,26 @@ public function getToUnicodeCMap(): ?ToUnicodeCMap {
8282
return $this->toUnicodeCMap = ToUnicodeCMapParser::parse($stream, 0, $stream->getSizeInBytes());
8383
}
8484

85+
/**
 * Looks up a ToUnicode CMap through this font's descendant fonts.
 *
 * Walks getDescendantFonts(); for every descendant that carries a CIDSystemInfo dictionary, its
 * Registry / Ordering / Supplement values are handed to
 * RegistryOrchestrator::getForRegistryOrderingSupplement(). The first non-null font resource that
 * lookup yields decides the result: its getToUnicodeCMap() value is returned and no further
 * descendant is inspected.
 *
 * @return ToUnicodeCMap|null null when no descendant font resolves to a font resource
 *
 * @throws ParseFailureException when a CIDSystemInfo dictionary lacks Registry, Ordering or Supplement
 */
public function getToUnicodeCMapDescendantFont(): ?ToUnicodeCMap {
86+
foreach ($this->getDescendantFonts() as $descendantFont) {
87+
// A descendant entry is either a Dictionary itself or an object exposing one via getDictionary().
$fontDictionary = $descendantFont instanceof Dictionary ? $descendantFont : $descendantFont->getDictionary();
88+
89+
// Descendants without a CIDSystemInfo dictionary are skipped silently.
if (($CIDSystemInfo = $fontDictionary->getValueForKey(DictionaryKey::CIDSYSTEM_INFO, Dictionary::class)) !== null) {
90+
// All three CIDSystemInfo entries are required here; a missing one aborts with ParseFailureException.
$fontResource = RegistryOrchestrator::getForRegistryOrderingSupplement(
91+
$CIDSystemInfo->getValueForKey(DictionaryKey::REGISTRY, TextStringValue::class) ?? throw new ParseFailureException(),
92+
$CIDSystemInfo->getValueForKey(DictionaryKey::ORDERING, TextStringValue::class) ?? throw new ParseFailureException(),
93+
$CIDSystemInfo->getValueForKey(DictionaryKey::SUPPLEMENT, IntegerValue::class) ?? throw new ParseFailureException(),
94+
);
95+
96+
// NOTE(review): if the resource's getToUnicodeCMap() can return null, this returns null
// without trying the remaining descendants — confirm that is intended.
if ($fontResource !== null) {
97+
return $fontResource->getToUnicodeCMap();
98+
}
99+
}
100+
}
101+
102+
return null;
103+
}
104+
85105
/** @throws PdfParserException */
86106
public function getFirstChar(): ?int {
87107
return $this->getDictionary()
@@ -225,33 +245,4 @@ public function getFontDescriptor(): ?ReferenceValue {
225245
return $this->getDictionary()
226246
->getValueForKey(DictionaryKey::FONT_DESCRIPTOR, ReferenceValue::class);
227247
}
228-
229-
/**
 * Converts a character group to its Unicode text, trying three sources in order:
 *
 *  1. this font's own ToUnicode CMap (getToUnicodeCMap());
 *  2. the ToUnicode CMap of a font resource resolved from a descendant font's CIDSystemInfo
 *     (Registry / Ordering / Supplement) via RegistryOrchestrator;
 *  3. this font's encoding (getEncoding()), after turning the group into characters.
 *
 * @param string $characterGroup passed unchanged to textToUnicode(); in the encoding fallback it is
 *                               read as consecutive two-character hexadecimal codes
 *
 * @throws ParseFailureException when a CIDSystemInfo entry is missing, or when neither a CMap nor
 *                               an encoding is available
 * @throws PdfParserException
 */
230-
public function toUnicode(string $characterGroup): string {
231-
// 1. Prefer the font's own ToUnicode CMap.
$toUnicodeCMap = $this->getToUnicodeCMap();
232-
if ($toUnicodeCMap !== null) {
233-
return $toUnicodeCMap->textToUnicode($characterGroup);
234-
}
235-
236-
// 2. Otherwise look for a CMap through the descendant fonts' CIDSystemInfo.
$descendantFonts = $this->getDictionary()->getObjectsForReference($this->document, DictionaryKey::DESCENDANT_FONTS, Font::class);
237-
foreach ($descendantFonts as $descendantFont) {
238-
if (($CIDSystemInfo = $descendantFont->getDictionary()->getValueForKey(DictionaryKey::CIDSYSTEM_INFO, Dictionary::class)) !== null) {
239-
// Registry, Ordering and Supplement are all mandatory; a missing one throws.
$fontResource = RegistryOrchestrator::getForRegistryOrderingSupplement(
240-
$CIDSystemInfo->getValueForKey(DictionaryKey::REGISTRY, TextStringValue::class) ?? throw new ParseFailureException(),
241-
$CIDSystemInfo->getValueForKey(DictionaryKey::ORDERING, TextStringValue::class) ?? throw new ParseFailureException(),
242-
$CIDSystemInfo->getValueForKey(DictionaryKey::SUPPLEMENT, IntegerValue::class) ?? throw new ParseFailureException(),
243-
);
244-
245-
if ($fontResource !== null) {
246-
return $fontResource->getToUnicodeCMap()->textToUnicode($characterGroup);
247-
}
248-
}
249-
}
250-
251-
// 3. No CMap: split the group into two-character hex codes, convert each with mb_chr() and let
//    the encoding decode the resulting string.
// NOTE(review): mb_chr() emits a multi-byte sequence for values above 0x7F when the internal
// encoding is UTF-8 — confirm decodeString() expects that rather than one byte per code.
if (($encoding = $this->getEncoding()) !== null) {
252-
return $encoding->decodeString(implode('', array_map(fn (string $character) => mb_chr((int) hexdec($character)), str_split($characterGroup, 2))));
253-
}
254-
255-
throw new ParseFailureException('No ToUnicodeCMap or encoding information available for this font');
256-
}
257248
}

0 commit comments

Comments
 (0)