Skip to content

Commit 1c6199f

Browse files
committed
Fix text extraction when a ToUnicode CMap contains non-default codespace ranges
DEVSIX-8035
1 parent bc6666f commit 1c6199f

File tree

4 files changed

+34
-10
lines changed

4 files changed

+34
-10
lines changed

kernel/src/main/java/com/itextpdf/kernel/font/PdfType0Font.java

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -744,27 +744,32 @@ private static String getOrdering(PdfDictionary cidFont) {
744744
}
745745

746746
private static boolean containsCodeInCodeSpaceRange(List<byte[]> codeSpaceRanges, int code, int length) {
747+
long unsignedCode = code & 0xffffffff;
747748
for (int i = 0; i < codeSpaceRanges.size(); i += 2) {
748749
if (length == codeSpaceRanges.get(i).length) {
749-
int mask = 0xff;
750-
int totalShift = 0;
751750
byte[] low = codeSpaceRanges.get(i);
752751
byte[] high = codeSpaceRanges.get(i + 1);
753-
boolean fitsIntoRange = true;
754-
for (int ind = length - 1; ind >= 0; ind--, totalShift += 8, mask <<= 8) {
755-
int actualByteValue = (code & mask) >> totalShift;
756-
if (!(actualByteValue >= (0xff & low[ind]) && actualByteValue <= (0xff & high[ind]))) {
757-
fitsIntoRange = false;
758-
}
759-
}
760-
if (fitsIntoRange) {
752+
long lowValue = bytesToLong(low);
753+
long highValue = bytesToLong(high);
754+
if (unsignedCode >= lowValue && unsignedCode <= highValue) {
761755
return true;
762756
}
763757
}
764758
}
765759
return false;
766760
}
767761

762+
private static long bytesToLong(byte[] bytes) {
763+
long res = 0;
764+
int shift = 0;
765+
for (int i = bytes.length - 1; i >= 0; --i) {
766+
res += (bytes[i] & 0xff) << shift;
767+
shift += 8;
768+
}
769+
770+
return res;
771+
}
772+
768773
private void flushFontData() {
769774
if (cidFontType == CID_FONT_TYPE_0) {
770775
getPdfObject().put(PdfName.Type, PdfName.Font);

kernel/src/test/java/com/itextpdf/kernel/pdf/canvas/parser/PdfTextExtractorTest.java

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,4 +132,23 @@ public void shortOctalDataAsTextTest() throws IOException {
132132
Assert.assertEquals("EC", PdfTextExtractor.getTextFromPage(pdfDocument.getPage(1)));
133133
}
134134
}
135+
136+
@Test
137+
public void notDefaultCodespacesCyrillicTest() throws IOException {
138+
String inFile = sourceFolder + "notDefaultCodespacesCyrillic.pdf";
139+
try (PdfDocument pdfDocument = new PdfDocument(new PdfReader(inFile))) {
140+
String extractedText = PdfTextExtractor.getTextFromPage(pdfDocument.getPage(1));
141+
Assert.assertTrue(extractedText.contains("бронирование"));
142+
Assert.assertTrue(extractedText.contains("From"));
143+
}
144+
}
145+
146+
@Test
147+
public void notDefaultCodespacesChineseTest() throws IOException {
148+
String inFile = sourceFolder + "notDefaultCodespacesChinese.pdf";
149+
try (PdfDocument pdfDocument = new PdfDocument(new PdfReader(inFile))) {
150+
String extractedText = PdfTextExtractor.getTextFromPage(pdfDocument.getPage(1));
151+
Assert.assertTrue(extractedText.contains("L3B 廠: 新竹科學工業園區新竹市東區力行二路 1 號"));
152+
}
153+
}
135154
}

0 commit comments

Comments
 (0)