Skip to content

Commit 0019475

Browse files
committed
Use directly toUnicode CMap in embedded case for text extraction
DEVSIX-6147
1 parent e2455e3 commit 0019475

File tree

6 files changed

+98
-40
lines changed

6 files changed

+98
-40
lines changed

io/src/main/java/com/itextpdf/io/font/CMapEncoding.java

Lines changed: 2 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -191,26 +191,8 @@ public int getCidCode(int cmapCode) {
191191
}
192192
}
193193

194-
public boolean containsCodeInCodeSpaceRange(int code, int length) {
195-
for (int i = 0; i < codeSpaceRanges.size(); i += 2) {
196-
if (length == codeSpaceRanges.get(i).length) {
197-
int mask = 0xff;
198-
int totalShift = 0;
199-
byte[] low = codeSpaceRanges.get(i);
200-
byte[] high = codeSpaceRanges.get(i + 1);
201-
boolean fitsIntoRange = true;
202-
for (int ind = length - 1; ind >= 0; ind--, totalShift += 8, mask <<= 8) {
203-
int actualByteValue = (code & mask) >> totalShift;
204-
if (!(actualByteValue >= (0xff & low[ind]) && actualByteValue <= (0xff & high[ind]))) {
205-
fitsIntoRange = false;
206-
}
207-
}
208-
if (fitsIntoRange) {
209-
return true;
210-
}
211-
}
212-
}
213-
return false;
194+
public List<byte[]> getCodeSpaceRanges() {
195+
return codeSpaceRanges;
214196
}
215197

216198
private static CMapCodepointToCid getCodeToCidCmap(String cmap, CMapCidToCodepoint cid2Code) {

io/src/main/java/com/itextpdf/io/font/cmap/CMapToUnicode.java

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,13 +25,15 @@ This file is part of the iText (R) project.
2525
import com.itextpdf.io.logs.IoLogMessageConstant;
2626
import com.itextpdf.io.util.IntHashtable;
2727
import com.itextpdf.io.util.TextUtil;
28-
import org.slf4j.Logger;
29-
import org.slf4j.LoggerFactory;
3028

29+
import java.util.ArrayList;
3130
import java.util.Collections;
3231
import java.util.HashMap;
32+
import java.util.List;
3333
import java.util.Map;
3434
import java.util.Set;
35+
import org.slf4j.Logger;
36+
import org.slf4j.LoggerFactory;
3537

3638
/**
3739
* This class represents a CMap file.
@@ -42,7 +44,9 @@ public class CMapToUnicode extends AbstractCMap {
4244

4345
public static CMapToUnicode EmptyCMapToUnicodeMap = new CMapToUnicode(true);
4446

45-
private Map<Integer, char[]> byteMappings;
47+
private final Map<Integer, char[]> byteMappings;
48+
49+
private final List<byte[]> codeSpaceRanges = new ArrayList<>();
4650

4751
private CMapToUnicode(boolean emptyCMap) {
4852
byteMappings = Collections.<Integer, char[]>emptyMap();
@@ -60,6 +64,7 @@ public static CMapToUnicode getIdentity() {
6064
for (int i = 0; i < 65537; i++) {
6165
uni.addChar(i, TextUtil.convertFromUtf32(i));
6266
}
67+
uni.addCodeSpaceRange(new byte[] {0, 0}, new byte[] {(byte) 0xff, (byte) 0xff});
6368
return uni;
6469
}
6570

@@ -128,6 +133,22 @@ public Map<Integer, Integer> createReverseMapping() {
128133
return result;
129134
}
130135

136+
/**
137+
* Returns a list containing sequential pairs of code space beginnings and endings:
138+
* (begincodespacerange1, endcodespacerange1, begincodespacerange2, endcodespacerange2, ...)
139+
*
140+
* @return list of {@code byte[]} that contain code space ranges
141+
*/
142+
public List<byte[]> getCodeSpaceRanges() {
143+
return codeSpaceRanges;
144+
}
145+
146+
@Override
147+
void addCodeSpaceRange(byte[] low, byte[] high) {
148+
codeSpaceRanges.add(low);
149+
codeSpaceRanges.add(high);
150+
}
151+
131152
private int convertToInt(char[] s) {
132153
int value = 0;
133154
for (int i = 0; i < s.length - 1; i++) {

kernel/src/main/java/com/itextpdf/kernel/font/DocTrueTypeFont.java

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -22,19 +22,20 @@ This file is part of the iText (R) project.
2222
*/
2323
package com.itextpdf.kernel.font;
2424

25-
import com.itextpdf.io.font.FontProgram;
26-
import com.itextpdf.io.logs.IoLogMessageConstant;
2725
import com.itextpdf.io.font.FontEncoding;
26+
import com.itextpdf.io.font.FontProgram;
2827
import com.itextpdf.io.font.TrueTypeFont;
2928
import com.itextpdf.io.font.cmap.CMapToUnicode;
3029
import com.itextpdf.io.font.otf.Glyph;
30+
import com.itextpdf.io.logs.IoLogMessageConstant;
3131
import com.itextpdf.io.util.IntHashtable;
3232
import com.itextpdf.kernel.pdf.PdfArray;
3333
import com.itextpdf.kernel.pdf.PdfDictionary;
3434
import com.itextpdf.kernel.pdf.PdfName;
3535
import com.itextpdf.kernel.pdf.PdfNumber;
3636
import com.itextpdf.kernel.pdf.PdfStream;
3737
import com.itextpdf.kernel.pdf.PdfString;
38+
3839
import org.slf4j.Logger;
3940
import org.slf4j.LoggerFactory;
4041

@@ -110,12 +111,7 @@ static TrueTypeFont createFontProgram(PdfDictionary fontDictionary, CMapToUnicod
110111
fontProgram.avgWidth = 0;
111112
for (int cid : toUnicode.getCodes()) {
112113
final int width = widths.containsKey(cid) ? widths.get(cid) : defaultWidth;
113-
Glyph glyph = new Glyph(cid, width, toUnicode.lookup(cid));
114-
if (glyph.hasValidUnicode()) {
115-
fontProgram.unicodeToGlyph.put(glyph.getUnicode(), glyph);
116-
}
117-
fontProgram.codeToGlyph.put(cid, glyph);
118-
fontProgram.avgWidth += width;
114+
fontProgram.registerGlyph(cid, width, toUnicode.lookup(cid));
119115
}
120116
if (fontProgram.codeToGlyph.size() != 0) {
121117
fontProgram.avgWidth /= fontProgram.codeToGlyph.size();
@@ -267,4 +263,13 @@ static void fillFontDescriptor(DocTrueTypeFont font, PdfDictionary fontDesc) {
267263
}
268264
}
269265
}
266+
267+
private void registerGlyph(int cid, int width, char[] unicode) {
268+
Glyph glyph = new Glyph(cid, width, unicode);
269+
if (glyph.hasValidUnicode()) {
270+
this.unicodeToGlyph.put(glyph.getUnicode(), glyph);
271+
}
272+
this.codeToGlyph.put(cid, glyph);
273+
this.avgWidth += width;
274+
}
270275
}

kernel/src/main/java/com/itextpdf/kernel/font/PdfType0Font.java

Lines changed: 47 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,8 @@ public class PdfType0Font extends PdfFont {
9191
protected int cidFontType;
9292
protected char[] specificUnicodeDifferences;
9393

94+
private final CMapToUnicode embeddedToUnicode;
95+
9496
PdfType0Font(TrueTypeFont ttf, String cmap) {
9597
super();
9698
if (!PdfEncodings.IDENTITY_H.equals(cmap) && !PdfEncodings.IDENTITY_V.equals(cmap)) {
@@ -107,6 +109,7 @@ public class PdfType0Font extends PdfFont {
107109
cmapEncoding = new CMapEncoding(cmap);
108110
usedGlyphs = new TreeSet<>();
109111
cidFontType = CID_FONT_TYPE_2;
112+
embeddedToUnicode = null;
110113
if (ttf.isFontSpecific()) {
111114
specificUnicodeDifferences = new char[256];
112115
byte[] bytes = new byte[1];
@@ -135,6 +138,7 @@ public class PdfType0Font extends PdfFont {
135138
cmapEncoding = new CMapEncoding(cmap, uniMap);
136139
usedGlyphs = new TreeSet<>();
137140
cidFontType = CID_FONT_TYPE_0;
141+
embeddedToUnicode = null;
138142
}
139143

140144
PdfType0Font(PdfDictionary fontDictionary) {
@@ -151,8 +155,10 @@ public class PdfType0Font extends PdfFont {
151155
PdfObject toUnicode = fontDictionary.get(PdfName.ToUnicode);
152156
if (toUnicode == null) {
153157
toUnicodeCMap = FontUtil.parseUniversalToUnicodeCMap(ordering);
158+
embeddedToUnicode = null;
154159
} else {
155160
toUnicodeCMap = FontUtil.processToUnicode(toUnicode);
161+
embeddedToUnicode = toUnicodeCMap;
156162
}
157163

158164
if (cmap.isName() && (PdfEncodings.IDENTITY_H.equals(((PdfName) cmap).getValue()) ||
@@ -555,6 +561,11 @@ public GlyphLine decodeIntoGlyphLine(PdfString characterCodes) {
555561
public boolean appendDecodedCodesToGlyphsList(List<Glyph> list, PdfString characterCodes) {
556562
boolean allCodesDecoded = true;
557563

564+
final boolean isToUnicodeEmbedded = embeddedToUnicode != null;
565+
final CMapEncoding cmap = getCmap();
566+
final FontProgram fontProgram = getFontProgram();
567+
final List<byte[]> codeSpaceRanges = isToUnicodeEmbedded ? embeddedToUnicode.getCodeSpaceRanges() : cmap.getCodeSpaceRanges();
568+
558569
String charCodesSequence = characterCodes.getValue();
559570
// A sequence of one or more bytes shall be extracted from the string and matched against the codespace
560571
// ranges in the CMap. That is, the first byte shall be matched against 1-byte codespace ranges; if no match is
@@ -568,13 +579,18 @@ public boolean appendDecodedCodesToGlyphsList(List<Glyph> list, PdfString charac
568579
for (int codeLength = 1; codeLength <= MAX_CID_CODE_LENGTH && i + codeLength <= charCodesSequence.length();
569580
codeLength++) {
570581
code = (code << 8) + charCodesSequence.charAt(i + codeLength - 1);
571-
if (!getCmap().containsCodeInCodeSpaceRange(code, codeLength)) {
572-
continue;
573-
} else {
582+
583+
if (PdfType0Font.containsCodeInCodeSpaceRange(codeSpaceRanges, code, codeLength)) {
574584
codeSpaceMatchedLength = codeLength;
585+
} else {
586+
continue;
575587
}
576-
int glyphCode = getCmap().getCidCode(code);
577-
glyph = getFontProgram().getGlyphByCode(glyphCode);
588+
589+
// According to paragraph 9.10.2 of PDF Specification ISO 32000-2, if toUnicode is embedded, it is
590+
// necessary to use it to map directly code points to unicode. If not embedded, use CMap to map code
591+
// points to CIDs and then CIDFont to map CIDs to unicode.
592+
int glyphCode = isToUnicodeEmbedded ? code : cmap.getCidCode(code);
593+
glyph = fontProgram.getGlyphByCode(glyphCode);
578594
if (glyph != null) {
579595
i += codeLength - 1;
580596
break;
@@ -594,11 +610,11 @@ public boolean appendDecodedCodesToGlyphsList(List<Glyph> list, PdfString charac
594610
}
595611
i += codeSpaceMatchedLength - 1;
596612
}
597-
if (glyph != null && glyph.getChars() != null) {
598-
list.add(glyph);
599-
} else {
600-
list.add(new Glyph(0, getFontProgram().getGlyphByCode(0).getWidth(), -1));
613+
if (glyph == null || glyph.getChars() == null) {
614+
list.add(new Glyph(0, fontProgram.getGlyphByCode(0).getWidth(), -1));
601615
allCodesDecoded = false;
616+
} else {
617+
list.add(glyph);
602618
}
603619
}
604620
return allCodesDecoded;
@@ -674,6 +690,28 @@ private static String getOrdering(PdfDictionary cidFont) {
674690
return cidinfo.containsKey(PdfName.Ordering) ? cidinfo.get(PdfName.Ordering).toString() : null;
675691
}
676692

693+
private static boolean containsCodeInCodeSpaceRange(List<byte[]> codeSpaceRanges, int code, int length) {
694+
for (int i = 0; i < codeSpaceRanges.size(); i += 2) {
695+
if (length == codeSpaceRanges.get(i).length) {
696+
int mask = 0xff;
697+
int totalShift = 0;
698+
byte[] low = codeSpaceRanges.get(i);
699+
byte[] high = codeSpaceRanges.get(i + 1);
700+
boolean fitsIntoRange = true;
701+
for (int ind = length - 1; ind >= 0; ind--, totalShift += 8, mask <<= 8) {
702+
int actualByteValue = (code & mask) >> totalShift;
703+
if (!(actualByteValue >= (0xff & low[ind]) && actualByteValue <= (0xff & high[ind]))) {
704+
fitsIntoRange = false;
705+
}
706+
}
707+
if (fitsIntoRange) {
708+
return true;
709+
}
710+
}
711+
}
712+
return false;
713+
}
714+
677715
private void flushFontData() {
678716
if (cidFontType == CID_FONT_TYPE_0) {
679717
getPdfObject().put(PdfName.Type, PdfName.Font);

kernel/src/test/java/com/itextpdf/kernel/pdf/canvas/parser/TextRenderInfoTest.java

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,18 @@ public void testDoubleMappedCharacterExtraction() throws IOException {
113113
Assert.assertEquals(expectedResult, result);
114114
}
115115

116+
@Test
117+
public void testEmbeddedIdentityToUnicodeTest() throws IOException {
118+
String inFile = "embedded_identity_to_unicode.pdf";
119+
String expectedResult = "Regular hyphen [\u002d] and non-breaking hyphen [\u2011] (both CID 14)\n"
120+
+ "Turtle kyuujitai [\u9f9c] and turtle radical [\u2fd4] (both CID 7472)";
121+
122+
PdfDocument pdfDocument = new PdfDocument(new PdfReader(SOURCE_FOLDER + inFile));
123+
ITextExtractionStrategy start = new SimpleTextExtractionStrategy();
124+
125+
String result = PdfTextExtractor.getTextFromPage(pdfDocument.getPage(FIRST_PAGE), start).trim();
126+
Assert.assertEquals(expectedResult, result);
127+
}
116128

117129
private static class TextPositionEventListener implements IEventListener {
118130
List<LineSegment> lineSegments = new ArrayList<>();

0 commit comments

Comments
 (0)