Skip to content

Commit b9ec019

Browse files
committed
Fix problem with text extraction in case of non-identity CMap
Extract codepoint to CID CMap directly, without losing mappings DEVSIX-6147
1 parent 09af7a6 commit b9ec019

File tree

10 files changed

+183
-40
lines changed

10 files changed

+183
-40
lines changed

io/src/main/java/com/itextpdf/io/font/CMapEncoding.java

Lines changed: 21 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -22,34 +22,32 @@ This file is part of the iText (R) project.
2222
*/
2323
package com.itextpdf.io.font;
2424

25-
import com.itextpdf.io.logs.IoLogMessageConstant;
26-
import com.itextpdf.io.font.cmap.CMapCidByte;
27-
import com.itextpdf.io.font.cmap.CMapCidUni;
25+
import com.itextpdf.io.font.cmap.CMapCidToCodepoint;
26+
import com.itextpdf.io.font.cmap.CMapCodepointToCid;
2827
import com.itextpdf.io.font.cmap.CMapLocationFromBytes;
2928
import com.itextpdf.io.font.cmap.CMapParser;
29+
import com.itextpdf.io.logs.IoLogMessageConstant;
3030
import com.itextpdf.io.source.ByteBuffer;
31-
import com.itextpdf.io.util.IntHashtable;
32-
import org.slf4j.LoggerFactory;
3331

3432
import java.io.IOException;
3533
import java.util.Arrays;
3634
import java.util.List;
3735
import java.util.Objects;
36+
import org.slf4j.LoggerFactory;
3837

3938
public class CMapEncoding {
4039

4140
private static final List<byte[]> IDENTITY_H_V_CODESPACE_RANGES = Arrays.asList(new byte[] {0, 0}, new byte[] {(byte)0xff, (byte)0xff});
4241

43-
private String cmap;
42+
private final String cmap;
4443
private String uniMap;
4544

4645
// true if CMap is Identity-H/V
4746
private boolean isDirect;
4847

49-
private CMapCidUni cid2Uni;
50-
private CMapCidByte cid2Code;
48+
private CMapCidToCodepoint cid2Code;
5149

52-
private IntHashtable code2Cid;
50+
private CMapCodepointToCid code2Cid;
5351

5452
private List<byte[]> codeSpaceRanges;
5553

@@ -76,22 +74,21 @@ public CMapEncoding(String cmap, String uniMap) {
7674
this.cmap = cmap;
7775
this.uniMap = uniMap;
7876
if (cmap.equals(PdfEncodings.IDENTITY_H) || cmap.equals(PdfEncodings.IDENTITY_V)) {
79-
cid2Uni = FontCache.getCid2UniCmap(uniMap);
8077
isDirect = true;
8178
this.codeSpaceRanges = IDENTITY_H_V_CODESPACE_RANGES;
8279
} else {
83-
cid2Code = FontCache.getCid2Byte(cmap);
84-
code2Cid = cid2Code.getReversMap();
80+
cid2Code = FontCache.getCidToCodepointCmap(cmap);
81+
code2Cid = CMapEncoding.getCodeToCidCmap(cmap, cid2Code);
8582
this.codeSpaceRanges = cid2Code.getCodeSpaceRanges();
8683
}
8784
}
8885

8986
public CMapEncoding(String cmap, byte[] cmapBytes) {
9087
this.cmap = cmap;
91-
cid2Code = new CMapCidByte();
88+
cid2Code = new CMapCidToCodepoint();
9289
try {
9390
CMapParser.parseCid(cmap, cid2Code, new CMapLocationFromBytes(cmapBytes));
94-
code2Cid = cid2Code.getReversMap();
91+
code2Cid = CMapEncoding.getCodeToCidCmap(cmap, cid2Code);
9592
this.codeSpaceRanges = cid2Code.getCodeSpaceRanges();
9693
} catch (IOException e) {
9794
LoggerFactory.getLogger(getClass()).error(IoLogMessageConstant.FAILED_TO_PARSE_ENCODING_STREAM);
@@ -190,7 +187,7 @@ public int getCidCode(int cmapCode) {
190187
if (isDirect) {
191188
return cmapCode;
192189
} else {
193-
return code2Cid.get(cmapCode);
190+
return code2Cid.lookup(cmapCode);
194191
}
195192
}
196193

@@ -215,4 +212,13 @@ public boolean containsCodeInCodeSpaceRange(int code, int length) {
215212
}
216213
return false;
217214
}
215+
216+
private static CMapCodepointToCid getCodeToCidCmap(String cmap, CMapCidToCodepoint cid2Code) {
217+
try {
218+
return FontCache.getCodepointToCidCmap(cmap);
219+
} catch (com.itextpdf.io.exceptions.IOException ex) {
220+
// if not found, fall back to reversing
221+
return new CMapCodepointToCid(cid2Code);
222+
}
223+
}
218224
}

io/src/main/java/com/itextpdf/io/font/FontCache.java

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -25,14 +25,16 @@ This file is part of the iText (R) project.
2525
import com.itextpdf.io.exceptions.IOException;
2626
import com.itextpdf.io.font.cmap.AbstractCMap;
2727
import com.itextpdf.io.font.cmap.CMapByteCid;
28-
import com.itextpdf.io.font.cmap.CMapCidByte;
28+
import com.itextpdf.io.font.cmap.CMapCidToCodepoint;
2929
import com.itextpdf.io.font.cmap.CMapCidUni;
30+
import com.itextpdf.io.font.cmap.CMapCodepointToCid;
3031
import com.itextpdf.io.font.cmap.CMapLocationResource;
3132
import com.itextpdf.io.font.cmap.CMapParser;
3233
import com.itextpdf.io.font.cmap.CMapUniCid;
3334
import com.itextpdf.io.font.constants.FontResources;
3435
import com.itextpdf.io.util.IntHashtable;
3536
import com.itextpdf.io.util.ResourceUtil;
37+
3638
import java.io.InputStream;
3739
import java.util.HashMap;
3840
import java.util.HashSet;
@@ -133,20 +135,23 @@ public static CMapCidUni getCid2UniCmap(String uniMap) {
133135
}
134136

135137
public static CMapUniCid getUni2CidCmap(String uniMap) {
136-
CMapUniCid uniCid = new CMapUniCid();
137-
return parseCmap(uniMap, uniCid);
138+
return parseCmap(uniMap, new CMapUniCid());
138139
}
139140

140141
public static CMapByteCid getByte2CidCmap(String cmap) {
141142
CMapByteCid uniCid = new CMapByteCid();
142143
return parseCmap(cmap, uniCid);
143144
}
144145

145-
public static CMapCidByte getCid2Byte(String cmap) {
146-
CMapCidByte cidByte = new CMapCidByte();
146+
public static CMapCidToCodepoint getCidToCodepointCmap(String cmap) {
147+
CMapCidToCodepoint cidByte = new CMapCidToCodepoint();
147148
return parseCmap(cmap, cidByte);
148149
}
149150

151+
public static CMapCodepointToCid getCodepointToCidCmap(String uniMap) {
152+
return parseCmap(uniMap, new CMapCodepointToCid());
153+
}
154+
150155
/**
151156
* Clears the cache by removing fonts that were added via {@link #saveFont(FontProgram, String)}.
152157
* <p>

io/src/main/java/com/itextpdf/io/font/cmap/CMapCidByte.java renamed to io/src/main/java/com/itextpdf/io/font/cmap/CMapCidToCodepoint.java

Lines changed: 7 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -29,14 +29,11 @@ This file is part of the iText (R) project.
2929
import java.util.List;
3030
import java.util.Map;
3131

32-
/**
33-
* @author psoares
34-
*/
35-
public class CMapCidByte extends AbstractCMap {
32+
public class CMapCidToCodepoint extends AbstractCMap {
33+
private static final byte[] EMPTY = {};
3634

37-
private Map<Integer, byte[]> map = new HashMap<>();
38-
private final byte[] EMPTY = {};
39-
private List<byte[]> codeSpaceRanges = new ArrayList<>();
35+
private final Map<Integer, byte[]> map = new HashMap<>();
36+
private final List<byte[]> codeSpaceRanges = new ArrayList<>();
4037

4138
@Override
4239
void addChar(String mark, CMapObject code) {
@@ -57,14 +54,14 @@ public byte[] lookup(int cid) {
5754

5855
public IntHashtable getReversMap() {
5956
IntHashtable code2cid = new IntHashtable(map.size());
60-
for (int cid : map.keySet()) {
61-
byte[] bytes = map.get(cid);
57+
for (Map.Entry<Integer, byte[]> entry : map.entrySet()) {
58+
byte[] bytes = entry.getValue();
6259
int byteCode = 0;
6360
for (byte b: bytes) {
6461
byteCode <<= 8;
6562
byteCode += b & 0xff;
6663
}
67-
code2cid.put(byteCode, cid);
64+
code2cid.put(byteCode, entry.getKey());
6865
}
6966
return code2cid;
7067
}
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
package com.itextpdf.io.font.cmap;
2+
3+
import com.itextpdf.io.util.IntHashtable;
4+
5+
/**
6+
* Class represents real codepoint-CID mapping without any additional manipulation.
7+
*
8+
* <p>
9+
* See {@link CMapCidToCodepoint} for CID-codepoint representation.
10+
*/
11+
public class CMapCodepointToCid extends AbstractCMap {
12+
13+
private final IntHashtable map;
14+
15+
public CMapCodepointToCid() {
16+
map = new IntHashtable();
17+
}
18+
19+
public CMapCodepointToCid(CMapCidToCodepoint reverseMap) {
20+
map = reverseMap.getReversMap();
21+
}
22+
23+
@Override
24+
void addChar(String mark, CMapObject code) {
25+
if (code.isNumber()) {
26+
byte[] ser = decodeStringToByte(mark);
27+
int byteCode = 0;
28+
for (byte b: ser) {
29+
byteCode <<= 8;
30+
byteCode += b & 0xff;
31+
}
32+
map.put(byteCode, (int) code.getValue());
33+
}
34+
}
35+
36+
public int lookup(int codepoint) {
37+
return this.map.get(codepoint);
38+
}
39+
}

io/src/test/java/com/itextpdf/io/font/FontCacheNoFontAsianTest.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,7 @@ public void getByte2CidCMapNoFontAsian() {
149149
public void getCid2ByteCMapNoFontAsian() {
150150
// Without font-asian module in the class path
151151
// no CMap can be found.
152-
Assert.assertThrows(IOException.class, () -> FontCache.getCid2Byte("78ms-RKSJ-H"));
152+
Assert.assertThrows(IOException.class, () -> FontCache.getCidToCodepointCmap("78ms-RKSJ-H"));
153153
}
154154

155155
private static class FontProgramMock extends FontProgram {
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
package com.itextpdf.io.font.cmap;
2+
3+
import com.itextpdf.io.util.IntHashtable;
4+
import com.itextpdf.test.ExtendedITextTest;
5+
import com.itextpdf.test.annotations.type.UnitTest;
6+
7+
import java.util.List;
8+
import org.junit.Assert;
9+
import org.junit.Test;
10+
import org.junit.experimental.categories.Category;
11+
12+
@Category(UnitTest.class)
13+
public class CMapCidToCodepointTest extends ExtendedITextTest {
14+
@Test
15+
public void addCharAndLookupTest() {
16+
CMapCidToCodepoint cidToCode = new CMapCidToCodepoint();
17+
Assert.assertArrayEquals(new byte[0], cidToCode.lookup(14));
18+
cidToCode.addChar(new String(new byte[] {32, 17}), new CMapObject(CMapObject.NUMBER, 14));
19+
cidToCode.addChar(new String(new byte[] {32, 19}), new CMapObject(CMapObject.STRING, "some text"));
20+
21+
Assert.assertArrayEquals(new byte[] {32, 17}, cidToCode.lookup(14));
22+
Assert.assertArrayEquals(new byte[0], cidToCode.lookup(1));
23+
}
24+
25+
@Test
26+
public void getReverseMapTest() {
27+
CMapCidToCodepoint cidToCode = new CMapCidToCodepoint();
28+
cidToCode.addChar(new String(new byte[] {32, 17}), new CMapObject(CMapObject.NUMBER, 14));
29+
cidToCode.addChar(new String(new byte[] {32, 18}), new CMapObject(CMapObject.NUMBER, 15));
30+
31+
IntHashtable table = cidToCode.getReversMap();
32+
Assert.assertEquals(2, table.size());
33+
Assert.assertEquals(14, table.get(8209));
34+
Assert.assertEquals(15, table.get(8210));
35+
}
36+
37+
@Test
38+
public void addAndGetCodeSpaceRangeTest() {
39+
CMapCidToCodepoint cidToCode = new CMapCidToCodepoint();
40+
Assert.assertTrue(cidToCode.getCodeSpaceRanges().isEmpty());
41+
42+
cidToCode.addCodeSpaceRange(new byte[] {11}, new byte[] {12, 13});
43+
cidToCode.addCodeSpaceRange(null, new byte[] {});
44+
List<byte[]> codeSpaceRanges = cidToCode.getCodeSpaceRanges();
45+
Assert.assertEquals(4, codeSpaceRanges.size());
46+
Assert.assertArrayEquals(new byte[] {11}, codeSpaceRanges.get(0));
47+
Assert.assertArrayEquals(new byte[] {12, 13}, codeSpaceRanges.get(1));
48+
Assert.assertNull(codeSpaceRanges.get(2));
49+
Assert.assertArrayEquals(new byte[] {}, codeSpaceRanges.get(3));
50+
}
51+
}
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
package com.itextpdf.io.font.cmap;
2+
3+
import com.itextpdf.test.ExtendedITextTest;
4+
import com.itextpdf.test.annotations.type.UnitTest;
5+
6+
import org.junit.Assert;
7+
import org.junit.Test;
8+
import org.junit.experimental.categories.Category;
9+
10+
@Category(UnitTest.class)
11+
public class CMapCodepointToCidTest extends ExtendedITextTest {
12+
@Test
13+
public void reverseConstructorTest() {
14+
CMapCidToCodepoint cidToCode = new CMapCidToCodepoint();
15+
cidToCode.addChar(new String(new byte[] {32, 17}), new CMapObject(CMapObject.NUMBER, 14));
16+
cidToCode.addChar(new String(new byte[] {32, 18}), new CMapObject(CMapObject.NUMBER, 15));
17+
18+
CMapCodepointToCid codeToCid = new CMapCodepointToCid(cidToCode);
19+
Assert.assertEquals(14, codeToCid.lookup(8209));
20+
Assert.assertEquals(15, codeToCid.lookup(8210));
21+
}
22+
23+
@Test
24+
public void addCharAndLookupTest() {
25+
CMapCodepointToCid codeToCid = new CMapCodepointToCid();
26+
Assert.assertEquals(0, codeToCid.lookup(8209));
27+
28+
codeToCid.addChar(new String(new byte[] {32, 17}), new CMapObject(CMapObject.NUMBER, 14));
29+
codeToCid.addChar(new String(new byte[] {32, 19}), new CMapObject(CMapObject.STRING, "some text"));
30+
31+
Assert.assertEquals(14, codeToCid.lookup(8209));
32+
Assert.assertEquals(0, codeToCid.lookup(1));
33+
}
34+
}

kernel/src/main/java/com/itextpdf/kernel/font/FontUtil.java

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,14 +22,14 @@ This file is part of the iText (R) project.
2222
*/
2323
package com.itextpdf.kernel.font;
2424

25-
import com.itextpdf.io.logs.IoLogMessageConstant;
2625
import com.itextpdf.io.font.FontCache;
2726
import com.itextpdf.io.font.PdfEncodings;
2827
import com.itextpdf.io.font.cmap.CMapLocationFromBytes;
2928
import com.itextpdf.io.font.cmap.CMapParser;
3029
import com.itextpdf.io.font.cmap.CMapToUnicode;
3130
import com.itextpdf.io.font.cmap.CMapUniCid;
3231
import com.itextpdf.io.font.cmap.ICMapLocation;
32+
import com.itextpdf.io.logs.IoLogMessageConstant;
3333
import com.itextpdf.io.util.IntHashtable;
3434
import com.itextpdf.kernel.pdf.PdfArray;
3535
import com.itextpdf.kernel.pdf.PdfName;
@@ -40,7 +40,6 @@ This file is part of the iText (R) project.
4040
import java.security.SecureRandom;
4141
import java.util.Arrays;
4242
import java.util.HashMap;
43-
4443
import org.slf4j.Logger;
4544
import org.slf4j.LoggerFactory;
4645

@@ -93,9 +92,6 @@ static CMapToUnicode getToUnicodeFromUniMap(String uniMap) {
9392
toUnicode = CMapToUnicode.getIdentity();
9493
} else {
9594
CMapUniCid uni = FontCache.getUni2CidCmap(uniMap);
96-
if (uni == null) {
97-
return null;
98-
}
9995
toUnicode = uni.exportToUnicode();
10096
}
10197
uniMaps.put(uniMap, toUnicode);

0 commit comments

Comments
 (0)