Skip to content

Commit 1216089

Browse files
committed
Use /ToUnicode mappings to several chars during text extraction from simple fonts
Refactor existing PdfFont#decode(PdfString) and PdfFont#getContentWidth(PdfString) methods DEVSIX-1681 (cherry picked from commit 9a3157e)
1 parent 408bace commit 1216089

File tree

10 files changed

+2106
-1111
lines changed

10 files changed

+2106
-1111
lines changed

kernel/src/main/java/com/itextpdf/kernel/font/PdfSimpleFont.java

Lines changed: 26 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ This file is part of the iText (R) project.
4949
import com.itextpdf.io.font.FontNames;
5050
import com.itextpdf.io.font.FontProgram;
5151
import com.itextpdf.io.font.PdfEncodings;
52+
import com.itextpdf.io.font.cmap.CMapToUnicode;
5253
import com.itextpdf.io.font.otf.Glyph;
5354
import com.itextpdf.io.font.otf.GlyphLine;
5455
import com.itextpdf.io.util.ArrayUtil;
@@ -62,6 +63,7 @@ This file is part of the iText (R) project.
6263
import com.itextpdf.kernel.pdf.PdfString;
6364

6465
import java.util.ArrayList;
66+
import java.util.Arrays;
6567
import java.util.List;
6668

6769
public abstract class PdfSimpleFont<T extends FontProgram> extends PdfFont {
@@ -79,8 +81,15 @@ public abstract class PdfSimpleFont<T extends FontProgram> extends PdfFont {
7981
*/
8082
protected byte[] shortTag = new byte[256];
8183

84+
/**
85+
* Currently only exists for the fonts that are parsed from the document.
86+
* In the future, we might provide the possibility to add custom mappings after a font has been created from a font file.
87+
*/
88+
protected CMapToUnicode toUnicode;
89+
8290
protected PdfSimpleFont(PdfDictionary fontDictionary) {
8391
super(fontDictionary);
92+
toUnicode = FontUtil.processToUnicode(fontDictionary.get(PdfName.ToUnicode));
8493
}
8594

8695
protected PdfSimpleFont() {
@@ -254,22 +263,8 @@ public void writeText(String text, PdfOutputStream stream) {
254263
}
255264

256265
@Override
257-
// TODO refactor using decodeIntoGlyphLine?
258266
public String decode(PdfString content) {
259-
byte[] contentBytes = content.getValueBytes();
260-
StringBuilder builder = new StringBuilder(contentBytes.length);
261-
for (byte b : contentBytes) {
262-
int uni = fontEncoding.getUnicode(b & 0xff);
263-
if (uni > -1) {
264-
builder.append((char) (int) uni);
265-
} else if (fontEncoding.getBaseEncoding() == null) {
266-
Glyph glyph = fontProgram.getGlyphByCode(b & 0xff);
267-
if (glyph != null && glyph.getChars() != null) {
268-
builder.append(glyph.getChars());
269-
}
270-
}
271-
}
272-
return builder.toString();
267+
return decodeIntoGlyphLine(content).toString();
273268
}
274269

275270
/**
@@ -281,12 +276,20 @@ public GlyphLine decodeIntoGlyphLine(PdfString content) {
281276
List<Glyph> glyphs = new ArrayList<>(contentBytes.length);
282277
for (byte b : contentBytes) {
283278
int code = b & 0xff;
284-
int uni = fontEncoding.getUnicode(code);
285279
Glyph glyph = null;
286-
if (uni > -1) {
287-
glyph = getGlyph(uni);
288-
} else if (fontEncoding.getBaseEncoding() == null) {
289-
glyph = fontProgram.getGlyphByCode(code);
280+
if (toUnicode != null && toUnicode.lookup(code) != null && (glyph = fontProgram.getGlyphByCode(code)) != null) {
281+
if (!Arrays.equals(toUnicode.lookup(code), glyph.getChars())) {
282+
// Copy the glyph because the original one may be reused (e.g. standard Helvetica font program)
283+
glyph = new Glyph(glyph);
284+
glyph.setChars(toUnicode.lookup(code));
285+
}
286+
} else {
287+
int uni = fontEncoding.getUnicode(code);
288+
if (uni > -1) {
289+
glyph = getGlyph(uni);
290+
} else if (fontEncoding.getBaseEncoding() == null) {
291+
glyph = fontProgram.getGlyphByCode(code);
292+
}
290293
}
291294
if (glyph != null) {
292295
glyphs.add(glyph);
@@ -296,19 +299,11 @@ public GlyphLine decodeIntoGlyphLine(PdfString content) {
296299
}
297300

298301
@Override
299-
// TODO refactor using decodeIntoGlyphLine?
300302
public float getContentWidth(PdfString content) {
301303
float width = 0;
302-
byte[] contentBytes = content.getValueBytes();
303-
for (byte b : contentBytes) {
304-
Glyph glyph = null;
305-
int uni = fontEncoding.getUnicode(b & 0xff);
306-
if (uni > -1) {
307-
glyph = getGlyph(uni);
308-
} else if (fontEncoding.getBaseEncoding() == null) {
309-
glyph = fontProgram.getGlyphByCode(b & 0xff);
310-
}
311-
width += glyph != null ? glyph.getWidth() : 0;
304+
GlyphLine glyphLine = decodeIntoGlyphLine(content);
305+
for (int i = glyphLine.start; i < glyphLine.end; i++) {
306+
width += glyphLine.get(i).getWidth();
312307
}
313308
return width;
314309
}

kernel/src/main/java/com/itextpdf/kernel/font/PdfTrueTypeFont.java

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,6 @@ This file is part of the iText (R) project.
4646
import com.itextpdf.io.font.FontEncoding;
4747
import com.itextpdf.io.font.FontNames;
4848
import com.itextpdf.io.font.TrueTypeFont;
49-
import com.itextpdf.io.font.cmap.CMapToUnicode;
5049
import com.itextpdf.io.font.otf.Glyph;
5150
import com.itextpdf.kernel.PdfException;
5251
import com.itextpdf.kernel.pdf.PdfDictionary;
@@ -88,9 +87,8 @@ public class PdfTrueTypeFont extends PdfSimpleFont<TrueTypeFont> {
8887
PdfTrueTypeFont(PdfDictionary fontDictionary) {
8988
super(fontDictionary);
9089
newFont = false;
91-
CMapToUnicode toUni = FontUtil.processToUnicode(fontDictionary.get(PdfName.ToUnicode));
92-
fontEncoding = DocFontEncoding.createDocFontEncoding(fontDictionary.get(PdfName.Encoding), toUni);
93-
fontProgram = DocTrueTypeFont.createFontProgram(fontDictionary, fontEncoding, toUni);
90+
fontEncoding = DocFontEncoding.createDocFontEncoding(fontDictionary.get(PdfName.Encoding), toUnicode);
91+
fontProgram = DocTrueTypeFont.createFontProgram(fontDictionary, fontEncoding, toUnicode);
9492
embedded = ((IDocFontProgram) fontProgram).getFontFile() != null;
9593
subset = false;
9694
}

kernel/src/main/java/com/itextpdf/kernel/font/PdfType0Font.java

Lines changed: 14 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -516,24 +516,8 @@ private boolean isAppendableGlyph(Glyph glyph) {
516516
}
517517

518518
@Override
519-
// TODO refactor using decodeIntoGlyphLine?
520519
public String decode(PdfString content) {
521-
String cids = content.getValue();
522-
if (cids.length() == 1) {
523-
return "";
524-
}
525-
StringBuilder builder = new StringBuilder(cids.length() / 2);
526-
//number of cids must be even. With i < cids.length() - 1 we guarantee, that we will not process the last odd index.
527-
for (int i = 0; i < cids.length() - 1; i += 2) {
528-
int code = (cids.charAt(i) << 8) + cids.charAt(i + 1);
529-
Glyph glyph = fontProgram.getGlyphByCode(cmapEncoding.getCidCode(code));
530-
if (glyph != null && glyph.getChars() != null) {
531-
builder.append(glyph.getChars());
532-
} else {
533-
builder.append('\ufffd');
534-
}
535-
}
536-
return builder.toString();
520+
return decodeIntoGlyphLine(content).toString();
537521
}
538522

539523
/**
@@ -549,7 +533,12 @@ public GlyphLine decodeIntoGlyphLine(PdfString content) {
549533
//number of cids must be even. With i < cids.length() - 1 we guarantee, that we will not process the last odd index.
550534
for (int i = 0; i < cids.length() - 1; i += 2) {
551535
int code = (cids.charAt(i) << 8) + cids.charAt(i + 1);
552-
Glyph glyph = fontProgram.getGlyphByCode(cmapEncoding.getCidCode(code));
536+
int glyphCode = cmapEncoding.getCidCode(code);
537+
Glyph glyph = fontProgram.getGlyphByCode(glyphCode);
538+
if (glyph == null) {
539+
Logger logger = LoggerFactory.getLogger(PdfType0Font.class);
540+
logger.warn(MessageFormatUtil.format(LogMessageConstant.COULD_NOT_FIND_GLYPH_WITH_CODE, glyphCode));
541+
}
553542
if (glyph != null && glyph.getChars() != null) {
554543
glyphs.add(glyph);
555544
} else {
@@ -560,24 +549,17 @@ public GlyphLine decodeIntoGlyphLine(PdfString content) {
560549
}
561550

562551
@Override
563-
// TODO refactor using decodeIntoGlyphLine?
564552
public float getContentWidth(PdfString content) {
565-
String cids = content.getValue();
566553
Glyph notdef = fontProgram.getGlyphByCode(0);
567554
float width = 0;
568-
for (int i = 0; i < cids.length(); i++) {
569-
int code = cids.charAt(i++);
570-
if (i < cids.length()) {
571-
code <<= 8;
572-
code |= cids.charAt(i);
573-
}
574-
int glyphCode = cmapEncoding.getCidCode(code);
575-
Glyph glyph = fontProgram.getGlyphByCode(glyphCode);
576-
if (glyph == null) {
577-
Logger logger = LoggerFactory.getLogger(PdfType0Font.class);
578-
logger.warn(MessageFormatUtil.format(LogMessageConstant.COULD_NOT_FIND_GLYPH_WITH_CODE, glyphCode));
555+
GlyphLine glyphLine = decodeIntoGlyphLine(content);
556+
for (int i = glyphLine.start; i < glyphLine.end; i++) {
557+
Glyph glyph = glyphLine.get(i);
558+
if (glyph.getCode() >= 0) {
559+
width += glyph.getWidth();
560+
} else {
561+
width += notdef.getWidth();
579562
}
580-
width += glyph != null ? glyph.getWidth() : notdef.getWidth();
581563
}
582564
return width;
583565
}

kernel/src/main/java/com/itextpdf/kernel/font/PdfType1Font.java

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,6 @@ This file is part of the iText (R) project.
4545

4646
import com.itextpdf.io.font.FontEncoding;
4747
import com.itextpdf.io.font.Type1Font;
48-
import com.itextpdf.io.font.cmap.CMapToUnicode;
4948
import com.itextpdf.io.font.otf.Glyph;
5049
import com.itextpdf.kernel.pdf.PdfDictionary;
5150
import com.itextpdf.kernel.pdf.PdfName;
@@ -77,11 +76,10 @@ public class PdfType1Font extends PdfSimpleFont<Type1Font> {
7776
PdfType1Font(PdfDictionary fontDictionary) {
7877
super(fontDictionary);
7978
newFont = false;
80-
CMapToUnicode toUni = FontUtil.processToUnicode(fontDictionary.get(PdfName.ToUnicode));
8179
//if there is no FontDescriptor, it is most likely one of the Standard Fonts with StandardEncoding as base encoding.
8280
boolean fillStandardEncoding = !fontDictionary.containsKey(PdfName.FontDescriptor);
83-
fontEncoding = DocFontEncoding.createDocFontEncoding(fontDictionary.get(PdfName.Encoding), toUni);
84-
fontProgram = DocType1Font.createFontProgram(fontDictionary, fontEncoding, toUni);
81+
fontEncoding = DocFontEncoding.createDocFontEncoding(fontDictionary.get(PdfName.Encoding), toUnicode);
82+
fontProgram = DocType1Font.createFontProgram(fontDictionary, fontEncoding, toUnicode);
8583

8684
if (fontProgram instanceof IDocFontProgram) {
8785
embedded = ((IDocFontProgram) fontProgram).getFontFile() != null;

kernel/src/main/java/com/itextpdf/kernel/font/PdfType3Font.java

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -97,8 +97,7 @@ public class PdfType3Font extends PdfSimpleFont<Type3FontProgram> {
9797
subset = true;
9898
embedded = true;
9999
fontProgram = new Type3FontProgram(false);
100-
CMapToUnicode toUni = FontUtil.processToUnicode(fontDictionary.get(PdfName.ToUnicode));
101-
fontEncoding = DocFontEncoding.createDocFontEncoding(fontDictionary.get(PdfName.Encoding), toUni);
100+
fontEncoding = DocFontEncoding.createDocFontEncoding(fontDictionary.get(PdfName.Encoding), toUnicode);
102101
PdfDictionary charProcsDic = getPdfObject().getAsDictionary(PdfName.CharProcs);
103102
PdfArray fontMatrixArray = getPdfObject().getAsArray(PdfName.FontMatrix);
104103
if (getPdfObject().containsKey(PdfName.FontBBox)) {
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
package com.itextpdf.kernel.pdf.canvas.parser;
2+
3+
import com.itextpdf.kernel.pdf.PdfDocument;
4+
import com.itextpdf.kernel.pdf.PdfReader;
5+
import com.itextpdf.kernel.pdf.canvas.parser.listener.LocationTextExtractionStrategy;
6+
import com.itextpdf.test.ExtendedITextTest;
7+
import org.junit.Assert;
8+
import org.junit.Test;
9+
10+
import java.io.IOException;
11+
12+
public class SimpleFontToUnicodeExtractionTest extends ExtendedITextTest {
13+
14+
private static final String sourceFolder = "./src/test/resources/com/itextpdf/kernel/parser/SimpleFontToUnicodeExtractionTest/";
15+
16+
@Test
17+
public void test01() throws IOException {
18+
PdfDocument pdfDocument = new PdfDocument(new PdfReader(sourceFolder + "simpleFontToUnicode01.pdf"));
19+
String expected = "Information plays a central role in soci-\n" +
20+
"ety today, and it is becoming more and \n" +
21+
"more common for that information to \n" +
22+
"be offered in digital form alone. The re-\n" +
23+
"liable, user-friendly Portable Document \n" +
24+
"Format (PDF) has become the world’s \n" +
25+
"file type of choice for providing infor-\n" +
26+
"mation as a digital document. \n" +
27+
"Tags can be added to a PDF in order \n" +
28+
"to structure the content of a document. \n" +
29+
"These tags are a critical requirement if \n" +
30+
"any form of assistive technology (such \n" +
31+
"as screen readers, specialist mice, and \n" +
32+
"speech recognition and text-to-speech \n" +
33+
"software) is to gain access to this con-\n" +
34+
"tent. To date, PDF documents have rare-\n" +
35+
"ly been tagged, and not all software can \n" +
36+
"make use of PDF tags. In practical terms, \n" +
37+
"this particularly reduces information‘s \n" +
38+
"accessibility for people with disabilities \n" +
39+
"who rely on assistive technology.";
40+
41+
String actualText = PdfTextExtractor.getTextFromPage(pdfDocument.getPage(1),
42+
new LocationTextExtractionStrategy());
43+
44+
Assert.assertEquals(expected, actualText);
45+
}
46+
47+
@Test
48+
public void test02() throws IOException {
49+
PdfDocument pdfDocument = new PdfDocument(new PdfReader(sourceFolder + "simpleFontToUnicode02.pdf"));
50+
String expected = "ffaast";
51+
52+
String actualText = PdfTextExtractor.getTextFromPage(pdfDocument.getPage(1),
53+
new LocationTextExtractionStrategy());
54+
55+
Assert.assertEquals(expected, actualText);
56+
}
57+
58+
}

kernel/src/test/java/com/itextpdf/kernel/pdf/canvas/parser/TextRenderInfoTest.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ public void testCharacterRenderInfos() throws Exception {
8585
* Japanese. TextRenderInfo threw an AIOOBE for some characters.
8686
*/
8787
@Test
88-
@LogMessages(messages = {@LogMessage(messageTemplate = LogMessageConstant.COULD_NOT_FIND_GLYPH_WITH_CODE)})
88+
@LogMessages(messages = {@LogMessage(messageTemplate = LogMessageConstant.COULD_NOT_FIND_GLYPH_WITH_CODE, count = 2)})
8989
public void testUnicodeEmptyString() throws Exception {
9090
StringBuilder sb = new StringBuilder();
9191
String inFile = "japanese_text.pdf";

0 commit comments

Comments
 (0)