Use /ToUnicode mappings that map a character code to several chars during text extraction from simple fonts

Snipx · Snipx · commit 121608931db1 · 2017-12-06T17:23:34.000+03:00
Refactor existing PdfFont#decode(PdfString) and PdfFont#getContentWidth(PdfString) methods to delegate to decodeIntoGlyphLine(PdfString). DEVSIX-1681 (cherry picked from commit 9a3157e)
diff --git a/kernel/src/main/java/com/itextpdf/kernel/font/PdfSimpleFont.java b/kernel/src/main/java/com/itextpdf/kernel/font/PdfSimpleFont.java
@@ -49,6 +49,7 @@ This file is part of the iText (R) project.
 import com.itextpdf.io.font.FontNames;
 import com.itextpdf.io.font.FontProgram;
 import com.itextpdf.io.font.PdfEncodings;
+import com.itextpdf.io.font.cmap.CMapToUnicode;
 import com.itextpdf.io.font.otf.Glyph;
 import com.itextpdf.io.font.otf.GlyphLine;
 import com.itextpdf.io.util.ArrayUtil;
@@ -62,6 +63,7 @@ This file is part of the iText (R) project.
 import com.itextpdf.kernel.pdf.PdfString;
 
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.List;
 
 public abstract class PdfSimpleFont<T extends FontProgram> extends PdfFont {
@@ -79,8 +81,15 @@ public abstract class PdfSimpleFont<T extends FontProgram> extends PdfFont {
      */
     protected byte[] shortTag = new byte[256];
 
+    /**
+     * Currently only exists for the fonts that are parsed from the document.
+     * In the future, we might provide possibility to add custom mappings after a font has been created from a font file.
+     */
+    protected CMapToUnicode toUnicode;
+
     protected PdfSimpleFont(PdfDictionary fontDictionary) {
         super(fontDictionary);
+        toUnicode = FontUtil.processToUnicode(fontDictionary.get(PdfName.ToUnicode));
     }
 
     protected PdfSimpleFont() {
@@ -254,22 +263,8 @@ public void writeText(String text, PdfOutputStream stream) {
     }
 
     @Override
-    // TODO refactor using decodeIntoGlyphLine?
     public String decode(PdfString content) {
-        byte[] contentBytes = content.getValueBytes();
-        StringBuilder builder = new StringBuilder(contentBytes.length);
-        for (byte b : contentBytes) {
-            int uni = fontEncoding.getUnicode(b & 0xff);
-            if (uni > -1) {
-                builder.append((char) (int) uni);
-            } else if (fontEncoding.getBaseEncoding() == null) {
-                Glyph glyph = fontProgram.getGlyphByCode(b & 0xff);
-                if (glyph != null && glyph.getChars() != null) {
-                    builder.append(glyph.getChars());
-                }
-            }
-        }
-        return builder.toString();
+        return decodeIntoGlyphLine(content).toString();
     }
 
     /**
@@ -281,12 +276,20 @@ public GlyphLine decodeIntoGlyphLine(PdfString content) {
         List<Glyph> glyphs = new ArrayList<>(contentBytes.length);
         for (byte b : contentBytes) {
             int code = b & 0xff;
-            int uni = fontEncoding.getUnicode(code);
             Glyph glyph = null;
-            if (uni > -1) {
-                glyph = getGlyph(uni);
-            } else if (fontEncoding.getBaseEncoding() == null) {
-                glyph = fontProgram.getGlyphByCode(code);
+            if (toUnicode != null && toUnicode.lookup(code) != null && (glyph = fontProgram.getGlyphByCode(code)) != null) {
+                if (!Arrays.equals(toUnicode.lookup(code), glyph.getChars())) {
+                    // Copy the glyph because the original one may be reused (e.g. standard Helvetica font program)
+                    glyph = new Glyph(glyph);
+                    glyph.setChars(toUnicode.lookup(code));
+                }
+            } else {
+                int uni = fontEncoding.getUnicode(code);
+                if (uni > -1) {
+                    glyph = getGlyph(uni);
+                } else if (fontEncoding.getBaseEncoding() == null) {
+                    glyph = fontProgram.getGlyphByCode(code);
+                }
             }
             if (glyph != null) {
                 glyphs.add(glyph);
@@ -296,19 +299,11 @@ public GlyphLine decodeIntoGlyphLine(PdfString content) {
     }
 
     @Override
-    // TODO refactor using decodeIntoGlyphLine?
     public float getContentWidth(PdfString content) {
         float width = 0;
-        byte[] contentBytes = content.getValueBytes();
-        for (byte b : contentBytes) {
-            Glyph glyph = null;
-            int uni = fontEncoding.getUnicode(b & 0xff);
-            if (uni > -1) {
-                glyph = getGlyph(uni);
-            } else if (fontEncoding.getBaseEncoding() == null) {
-                glyph = fontProgram.getGlyphByCode(b & 0xff);
-            }
-            width += glyph != null ? glyph.getWidth() : 0;
+        GlyphLine glyphLine = decodeIntoGlyphLine(content);
+        for (int i = glyphLine.start; i < glyphLine.end; i++) {
+            width += glyphLine.get(i).getWidth();
         }
         return width;
     }
diff --git a/kernel/src/main/java/com/itextpdf/kernel/font/PdfTrueTypeFont.java b/kernel/src/main/java/com/itextpdf/kernel/font/PdfTrueTypeFont.java
@@ -46,7 +46,6 @@ This file is part of the iText (R) project.
 import com.itextpdf.io.font.FontEncoding;
 import com.itextpdf.io.font.FontNames;
 import com.itextpdf.io.font.TrueTypeFont;
-import com.itextpdf.io.font.cmap.CMapToUnicode;
 import com.itextpdf.io.font.otf.Glyph;
 import com.itextpdf.kernel.PdfException;
 import com.itextpdf.kernel.pdf.PdfDictionary;
@@ -88,9 +87,8 @@ public class PdfTrueTypeFont extends PdfSimpleFont<TrueTypeFont> {
     PdfTrueTypeFont(PdfDictionary fontDictionary) {
         super(fontDictionary);
         newFont = false;
-        CMapToUnicode toUni = FontUtil.processToUnicode(fontDictionary.get(PdfName.ToUnicode));
-        fontEncoding = DocFontEncoding.createDocFontEncoding(fontDictionary.get(PdfName.Encoding), toUni);
-        fontProgram = DocTrueTypeFont.createFontProgram(fontDictionary, fontEncoding, toUni);
+        fontEncoding = DocFontEncoding.createDocFontEncoding(fontDictionary.get(PdfName.Encoding), toUnicode);
+        fontProgram = DocTrueTypeFont.createFontProgram(fontDictionary, fontEncoding, toUnicode);
         embedded = ((IDocFontProgram) fontProgram).getFontFile() != null;
         subset = false;
     }
diff --git a/kernel/src/main/java/com/itextpdf/kernel/font/PdfType0Font.java b/kernel/src/main/java/com/itextpdf/kernel/font/PdfType0Font.java
@@ -516,24 +516,8 @@ private boolean isAppendableGlyph(Glyph glyph) {
     }
 
     @Override
-    // TODO refactor using decodeIntoGlyphLine?
     public String decode(PdfString content) {
-        String cids = content.getValue();
-        if (cids.length() == 1) {
-            return "";
-        }
-        StringBuilder builder = new StringBuilder(cids.length() / 2);
-        //number of cids must be even. With i < cids.length() - 1 we garantee, that we will not process the last odd index.
-        for (int i = 0; i < cids.length() - 1; i += 2) {
-            int code = (cids.charAt(i) << 8) + cids.charAt(i + 1);
-            Glyph glyph = fontProgram.getGlyphByCode(cmapEncoding.getCidCode(code));
-            if (glyph != null && glyph.getChars() != null) {
-                builder.append(glyph.getChars());
-            } else {
-                builder.append('\ufffd');
-            }
-        }
-        return builder.toString();
+        return decodeIntoGlyphLine(content).toString();
     }
 
     /**
@@ -549,7 +533,12 @@ public GlyphLine decodeIntoGlyphLine(PdfString content) {
         //number of cids must be even. With i < cids.length() - 1 we guarantee, that we will not process the last odd index.
         for (int i = 0; i < cids.length() - 1; i += 2) {
             int code = (cids.charAt(i) << 8) + cids.charAt(i + 1);
-            Glyph glyph = fontProgram.getGlyphByCode(cmapEncoding.getCidCode(code));
+            int glyphCode = cmapEncoding.getCidCode(code);
+            Glyph glyph = fontProgram.getGlyphByCode(glyphCode);
+            if (glyph == null) {
+                Logger logger = LoggerFactory.getLogger(PdfType0Font.class);
+                logger.warn(MessageFormatUtil.format(LogMessageConstant.COULD_NOT_FIND_GLYPH_WITH_CODE, glyphCode));
+            }
             if (glyph != null && glyph.getChars() != null) {
                 glyphs.add(glyph);
             } else {
@@ -560,24 +549,17 @@ public GlyphLine decodeIntoGlyphLine(PdfString content) {
     }
 
     @Override
-    // TODO refactor using decodeIntoGlyphLine?
     public float getContentWidth(PdfString content) {
-        String cids = content.getValue();
         Glyph notdef = fontProgram.getGlyphByCode(0);
         float width = 0;
-        for (int i = 0; i < cids.length(); i++) {
-            int code = cids.charAt(i++);
-            if (i < cids.length()) {
-                code <<= 8;
-                code |= cids.charAt(i);
-            }
-            int glyphCode = cmapEncoding.getCidCode(code);
-            Glyph glyph = fontProgram.getGlyphByCode(glyphCode);
-            if (glyph == null) {
-                Logger logger = LoggerFactory.getLogger(PdfType0Font.class);
-                logger.warn(MessageFormatUtil.format(LogMessageConstant.COULD_NOT_FIND_GLYPH_WITH_CODE, glyphCode));
+        GlyphLine glyphLine = decodeIntoGlyphLine(content);
+        for (int i = glyphLine.start; i < glyphLine.end; i++) {
+            Glyph glyph = glyphLine.get(i);
+            if (glyph.getCode() >= 0) {
+                width += glyph.getWidth();
+            } else {
+                width += notdef.getWidth();
             }
-            width += glyph != null ? glyph.getWidth() : notdef.getWidth();
         }
         return width;
     }
diff --git a/kernel/src/main/java/com/itextpdf/kernel/font/PdfType1Font.java b/kernel/src/main/java/com/itextpdf/kernel/font/PdfType1Font.java
@@ -45,7 +45,6 @@ This file is part of the iText (R) project.
 
 import com.itextpdf.io.font.FontEncoding;
 import com.itextpdf.io.font.Type1Font;
-import com.itextpdf.io.font.cmap.CMapToUnicode;
 import com.itextpdf.io.font.otf.Glyph;
 import com.itextpdf.kernel.pdf.PdfDictionary;
 import com.itextpdf.kernel.pdf.PdfName;
@@ -77,11 +76,10 @@ public class PdfType1Font extends PdfSimpleFont<Type1Font> {
     PdfType1Font(PdfDictionary fontDictionary) {
         super(fontDictionary);
         newFont = false;
-        CMapToUnicode toUni = FontUtil.processToUnicode(fontDictionary.get(PdfName.ToUnicode));
         //if there is no FontDescriptor, it is most likely one of the Standard Font with StandardEncoding as base encoding.
         boolean fillStandardEncoding = !fontDictionary.containsKey(PdfName.FontDescriptor);
-        fontEncoding = DocFontEncoding.createDocFontEncoding(fontDictionary.get(PdfName.Encoding), toUni);
-        fontProgram = DocType1Font.createFontProgram(fontDictionary, fontEncoding, toUni);
+        fontEncoding = DocFontEncoding.createDocFontEncoding(fontDictionary.get(PdfName.Encoding), toUnicode);
+        fontProgram = DocType1Font.createFontProgram(fontDictionary, fontEncoding, toUnicode);
 
         if (fontProgram instanceof IDocFontProgram) {
             embedded = ((IDocFontProgram) fontProgram).getFontFile() != null;
diff --git a/kernel/src/main/java/com/itextpdf/kernel/font/PdfType3Font.java b/kernel/src/main/java/com/itextpdf/kernel/font/PdfType3Font.java
@@ -97,8 +97,7 @@ public class PdfType3Font extends PdfSimpleFont<Type3FontProgram> {
         subset = true;
         embedded = true;
         fontProgram = new Type3FontProgram(false);
-        CMapToUnicode toUni = FontUtil.processToUnicode(fontDictionary.get(PdfName.ToUnicode));
-        fontEncoding = DocFontEncoding.createDocFontEncoding(fontDictionary.get(PdfName.Encoding), toUni);
+        fontEncoding = DocFontEncoding.createDocFontEncoding(fontDictionary.get(PdfName.Encoding), toUnicode);
         PdfDictionary charProcsDic = getPdfObject().getAsDictionary(PdfName.CharProcs);
         PdfArray fontMatrixArray = getPdfObject().getAsArray(PdfName.FontMatrix);
         if (getPdfObject().containsKey(PdfName.FontBBox)) {
diff --git a/kernel/src/test/java/com/itextpdf/kernel/pdf/canvas/parser/SimpleFontToUnicodeExtractionTest.java b/kernel/src/test/java/com/itextpdf/kernel/pdf/canvas/parser/SimpleFontToUnicodeExtractionTest.java
@@ -0,0 +1,58 @@
+package com.itextpdf.kernel.pdf.canvas.parser;
+
+import com.itextpdf.kernel.pdf.PdfDocument;
+import com.itextpdf.kernel.pdf.PdfReader;
+import com.itextpdf.kernel.pdf.canvas.parser.listener.LocationTextExtractionStrategy;
+import com.itextpdf.test.ExtendedITextTest;
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.io.IOException;
+
+public class SimpleFontToUnicodeExtractionTest extends ExtendedITextTest {
+
+    private static final String sourceFolder = "./src/test/resources/com/itextpdf/kernel/parser/SimpleFontToUnicodeExtractionTest/";
+
+    @Test
+    public void test01() throws IOException {
+        PdfDocument pdfDocument = new PdfDocument(new PdfReader(sourceFolder + "simpleFontToUnicode01.pdf"));
+        String expected = "Information plays a central role in soci-\n" +
+                "ety today, and it is becoming more and \n" +
+                "more common for that information to \n" +
+                "be offered in digital form alone. The re-\n" +
+                "liable, user-friendly Portable Document \n" +
+                "Format (PDF) has become the world’s \n" +
+                "file type of choice for providing infor-\n" +
+                "mation as a digital document. \n" +
+                "Tags can be added to a PDF in order \n" +
+                "to structure the content of a document. \n" +
+                "These tags are a critical requirement if \n" +
+                "any form of assistive technology (such \n" +
+                "as screen readers, specialist mice, and \n" +
+                "speech recognition and text-to-speech \n" +
+                "software) is to gain access to this con-\n" +
+                "tent. To date, PDF documents have rare-\n" +
+                "ly been tagged, and not all software can \n" +
+                "make use of PDF tags. In practical terms, \n" +
+                "this particularly reduces information‘s \n" +
+                "accessibility for people with disabilities \n" +
+                "who rely on assistive technology.";
+
+        String actualText = PdfTextExtractor.getTextFromPage(pdfDocument.getPage(1),
+                new LocationTextExtractionStrategy());
+
+        Assert.assertEquals(expected, actualText);
+    }
+
+    @Test
+    public void test02() throws IOException {
+        PdfDocument pdfDocument = new PdfDocument(new PdfReader(sourceFolder + "simpleFontToUnicode02.pdf"));
+        String expected = "ffaast";
+
+        String actualText = PdfTextExtractor.getTextFromPage(pdfDocument.getPage(1),
+                new LocationTextExtractionStrategy());
+
+        Assert.assertEquals(expected, actualText);
+    }
+
+}
diff --git a/kernel/src/test/java/com/itextpdf/kernel/pdf/canvas/parser/TextRenderInfoTest.java b/kernel/src/test/java/com/itextpdf/kernel/pdf/canvas/parser/TextRenderInfoTest.java
@@ -85,7 +85,7 @@ public void testCharacterRenderInfos() throws Exception {
      * Japanese. TextRenderInfo threw an AIOOBE for some characters.
      */
     @Test
-    @LogMessages(messages = {@LogMessage(messageTemplate = LogMessageConstant.COULD_NOT_FIND_GLYPH_WITH_CODE)})
+    @LogMessages(messages = {@LogMessage(messageTemplate = LogMessageConstant.COULD_NOT_FIND_GLYPH_WITH_CODE, count = 2)})
     public void testUnicodeEmptyString() throws Exception {
         StringBuilder sb = new StringBuilder();
         String inFile = "japanese_text.pdf";
diff --git a/kernel/src/test/resources/com/itextpdf/kernel/parser/SimpleFontToUnicodeExtractionTest/simpleFontToUnicode01.pdf b/kernel/src/test/resources/com/itextpdf/kernel/parser/SimpleFontToUnicodeExtractionTest/simpleFontToUnicode01.pdf
diff --git a/kernel/src/test/resources/com/itextpdf/kernel/parser/SimpleFontToUnicodeExtractionTest/simpleFontToUnicode02.pdf b/kernel/src/test/resources/com/itextpdf/kernel/parser/SimpleFontToUnicodeExtractionTest/simpleFontToUnicode02.pdf
diff --git a/kernel/src/test/resources/com/itextpdf/kernel/utils/TaggedPdfReaderToolTest/cmpXml01.xml b/kernel/src/test/resources/com/itextpdf/kernel/utils/TaggedPdfReaderToolTest/cmpXml01.xml