Use the embedded toUnicode CMap directly for text extraction

introfog · introfog · commit 001947588cf1 · 2023-03-29T08:03:07.000Z
DEVSIX-6147
diff --git a/io/src/main/java/com/itextpdf/io/font/CMapEncoding.java b/io/src/main/java/com/itextpdf/io/font/CMapEncoding.java
@@ -191,26 +191,8 @@ public int getCidCode(int cmapCode) {
         }
     }
 
-    public boolean containsCodeInCodeSpaceRange(int code, int length) {
-        for (int i = 0; i < codeSpaceRanges.size(); i += 2) {
-            if (length == codeSpaceRanges.get(i).length) {
-                int mask = 0xff;
-                int totalShift = 0;
-                byte[] low = codeSpaceRanges.get(i);
-                byte[] high = codeSpaceRanges.get(i + 1);
-                boolean fitsIntoRange = true;
-                for (int ind = length - 1; ind >= 0; ind--, totalShift += 8, mask <<= 8) {
-                    int actualByteValue = (code & mask) >> totalShift;
-                    if (!(actualByteValue >= (0xff & low[ind]) && actualByteValue <= (0xff & high[ind]))) {
-                        fitsIntoRange = false;
-                    }
-                }
-                if (fitsIntoRange) {
-                    return true;
-                }
-            }
-        }
-        return false;
+    public List<byte[]> getCodeSpaceRanges() {
+        return codeSpaceRanges;
     }
 
     private static CMapCodepointToCid getCodeToCidCmap(String cmap, CMapCidToCodepoint cid2Code) {
diff --git a/io/src/main/java/com/itextpdf/io/font/cmap/CMapToUnicode.java b/io/src/main/java/com/itextpdf/io/font/cmap/CMapToUnicode.java
@@ -25,13 +25,15 @@ This file is part of the iText (R) project.
 import com.itextpdf.io.logs.IoLogMessageConstant;
 import com.itextpdf.io.util.IntHashtable;
 import com.itextpdf.io.util.TextUtil;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
 
+import java.util.ArrayList;
 import java.util.Collections;
 import java.util.HashMap;
+import java.util.List;
 import java.util.Map;
 import java.util.Set;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 /**
  * This class represents a CMap file.
@@ -42,7 +44,9 @@ public class CMapToUnicode extends AbstractCMap {
 
     public static CMapToUnicode EmptyCMapToUnicodeMap = new CMapToUnicode(true);
 
-    private Map<Integer, char[]> byteMappings;
+    private final Map<Integer, char[]> byteMappings;
+
+    private final List<byte[]> codeSpaceRanges = new ArrayList<>();
 
     private CMapToUnicode(boolean emptyCMap) {
         byteMappings = Collections.<Integer, char[]>emptyMap();
@@ -60,6 +64,7 @@ public static CMapToUnicode getIdentity() {
         for (int i = 0; i < 65537; i++) {
             uni.addChar(i, TextUtil.convertFromUtf32(i));
         }
+        uni.addCodeSpaceRange(new byte[] {0, 0}, new byte[] {(byte) 0xff, (byte) 0xff});
         return uni;
     }
 
@@ -128,6 +133,22 @@ public Map<Integer, Integer> createReverseMapping() {
         return result;
     }
 
+    /**
+     * Returns a list containing sequential pairs of code space range beginnings and endings:
+     * (begincodespacerange1, endcodespacerange1, begincodespacerange2, endcodespacerange2, ...)
+     *
+     * @return list of {@code byte[]} that contain code space ranges
+     */
+    public List<byte[]> getCodeSpaceRanges() {
+        return codeSpaceRanges;
+    }
+
+    @Override
+    void addCodeSpaceRange(byte[] low, byte[] high) {
+        codeSpaceRanges.add(low);
+        codeSpaceRanges.add(high);
+    }
+
     private int convertToInt(char[] s) {
         int value = 0;
         for (int i = 0; i < s.length - 1; i++) {
diff --git a/kernel/src/main/java/com/itextpdf/kernel/font/DocTrueTypeFont.java b/kernel/src/main/java/com/itextpdf/kernel/font/DocTrueTypeFont.java
@@ -22,19 +22,20 @@ This file is part of the iText (R) project.
  */
 package com.itextpdf.kernel.font;
 
-import com.itextpdf.io.font.FontProgram;
-import com.itextpdf.io.logs.IoLogMessageConstant;
 import com.itextpdf.io.font.FontEncoding;
+import com.itextpdf.io.font.FontProgram;
 import com.itextpdf.io.font.TrueTypeFont;
 import com.itextpdf.io.font.cmap.CMapToUnicode;
 import com.itextpdf.io.font.otf.Glyph;
+import com.itextpdf.io.logs.IoLogMessageConstant;
 import com.itextpdf.io.util.IntHashtable;
 import com.itextpdf.kernel.pdf.PdfArray;
 import com.itextpdf.kernel.pdf.PdfDictionary;
 import com.itextpdf.kernel.pdf.PdfName;
 import com.itextpdf.kernel.pdf.PdfNumber;
 import com.itextpdf.kernel.pdf.PdfStream;
 import com.itextpdf.kernel.pdf.PdfString;
+
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -110,12 +111,7 @@ static TrueTypeFont createFontProgram(PdfDictionary fontDictionary, CMapToUnicod
             fontProgram.avgWidth = 0;
             for (int cid : toUnicode.getCodes()) {
                 final int width = widths.containsKey(cid) ? widths.get(cid) : defaultWidth;
-                Glyph glyph = new Glyph(cid, width, toUnicode.lookup(cid));
-                if (glyph.hasValidUnicode()) {
-                    fontProgram.unicodeToGlyph.put(glyph.getUnicode(), glyph);
-                }
-                fontProgram.codeToGlyph.put(cid, glyph);
-                fontProgram.avgWidth += width;
+                fontProgram.registerGlyph(cid, width, toUnicode.lookup(cid));
             }
             if (fontProgram.codeToGlyph.size() != 0) {
                 fontProgram.avgWidth /= fontProgram.codeToGlyph.size();
@@ -267,4 +263,13 @@ static void fillFontDescriptor(DocTrueTypeFont font, PdfDictionary fontDesc) {
             }
         }
     }
+
+    private void registerGlyph(int cid, int width, char[] unicode) {
+        Glyph glyph = new Glyph(cid, width, unicode);
+        if (glyph.hasValidUnicode()) {
+            this.unicodeToGlyph.put(glyph.getUnicode(), glyph);
+        }
+        this.codeToGlyph.put(cid, glyph);
+        this.avgWidth += width;
+    }
 }
diff --git a/kernel/src/main/java/com/itextpdf/kernel/font/PdfType0Font.java b/kernel/src/main/java/com/itextpdf/kernel/font/PdfType0Font.java
@@ -91,6 +91,8 @@ public class PdfType0Font extends PdfFont {
     protected int cidFontType;
     protected char[] specificUnicodeDifferences;
 
+    private final CMapToUnicode embeddedToUnicode;
+
     PdfType0Font(TrueTypeFont ttf, String cmap) {
         super();
         if (!PdfEncodings.IDENTITY_H.equals(cmap) && !PdfEncodings.IDENTITY_V.equals(cmap)) {
@@ -107,6 +109,7 @@ public class PdfType0Font extends PdfFont {
         cmapEncoding = new CMapEncoding(cmap);
         usedGlyphs = new TreeSet<>();
         cidFontType = CID_FONT_TYPE_2;
+        embeddedToUnicode = null;
         if (ttf.isFontSpecific()) {
             specificUnicodeDifferences = new char[256];
             byte[] bytes = new byte[1];
@@ -135,6 +138,7 @@ public class PdfType0Font extends PdfFont {
         cmapEncoding = new CMapEncoding(cmap, uniMap);
         usedGlyphs = new TreeSet<>();
         cidFontType = CID_FONT_TYPE_0;
+        embeddedToUnicode = null;
     }
 
     PdfType0Font(PdfDictionary fontDictionary) {
@@ -151,8 +155,10 @@ public class PdfType0Font extends PdfFont {
         PdfObject toUnicode = fontDictionary.get(PdfName.ToUnicode);
         if (toUnicode == null) {
             toUnicodeCMap = FontUtil.parseUniversalToUnicodeCMap(ordering);
+            embeddedToUnicode = null;
         } else {
             toUnicodeCMap = FontUtil.processToUnicode(toUnicode);
+            embeddedToUnicode = toUnicodeCMap;
         }
 
         if (cmap.isName() && (PdfEncodings.IDENTITY_H.equals(((PdfName) cmap).getValue()) ||
@@ -555,6 +561,11 @@ public GlyphLine decodeIntoGlyphLine(PdfString characterCodes) {
     public boolean appendDecodedCodesToGlyphsList(List<Glyph> list, PdfString characterCodes) {
         boolean allCodesDecoded = true;
 
+        final boolean isToUnicodeEmbedded = embeddedToUnicode != null;
+        final CMapEncoding cmap = getCmap();
+        final FontProgram fontProgram = getFontProgram();
+        final List<byte[]> codeSpaceRanges = isToUnicodeEmbedded ? embeddedToUnicode.getCodeSpaceRanges() : cmap.getCodeSpaceRanges();
+
         String charCodesSequence = characterCodes.getValue();
         // A sequence of one or more bytes shall be extracted from the string and matched against the codespace
         // ranges in the CMap. That is, the first byte shall be matched against 1-byte codespace ranges; if no match is
@@ -568,13 +579,18 @@ public boolean appendDecodedCodesToGlyphsList(List<Glyph> list, PdfString charac
             for (int codeLength = 1; codeLength <= MAX_CID_CODE_LENGTH && i + codeLength <= charCodesSequence.length();
                     codeLength++) {
                 code = (code << 8) + charCodesSequence.charAt(i + codeLength - 1);
-                if (!getCmap().containsCodeInCodeSpaceRange(code, codeLength)) {
-                    continue;
-                } else {
+
+                if (PdfType0Font.containsCodeInCodeSpaceRange(codeSpaceRanges, code, codeLength)) {
                     codeSpaceMatchedLength = codeLength;
+                } else {
+                    continue;
                 }
-                int glyphCode = getCmap().getCidCode(code);
-                glyph = getFontProgram().getGlyphByCode(glyphCode);
+
+                // According to paragraph 9.10.2 of PDF Specification ISO 32000-2, if toUnicode is embedded, it is
+                // necessary to use it to map code points directly to Unicode. If not embedded, use CMap to map code
+                // points to CIDs and then CIDFont to map CIDs to Unicode.
+                int glyphCode = isToUnicodeEmbedded ? code : cmap.getCidCode(code);
+                glyph = fontProgram.getGlyphByCode(glyphCode);
                 if (glyph != null) {
                     i += codeLength - 1;
                     break;
@@ -594,11 +610,11 @@ public boolean appendDecodedCodesToGlyphsList(List<Glyph> list, PdfString charac
                 }
                 i += codeSpaceMatchedLength - 1;
             }
-            if (glyph != null && glyph.getChars() != null) {
-                list.add(glyph);
-            } else {
-                list.add(new Glyph(0, getFontProgram().getGlyphByCode(0).getWidth(), -1));
+            if (glyph == null || glyph.getChars() == null) {
+                list.add(new Glyph(0, fontProgram.getGlyphByCode(0).getWidth(), -1));
                 allCodesDecoded = false;
+            } else {
+                list.add(glyph);
             }
         }
         return allCodesDecoded;
@@ -674,6 +690,28 @@ private static String getOrdering(PdfDictionary cidFont) {
         return cidinfo.containsKey(PdfName.Ordering) ? cidinfo.get(PdfName.Ordering).toString() : null;
     }
 
+    private static boolean containsCodeInCodeSpaceRange(List<byte[]> codeSpaceRanges, int code, int length) {
+        for (int i = 0; i < codeSpaceRanges.size(); i += 2) {
+            if (length == codeSpaceRanges.get(i).length) {
+                int mask = 0xff;
+                int totalShift = 0;
+                byte[] low = codeSpaceRanges.get(i);
+                byte[] high = codeSpaceRanges.get(i + 1);
+                boolean fitsIntoRange = true;
+                for (int ind = length - 1; ind >= 0; ind--, totalShift += 8, mask <<= 8) {
+                    int actualByteValue = (code & mask) >> totalShift;
+                    if (!(actualByteValue >= (0xff & low[ind]) && actualByteValue <= (0xff & high[ind]))) {
+                        fitsIntoRange = false;
+                    }
+                }
+                if (fitsIntoRange) {
+                    return true;
+                }
+            }
+        }
+        return false;
+    }
+
     private void flushFontData() {
         if (cidFontType == CID_FONT_TYPE_0) {
             getPdfObject().put(PdfName.Type, PdfName.Font);
diff --git a/kernel/src/test/java/com/itextpdf/kernel/pdf/canvas/parser/TextRenderInfoTest.java b/kernel/src/test/java/com/itextpdf/kernel/pdf/canvas/parser/TextRenderInfoTest.java
@@ -113,6 +113,18 @@ public void testDoubleMappedCharacterExtraction() throws IOException {
         Assert.assertEquals(expectedResult, result);
     }
 
+    @Test
+    public void testEmbeddedIdentityToUnicodeTest() throws IOException {
+        String inFile = "embedded_identity_to_unicode.pdf";
+        String expectedResult = "Regular hyphen [\u002d] and non-breaking hyphen [\u2011] (both CID 14)\n"
+                + "Turtle kyuujitai [\u9f9c] and turtle radical [\u2fd4] (both CID 7472)";
+
+        PdfDocument pdfDocument = new PdfDocument(new PdfReader(SOURCE_FOLDER + inFile));
+        ITextExtractionStrategy start = new SimpleTextExtractionStrategy();
+
+        String result = PdfTextExtractor.getTextFromPage(pdfDocument.getPage(FIRST_PAGE), start).trim();
+        Assert.assertEquals(expectedResult, result);
+    }
 
     private static class TextPositionEventListener implements IEventListener {
         List<LineSegment> lineSegments = new ArrayList<>();
diff --git a/kernel/src/test/resources/com/itextpdf/kernel/parser/TextRenderInfoTest/embedded_identity_to_unicode.pdf b/kernel/src/test/resources/com/itextpdf/kernel/parser/TextRenderInfoTest/embedded_identity_to_unicode.pdf