Improve string heuristics in content stream pane

MatthiasValvekens · MatthiasValvekens · commit b663bf57cd6f · 2022-05-11T13:51:32.000+02:00
 - Add more symbols to PERMITTED_SYMBOLS
 - Instead of considering a string to be binary if it contains one
   "bad" character, consider it binary only when more than 10% of its
   characters are "bad".
 - Exception: strings containing character codes that aren't defined in
   PDFDocEncoding (codes up to ^W other than tab, LF and CR) are always
   considered binary.

RES-677
diff --git a/src/main/java/com/itextpdf/rups/view/itext/contentstream/ContentStreamHandlingUtils.java b/src/main/java/com/itextpdf/rups/view/itext/contentstream/ContentStreamHandlingUtils.java
@@ -61,7 +61,9 @@ final class ContentStreamHandlingUtils {
      * Symbols permitted when guessing whether a string without obvious encoding
      * is text or not.
      */
-    private static final String PERMITTED_SYMBOLS = "<>()\\/?.!{} \n\t";
+    private static final String PERMITTED_SYMBOLS = "<>()\\/?.!{}[]-_@$€#%&*^+=`~,;:|'\" \n\t";
+
+    private static final double PERMITTED_NONTEXT_PROPORTION = 0.1; // max. share of unexpected chars still treated as text
 
     private static final char[] HEX_DIGITS = "0123456789abcdef".toCharArray();
 
@@ -142,13 +144,19 @@ static boolean isMaybePdfDocEncodedText(byte[] b) {
         // No explicit encoding -> make an attempt with PDFDocEncoding
         final String asPdfDoc = PdfEncodings.convertToString(b, PdfEncodings.PDF_DOC_ENCODING);
         // check whether the result looks like a sensible text string
+        int unexpectedCharacters = 0;
         for (int i = 0; i < asPdfDoc.length(); i++) {
             final char c = asPdfDoc.charAt(i);
-            if (!Character.isLetterOrDigit(c) && PERMITTED_SYMBOLS.indexOf(c) == -1) {
+            // if the codepoint is undefined in PDFDocEncoding -> immediately assume binary
+            // See Annex D.3 in ISO 32000-2:2020: all values up to and including ^W (octal 027), excluding tab, LF and CR
+            if(c != '\n' && c != '\t' && c != '\r' && c <= '\027') {
                 return false;
             }
+            if(!Character.isLetterOrDigit(c) && PERMITTED_SYMBOLS.indexOf(c) == -1) {
+                unexpectedCharacters += 1;
+            }
         }
-        return true;
+        return unexpectedCharacters <= PERMITTED_NONTEXT_PROPORTION * asPdfDoc.length();
     }
 
     private static int hexDigitValue(char c) {
diff --git a/src/test/java/com/itextpdf/rups/view/itext/contentstream/PdfDocEncodingHeuristicTest.java b/src/test/java/com/itextpdf/rups/view/itext/contentstream/PdfDocEncodingHeuristicTest.java
@@ -0,0 +1,140 @@
+/*
+    This file is part of the iText (R) project.
+    Copyright (c) 1998-2022 iText Group NV
+    Authors: iText Software.
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License version 3
+    as published by the Free Software Foundation with the addition of the
+    following permission added to Section 15 as permitted in Section 7(a):
+    FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY
+    ITEXT GROUP. ITEXT GROUP DISCLAIMS THE WARRANTY OF NON INFRINGEMENT
+    OF THIRD PARTY RIGHTS
+
+    This program is distributed in the hope that it will be useful, but
+    WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+    or FITNESS FOR A PARTICULAR PURPOSE.
+    See the GNU Affero General Public License for more details.
+    You should have received a copy of the GNU Affero General Public License
+    along with this program; if not, see http://www.gnu.org/licenses or write to
+    the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+    Boston, MA, 02110-1301 USA, or download the license from the following URL:
+    http://itextpdf.com/terms-of-use/
+
+    The interactive user interfaces in modified source and object code versions
+    of this program must display Appropriate Legal Notices, as required under
+    Section 5 of the GNU Affero General Public License.
+
+    In accordance with Section 7(b) of the GNU Affero General Public License,
+    a covered work must retain the producer line in every PDF that is created
+    or manipulated using iText.
+
+    You can be released from the requirements of the license by purchasing
+    a commercial license. Buying such a license is mandatory as soon as you
+    develop commercial activities involving the iText software without
+    disclosing the source code of your own applications.
+    These activities include: offering paid services to customers as an ASP,
+    serving PDFs on the fly in a web application, shipping iText with a closed
+    source product.
+
+    For more information, please contact iText Software Corp. at this
+    address: sales@itextpdf.com
+ */
+package com.itextpdf.rups.view.itext.contentstream;
+
+import com.itextpdf.io.font.PdfEncodings;
+import com.itextpdf.test.annotations.type.UnitTest;
+import org.junit.Assert;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.Collection;
+
+@RunWith(Parameterized.class)
+@Category(UnitTest.class)
+public class PdfDocEncodingHeuristicTest {
+    @Parameterized.Parameters // each case is { input bytes, expected heuristic verdict }
+    public static Collection<Object[]> data() {
+        Collection<Object[]> cases = new ArrayList<>();
+        String[] positiveStrings = new String[] { // encoded to PDFDocEncoding bytes below; expected to be detected as text
+                "abccadslk fjds",
+                "abccadslk\tfjds",
+                "abccadslk\nfjds",
+                "abccadslk\rfjds",
+                "/+xy1209837a$^!@$#&#*!&dksjfao7210",
+                "/+xy120921312½",
+                "en_US", "en-US",
+                "test@example.com",
+                "© iText Software",
+                "Bär"
+        };
+
+        byte[][] positiveBytes = new byte[][] { // raw bytes expected to be detected as text
+                new byte[] { 0x68, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x68, 0x65, 0x6c, 0x6c, 0x6f },
+                new byte[] { 0x68, 0x65, 0x6c, 0x6c, 0x6f, (byte) 0x92, 0x68, 0x65, 0x6c, 0x6c, 0x6f }
+        };
+
+        String[] negativeStrings = new String[] { // encoded to PDFDocEncoding bytes below; expected to be rejected
+                "©z®z",
+                "/+xy2½",
+                "abccadslk\ffjds", // form feed is whitespace, but undefined in PDFDocEncoding
+                "Hello\007world" // non-whitespace control character
+        };
+
+        byte[][] negativeBytes = new byte[][] { // raw bytes expected to be rejected
+                // utf8 rendering of ä doesn't represent a letter in PDFDocEncoding
+                "Bär".getBytes(StandardCharsets.UTF_8),
+                // proportion of non-letter bytes too high
+                new byte[] { 0x68, 0x65, 0x6c, 0x6c, 0x6f, (byte) 0x92},
+                // no letter bytes at all, only non-whitespace control bytes
+                new byte[] { 0x01, 0x02, 0x03, 0x04 },
+                // contains control character that isn't whitespace
+                new byte[] { 0x68, 0x65, 0x6c, 0x6c, 0x6f, 0x01, 0x68, 0x65, 0x6c, 0x6c, 0x6f }
+        };
+
+        for(String s : positiveStrings) {
+            cases.add(new Object[] {
+                    PdfEncodings.convertToBytes(s, PdfEncodings.PDF_DOC_ENCODING),
+                    true
+            });
+        }
+
+        for(byte[] b : positiveBytes) {
+            cases.add(new Object[] {b, true});
+        }
+
+        for(String s : negativeStrings) {
+            cases.add(new Object[] {
+                    PdfEncodings.convertToBytes(s, PdfEncodings.PDF_DOC_ENCODING),
+                    false
+            });
+        }
+
+        for(byte[] b : negativeBytes) {
+            cases.add(new Object[] {b, false});
+        }
+
+        return cases;
+    }
+
+    private final byte[] encoded;
+    private final boolean textExpected;
+
+    public PdfDocEncodingHeuristicTest(byte[] encoded, boolean textExpected) {
+        this.encoded = encoded;
+        this.textExpected = textExpected;
+    }
+
+    @Test
+    public void testPdfDocTextHeuristic() {
+        boolean result = ContentStreamHandlingUtils.isMaybePdfDocEncodedText(this.encoded);
+
+        String asPdfDoc = PdfEncodings.convertToString(this.encoded, PdfEncodings.PDF_DOC_ENCODING);
+        Assert.assertEquals(asPdfDoc, this.textExpected, result); // the decoded string doubles as the failure message
+    }
+
+}