Skip to content

Commit b663bf5

Browse files
Improve string heuristics in content stream pane
- Add more symbols to PERMITTED_SYMBOLS.
- Instead of considering a string to be binary as soon as it contains one "bad" character, use a threshold of 10% of the string length as the cutoff value.
- Exception: strings containing character codes that aren't defined in PDFDocEncoding are always considered binary.

RES-677
1 parent 038aa0d commit b663bf5

File tree

2 files changed

+151
-3
lines changed

2 files changed

+151
-3
lines changed

src/main/java/com/itextpdf/rups/view/itext/contentstream/ContentStreamHandlingUtils.java

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,9 @@ final class ContentStreamHandlingUtils {
6161
* Symbols permitted when guessing whether a string without obvious encoding
6262
* is text or not.
6363
*/
64-
private static final String PERMITTED_SYMBOLS = "<>()\\/?.!{} \n\t";
64+
private static final String PERMITTED_SYMBOLS = "<>()\\/?.!{}[]-_@$€#%&*^+=`~,;:|'\" \n\t";
65+
66+
private static final double PERMITTED_NONTEXT_PROPORTION = 0.1f;
6567

6668
private static final char[] HEX_DIGITS = "0123456789abcdef".toCharArray();
6769

@@ -142,13 +144,19 @@ static boolean isMaybePdfDocEncodedText(byte[] b) {
142144
// No explicit encoding -> make an attempt with PDFDocEncoding
143145
final String asPdfDoc = PdfEncodings.convertToString(b, PdfEncodings.PDF_DOC_ENCODING);
144146
// check whether the result looks like a sensible text string
147+
int unexpectedCharacters = 0;
145148
for (int i = 0; i < asPdfDoc.length(); i++) {
146149
final char c = asPdfDoc.charAt(i);
147-
if (!Character.isLetterOrDigit(c) && PERMITTED_SYMBOLS.indexOf(c) == -1) {
150+
// if the codepoint is undefined in PDFDocEncoding -> immediately assume binary
151+
// See Annex D.3 in ISO 32000-2:2020: all values up to and including ^W (octal 027), excluding tab, LF and CR
152+
if(c != '\n' && c != '\t' && c != '\r' && c <= '\027') {
148153
return false;
149154
}
155+
if(!Character.isLetterOrDigit(c) && PERMITTED_SYMBOLS.indexOf(c) == -1) {
156+
unexpectedCharacters += 1;
157+
}
150158
}
151-
return true;
159+
return unexpectedCharacters <= PERMITTED_NONTEXT_PROPORTION * asPdfDoc.length();
152160
}
153161

154162
private static int hexDigitValue(char c) {
Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
/*
2+
This file is part of the iText (R) project.
3+
Copyright (c) 1998-2022 iText Group NV
4+
Authors: iText Software.
5+
6+
This program is free software; you can redistribute it and/or modify
7+
it under the terms of the GNU Affero General Public License version 3
8+
as published by the Free Software Foundation with the addition of the
9+
following permission added to Section 15 as permitted in Section 7(a):
10+
FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY
11+
ITEXT GROUP. ITEXT GROUP DISCLAIMS THE WARRANTY OF NON INFRINGEMENT
12+
OF THIRD PARTY RIGHTS
13+
14+
This program is distributed in the hope that it will be useful, but
15+
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
16+
or FITNESS FOR A PARTICULAR PURPOSE.
17+
See the GNU Affero General Public License for more details.
18+
You should have received a copy of the GNU Affero General Public License
19+
along with this program; if not, see http://www.gnu.org/licenses or write to
20+
the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
21+
Boston, MA, 02110-1301 USA, or download the license from the following URL:
22+
http://itextpdf.com/terms-of-use/
23+
24+
The interactive user interfaces in modified source and object code versions
25+
of this program must display Appropriate Legal Notices, as required under
26+
Section 5 of the GNU Affero General Public License.
27+
28+
In accordance with Section 7(b) of the GNU Affero General Public License,
29+
a covered work must retain the producer line in every PDF that is created
30+
or manipulated using iText.
31+
32+
You can be released from the requirements of the license by purchasing
33+
a commercial license. Buying such a license is mandatory as soon as you
34+
develop commercial activities involving the iText software without
35+
disclosing the source code of your own applications.
36+
These activities include: offering paid services to customers as an ASP,
37+
serving PDFs on the fly in a web application, shipping iText with a closed
38+
source product.
39+
40+
For more information, please contact iText Software Corp. at this
41+
42+
*/
43+
package com.itextpdf.rups.view.itext.contentstream;
44+
45+
import com.itextpdf.io.font.PdfEncodings;
46+
import com.itextpdf.test.annotations.type.UnitTest;
47+
import org.junit.Assert;
48+
import org.junit.Test;
49+
import org.junit.experimental.categories.Category;
50+
import org.junit.runner.RunWith;
51+
import org.junit.runners.Parameterized;
52+
53+
import java.nio.charset.StandardCharsets;
54+
import java.util.ArrayList;
55+
import java.util.Collection;
56+
57+
/**
 * Parameterized unit test for
 * {@code ContentStreamHandlingUtils.isMaybePdfDocEncodedText(byte[])}.
 * Each test case pairs a raw byte string with the verdict the heuristic
 * is expected to return for it.
 */
@RunWith(Parameterized.class)
58+
@Category(UnitTest.class)
59+
public class PdfDocEncodingHeuristicTest {
60+
/**
 * Builds the list of test cases.
 *
 * @return {@code {byte[] input, boolean expected}} pairs, added in the order:
 *         positive strings, positive bytes, negative strings, negative bytes
 */
@Parameterized.Parameters
61+
public static Collection<Object[]> data() {
62+
Collection<Object[]> cases = new ArrayList<>();
63+
// strings expected to be classified as text (converted to PDFDocEncoding bytes below)
String[] positiveStrings = new String[] {
64+
"abccadslk fjds",
65+
"abccadslk\tfjds",
66+
"abccadslk\nfjds",
67+
"abccadslk\rfjds",
68+
"/+xy1209837a$^!@$#&#*!&dksjfao7210",
69+
"/+xy120921312½",
70+
"en_US", "en-US",
71+
72+
"© iText Software",
73+
"Bär"
74+
};
75+
76+
// raw byte strings expected to be classified as text
byte[][] positiveBytes = new byte[][] {
77+
// ASCII "hello hello"
new byte[] { 0x68, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x68, 0x65, 0x6c, 0x6c, 0x6f },
78+
// a single 0x92 byte among eleven bytes is expected to be tolerated
new byte[] { 0x68, 0x65, 0x6c, 0x6c, 0x6f, (byte) 0x92, 0x68, 0x65, 0x6c, 0x6c, 0x6f }
79+
};
80+
81+
// strings expected to be classified as binary (converted to PDFDocEncoding bytes below)
String[] negativeStrings = new String[] {
82+
"©z®z",
83+
"/+xy2½",
84+
"abccadslk\ffjds", // form feed is whitespace, but undefined in PDFDocEncoding
85+
"Hello\007world" // non-whitespace control character
86+
};
87+
88+
// raw byte strings expected to be classified as binary
byte[][] negativeBytes = new byte[][] {
89+
// UTF-8 rendering of ä doesn't represent a letter in PDFDocEncoding
90+
"Bär".getBytes(StandardCharsets.UTF_8),
91+
// proportion of non-letter bytes too high
92+
new byte[] { 0x68, 0x65, 0x6c, 0x6c, 0x6f, (byte) 0x92},
93+
// no letter bytes at all (only control codes)
94+
new byte[] { 0x01, 0x02, 0x03, 0x04 },
95+
// contains control character that isn't whitespace
96+
new byte[] { 0x68, 0x65, 0x6c, 0x6c, 0x6f, 0x01, 0x68, 0x65, 0x6c, 0x6c, 0x6f }
97+
};
98+
99+
for(String s : positiveStrings) {
100+
cases.add(new Object[] {
101+
PdfEncodings.convertToBytes(s, PdfEncodings.PDF_DOC_ENCODING),
102+
true
103+
});
104+
}
105+
106+
for(byte[] b : positiveBytes) {
107+
cases.add(new Object[] {b, true});
108+
}
109+
110+
for(String s : negativeStrings) {
111+
cases.add(new Object[] {
112+
PdfEncodings.convertToBytes(s, PdfEncodings.PDF_DOC_ENCODING),
113+
false
114+
});
115+
}
116+
117+
for(byte[] b : negativeBytes) {
118+
cases.add(new Object[] {b, false});
119+
}
120+
121+
return cases;
122+
}
123+
124+
// raw bytes handed to the heuristic
private final byte[] encoded;
125+
// verdict the heuristic is expected to return for {@link #encoded}
private final boolean textExpected;
126+
127+
/**
 * Creates one test instance from a row produced by {@link #data()}.
 *
 * @param encoded      the byte string to classify
 * @param textExpected {@code true} if the heuristic should report text
 */
public PdfDocEncodingHeuristicTest(byte[] encoded, boolean textExpected) {
128+
this.encoded = encoded;
129+
this.textExpected = textExpected;
130+
}
131+
132+
/**
 * Runs the heuristic on the input and compares the result with the
 * expected verdict. The PDFDocEncoding-decoded input is used as the
 * assertion message so a failing case can be identified.
 */
@Test
133+
public void testPdfDocTextHeuristic() {
134+
boolean result = ContentStreamHandlingUtils.isMaybePdfDocEncodedText(this.encoded);
135+
136+
String asPdfDoc = PdfEncodings.convertToString(this.encoded, PdfEncodings.PDF_DOC_ENCODING);
137+
Assert.assertEquals(asPdfDoc, this.textExpected, result);
138+
}
139+
140+
}

0 commit comments

Comments
 (0)