Skip to content

Commit 825200c

Browse files
author
Alan Goo
committed
Fix PdfSweep failure on documents with ligatures
DEVSIX-1940
1 parent 540ec89 commit 825200c

File tree

4 files changed

+139
-10
lines changed

4 files changed

+139
-10
lines changed

kernel/src/main/java/com/itextpdf/kernel/pdf/canvas/parser/listener/CharacterRenderInfo.java

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -72,19 +72,16 @@ static StringConversionInfo mapString(List<CharacterRenderInfo> cris) {
7272
for (int i = 0; i < cris.size(); i++) {
7373
CharacterRenderInfo chunk = cris.get(i);
7474
if (lastChunk == null) {
75-
indexMap.put(sb.length(), i);
76-
sb.append(chunk.getText());
75+
putCharsWithIndex(chunk.getText(), i, indexMap, sb);
7776
} else {
7877
if (chunk.sameLine(lastChunk)) {
7978
// we only insert a blank space if the trailing character of the previous string wasn't a space, and the leading character of the current string isn't a space
8079
if (chunk.getLocation().isAtWordBoundary(lastChunk.getLocation()) && !chunk.getText().startsWith(" ") && !chunk.getText().endsWith(" ")) {
8180
sb.append(' ');
8281
}
83-
indexMap.put(sb.length(), i);
84-
sb.append(chunk.getText());
82+
putCharsWithIndex(chunk.getText(), i, indexMap, sb);
8583
} else {
86-
indexMap.put(sb.length(), i);
87-
sb.append(chunk.getText());
84+
putCharsWithIndex(chunk.getText(), i, indexMap, sb);
8885
}
8986
}
9087
lastChunk = chunk;
@@ -95,12 +92,18 @@ static StringConversionInfo mapString(List<CharacterRenderInfo> cris) {
9592
return ret;
9693
}
9794

95+
private static void putCharsWithIndex(final CharSequence seq, int index, final Map<Integer, Integer> indexMap, StringBuilder sb) {
96+
int charCount = seq.length();
97+
for (int i = 0; i < charCount; i++) {
98+
indexMap.put(sb.length(), index);
99+
sb.append(seq.charAt(i));
100+
}
101+
}
102+
98103
public CharacterRenderInfo(TextRenderInfo tri) {
99104
super(tri == null ? "" : tri.getText(), tri == null ? null : getLocation(tri));
100105
if (tri == null)
101106
throw new IllegalArgumentException("TextRenderInfo argument is not nullable.");
102-
if (tri.getText().length() != 1)
103-
throw new IllegalArgumentException("CharacterRenderInfo objects represent a single character. They should not be made from TextRenderInfo objects containing more than a single character of text.");
104107

105108
// determine bounding box
106109
float x0 = tri.getDescentLine().getStartPoint().get(0);

kernel/src/main/java/com/itextpdf/kernel/pdf/canvas/parser/listener/RegexBasedLocationExtractionStrategy.java

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,9 +110,25 @@ public int compare(IPdfTextLocation l1, IPdfTextLocation l2) {
110110
}
111111
});
112112

113+
// ligatures can produce the same rectangle more than once
114+
removeDuplicates(retval);
115+
113116
return retval;
114117
}
115118

119+
private void removeDuplicates(List<IPdfTextLocation> sortedList) {
120+
IPdfTextLocation lastItem = null;
121+
int orgSize = sortedList.size();
122+
for (int i = orgSize - 1; i >= 0; i--) {
123+
IPdfTextLocation currItem = sortedList.get(i);
124+
Rectangle currRect = currItem.getRectangle();
125+
if (lastItem != null && currRect.equalsWithEpsilon(lastItem.getRectangle())) {
126+
sortedList.remove(currItem);
127+
}
128+
lastItem = currItem;
129+
}
130+
}
131+
116132
@Override
117133
public void eventOccurred(IEventData data, EventType type) {
118134
if (data instanceof TextRenderInfo) {

kernel/src/test/java/com/itextpdf/kernel/pdf/canvas/parser/listener/RegexBasedLocationExtractionStrategyTest.java

Lines changed: 112 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ This file is part of the iText (R) project.
4242
*/
4343
package com.itextpdf.kernel.pdf.canvas.parser.listener;
4444

45+
import com.itextpdf.kernel.geom.Rectangle;
4546
import com.itextpdf.kernel.pdf.PdfDocument;
4647
import com.itextpdf.kernel.pdf.PdfReader;
4748
import com.itextpdf.kernel.pdf.canvas.parser.PdfCanvasProcessor;
@@ -84,11 +85,11 @@ public void test01() throws IOException {
8485
}
8586

8687
// compare
87-
Assert.assertEquals(locationList.size(), 1);
88+
Assert.assertEquals(1, locationList.size());
8889

8990
IPdfTextLocation loc = locationList.get(0);
9091

91-
Assert.assertEquals(loc.getText(), "{{Signature}}");
92+
Assert.assertEquals("{{Signature}}", loc.getText());
9293
Assert.assertEquals(23, (int) loc.getRectangle().getX());
9394
Assert.assertEquals(375, (int) loc.getRectangle().getY());
9495
Assert.assertEquals(55, (int) loc.getRectangle().getWidth());
@@ -97,4 +98,113 @@ public void test01() throws IOException {
9798
// close
9899
pdfDocument.close();
99100
}
101+
102+
103+
// https://jira.itextsupport.com/browse/DEVSIX-1940
104+
// text is 'calligraphy' and 'll' is composing a ligature
105+
106+
@Test
107+
public void testLigatureBeforeLigature() throws IOException {
108+
System.out.println(new File(sourceFolder).getAbsolutePath());
109+
110+
PdfDocument pdfDocument = new PdfDocument(new PdfReader(sourceFolder + "ligature.pdf"));
111+
112+
// build strategy
113+
RegexBasedLocationExtractionStrategy extractionStrategy = new RegexBasedLocationExtractionStrategy("ca");
114+
115+
// get locations
116+
List<IPdfTextLocation> locationList = new ArrayList<>();
117+
for (int x = 1; x <= pdfDocument.getNumberOfPages(); x++) {
118+
new PdfCanvasProcessor(extractionStrategy).processPageContent(pdfDocument.getPage(x));
119+
for(IPdfTextLocation location : extractionStrategy.getResultantLocations()) {
120+
if(location != null) {
121+
locationList.add(location);
122+
}
123+
}
124+
}
125+
126+
// compare
127+
Assert.assertEquals(1, locationList.size());
128+
129+
IPdfTextLocation loc = locationList.get(0);
130+
131+
Assert.assertEquals("ca", loc.getText());
132+
Rectangle rect = loc.getRectangle();
133+
Assert.assertEquals(36, rect.getX(), 0.0001);
134+
Assert.assertEquals(655.4600, rect.getY(), 0.0001);
135+
Assert.assertEquals(25.1000, rect.getWidth(), 0.0001);
136+
Assert.assertEquals(20, rect.getHeight(), 0.0001);
137+
138+
pdfDocument.close();
139+
}
140+
141+
@Test
142+
public void testLigatureCrossLigature() throws IOException {
143+
System.out.println(new File(sourceFolder).getAbsolutePath());
144+
145+
PdfDocument pdfDocument = new PdfDocument(new PdfReader(sourceFolder + "ligature.pdf"));
146+
147+
// build strategy
148+
RegexBasedLocationExtractionStrategy extractionStrategy = new RegexBasedLocationExtractionStrategy("al");
149+
150+
// get locations
151+
List<IPdfTextLocation> locationList = new ArrayList<>();
152+
for (int x = 1; x <= pdfDocument.getNumberOfPages(); x++) {
153+
new PdfCanvasProcessor(extractionStrategy).processPageContent(pdfDocument.getPage(x));
154+
for(IPdfTextLocation location : extractionStrategy.getResultantLocations()) {
155+
if(location != null) {
156+
locationList.add(location);
157+
}
158+
}
159+
}
160+
161+
// compare
162+
Assert.assertEquals(1, locationList.size());
163+
164+
IPdfTextLocation loc = locationList.get(0);
165+
166+
Assert.assertEquals("al", loc.getText());
167+
Rectangle rect = loc.getRectangle();
168+
Assert.assertEquals(48.7600, rect.getX(), 0.0001);
169+
Assert.assertEquals(655.4600, rect.getY(), 0.0001);
170+
Assert.assertEquals(25.9799, rect.getWidth(), 0.0001);
171+
Assert.assertEquals(20, rect.getHeight(), 0.0001);
172+
173+
pdfDocument.close();
174+
}
175+
176+
@Test
177+
public void testLigatureInLigature() throws IOException {
178+
System.out.println(new File(sourceFolder).getAbsolutePath());
179+
180+
PdfDocument pdfDocument = new PdfDocument(new PdfReader(sourceFolder + "ligature.pdf"));
181+
182+
// build strategy
183+
RegexBasedLocationExtractionStrategy extractionStrategy = new RegexBasedLocationExtractionStrategy("l");
184+
185+
// get locations
186+
List<IPdfTextLocation> locationList = new ArrayList<>();
187+
for (int x = 1; x <= pdfDocument.getNumberOfPages(); x++) {
188+
new PdfCanvasProcessor(extractionStrategy).processPageContent(pdfDocument.getPage(x));
189+
for(IPdfTextLocation location : extractionStrategy.getResultantLocations()) {
190+
if(location != null) {
191+
locationList.add(location);
192+
}
193+
}
194+
}
195+
196+
// compare
197+
Assert.assertEquals(1, locationList.size());
198+
199+
IPdfTextLocation loc = locationList.get(0);
200+
201+
Assert.assertEquals("l", loc.getText());
202+
Rectangle rect = loc.getRectangle();
203+
Assert.assertEquals(61.0999, rect.getX(), 0.0001);
204+
Assert.assertEquals(655.4600, rect.getY(), 0.0001);
205+
Assert.assertEquals(13.6399, rect.getWidth(), 0.0001);
206+
Assert.assertEquals(20, rect.getHeight(), 0.0001);
207+
208+
pdfDocument.close();
209+
}
100210
}

0 commit comments

Comments
 (0)