Skip to content

Commit 0f255bf

Browse files
author
joris.schellekens
committed
move functionality from pdfAutoSweep to kernel. RegexBasedLocationExtractionStrategy offers users the possibility of defining a regular expression and having all locations where it occurs returned to them
1 parent cf2ff95 commit 0f255bf

File tree

9 files changed

+435
-0
lines changed

9 files changed

+435
-0
lines changed
Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
package com.itextpdf.kernel.pdf.canvas.parser.listener;
2+
3+
import com.itextpdf.kernel.geom.LineSegment;
4+
import com.itextpdf.kernel.geom.Rectangle;
5+
import com.itextpdf.kernel.pdf.canvas.parser.data.TextRenderInfo;
6+
7+
import java.util.HashMap;
8+
import java.util.List;
9+
import java.util.Map;
10+
11+
/**
12+
* This class represents a single character and its bounding box
13+
*/
14+
public class CharacterRenderInfo extends LocationTextExtractionStrategy.TextChunk {
15+
16+
private Rectangle boundingBox;
17+
18+
/**
19+
* This method converts a List<CharacterRenderInfo>
20+
* The datastructure that gets returned contains both the plaintext,
21+
* as well as the mapping of indices (from the list to the string).
22+
* These indices can differ; if there is sufficient spacing between two CharacterRenderInfo
23+
* objects, this algorithm will decide to insert space. The inserted space will cause
24+
* the indices to differ by at least 1.
25+
*
26+
* @param cris
27+
* @return
28+
*/
29+
static StringConversionInfo mapString(List<CharacterRenderInfo> cris) {
30+
Map<Integer, Integer> indexMap = new HashMap<>();
31+
StringBuilder sb = new StringBuilder();
32+
CharacterRenderInfo lastChunk = null;
33+
for (int i = 0; i < cris.size(); i++) {
34+
CharacterRenderInfo chunk = cris.get(i);
35+
if (lastChunk == null) {
36+
indexMap.put(sb.length(), i);
37+
sb.append(chunk.getText());
38+
} else {
39+
if (chunk.sameLine(lastChunk)) {
40+
// we only insert a blank space if the trailing character of the previous string wasn't a space, and the leading character of the current string isn't a space
41+
if (chunk.getLocation().isAtWordBoundary(lastChunk.getLocation()) && !chunk.getText().startsWith(" ") && !chunk.getText().endsWith(" ")) {
42+
sb.append(' ');
43+
}
44+
indexMap.put(sb.length(), i);
45+
sb.append(chunk.getText());
46+
} else {
47+
indexMap.put(sb.length(), i);
48+
sb.append(chunk.getText());
49+
}
50+
}
51+
lastChunk = chunk;
52+
}
53+
CharacterRenderInfo.StringConversionInfo ret = new StringConversionInfo();
54+
ret.indexMap = indexMap;
55+
ret.text = sb.toString();
56+
return ret;
57+
}
58+
59+
public CharacterRenderInfo(TextRenderInfo tri) {
60+
super(tri == null ? "" : tri.getText(), tri == null ? null : getLocation(tri));
61+
if (tri == null)
62+
throw new IllegalArgumentException("TextRenderInfo argument is not nullable.");
63+
if (tri.getText().length() != 1)
64+
throw new IllegalArgumentException("CharacterRenderInfo objects represent a single character. They should not be made from TextRenderInfo objects containing more than a single character of text.");
65+
66+
// determine bounding box
67+
float x0 = tri.getDescentLine().getStartPoint().get(0);
68+
float y0 = tri.getDescentLine().getStartPoint().get(1);
69+
float h = tri.getAscentLine().getStartPoint().get(1) - tri.getDescentLine().getStartPoint().get(1);
70+
float w = Math.abs(tri.getBaseline().getStartPoint().get(0) - tri.getBaseline().getEndPoint().get(0));
71+
this.boundingBox = new Rectangle(x0, y0, w, h);
72+
}
73+
74+
public Rectangle getBoundingBox() {
75+
return boundingBox;
76+
}
77+
78+
private static LocationTextExtractionStrategy.ITextChunkLocation getLocation(TextRenderInfo tri) {
79+
LineSegment baseline = tri.getBaseline();
80+
return new LocationTextExtractionStrategy.TextChunkLocationDefaultImp(baseline.getStartPoint(),
81+
baseline.getEndPoint(),
82+
tri.getSingleSpaceWidth());
83+
}
84+
85+
static class StringConversionInfo {
86+
Map<Integer, Integer> indexMap;
87+
String text;
88+
}
89+
}
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
package com.itextpdf.kernel.pdf.canvas.parser.listener;
2+
3+
import com.itextpdf.kernel.geom.Rectangle;
4+
5+
/**
6+
* This class acts as a default implementation of IPdfTextLocation
7+
*/
8+
public class DefaultPdfTextLocation implements IPdfTextLocation {
9+
10+
private int pageNr;
11+
private Rectangle rectangle;
12+
private String text;
13+
14+
public DefaultPdfTextLocation(int pageNr, Rectangle rect, String text) {
15+
this.pageNr = pageNr;
16+
this.rectangle = rect;
17+
this.text = text;
18+
}
19+
20+
@Override
21+
public Rectangle getRectangle() {
22+
return rectangle;
23+
}
24+
25+
public DefaultPdfTextLocation setRectangle(Rectangle rectangle) {
26+
this.rectangle = rectangle;
27+
return this;
28+
}
29+
30+
@Override
31+
public String getText() {
32+
return text;
33+
}
34+
35+
public DefaultPdfTextLocation setText(String text) {
36+
this.text = text;
37+
return this;
38+
}
39+
40+
@Override
41+
public int getPageNumber() {
42+
return pageNr;
43+
}
44+
45+
public DefaultPdfTextLocation setPageNr(int pageNr) {
46+
this.pageNr = pageNr;
47+
return this;
48+
}
49+
}
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
package com.itextpdf.kernel.pdf.canvas.parser.listener;
2+
3+
import com.itextpdf.kernel.geom.Rectangle;
4+
import com.itextpdf.kernel.pdf.canvas.parser.filter.IEventFilter;
5+
6+
import java.util.Collection;
7+
8+
/**
9+
* This is a special interface for {@link IEventFilter} that returns a collection of rectangles as result of its work.
10+
*/
11+
public interface ILocationExtractionStrategy extends IEventListener {
12+
13+
/**
14+
* Returns the rectangles that have been processed so far.
15+
*
16+
* @return {@link Collection<IPdfTextLocation>} instance with the current resultant IPdfTextLocations
17+
*/
18+
Collection<IPdfTextLocation> getResultantLocations();
19+
20+
}
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
package com.itextpdf.kernel.pdf.canvas.parser.listener;
2+
3+
import com.itextpdf.kernel.geom.Rectangle;
4+
5+
/**
6+
* Instances of this interface represent a piece of text,
7+
* somewhere on a page in a pdf document.
8+
*/
9+
public interface IPdfTextLocation {
10+
11+
/**
12+
* Get the visual rectangle in which the text is located
13+
*
14+
* @return
15+
*/
16+
Rectangle getRectangle();
17+
18+
/**
19+
* Get the text
20+
*
21+
* @return
22+
*/
23+
String getText();
24+
25+
/**
26+
* Get the page number of the page on which the text is located
27+
*
28+
* @return the page number, or 0 if no page number was set
29+
*/
30+
int getPageNumber();
31+
32+
}

kernel/src/main/java/com/itextpdf/kernel/pdf/canvas/parser/listener/ITextExtractionStrategy.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ public interface ITextExtractionStrategy extends IEventListener {
5252

5353
/**
5454
* Returns the text that has been processed so far.
55+
*
5556
* @return {@link String} instance with the current resultant text
5657
*/
5758
String getResultantText();
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
package com.itextpdf.kernel.pdf.canvas.parser.listener;
2+
3+
import com.itextpdf.kernel.geom.Rectangle;
4+
import com.itextpdf.kernel.pdf.canvas.parser.EventType;
5+
import com.itextpdf.kernel.pdf.canvas.parser.data.IEventData;
6+
import com.itextpdf.kernel.pdf.canvas.parser.data.TextRenderInfo;
7+
8+
import javax.xml.soap.Text;
9+
import java.util.*;
10+
import java.util.regex.Matcher;
11+
import java.util.regex.Pattern;
12+
13+
/**
14+
* This class is designed to search for the occurrences of a regular expression and return the resultant rectangles.
15+
*/
16+
public class RegexBasedLocationExtractionStrategy implements ILocationExtractionStrategy {
17+
18+
private Pattern pattern;
19+
private List<CharacterRenderInfo> parseResult = new ArrayList<>();
20+
21+
public RegexBasedLocationExtractionStrategy(String regex) {
22+
this.pattern = Pattern.compile(regex);
23+
}
24+
25+
public RegexBasedLocationExtractionStrategy(Pattern pattern) {
26+
this.pattern = pattern;
27+
}
28+
29+
@Override
30+
public Collection<IPdfTextLocation> getResultantLocations() {
31+
32+
// align characters in "logical" order
33+
Collections.sort(parseResult);
34+
35+
// process parse results
36+
List<IPdfTextLocation> retval = new ArrayList<>();
37+
38+
CharacterRenderInfo.StringConversionInfo txt = CharacterRenderInfo.mapString(parseResult);
39+
40+
Matcher mat = pattern.matcher(txt.text);
41+
while (mat.find()) {
42+
int startIndex = txt.indexMap.get(mat.start());
43+
int endIndex = txt.indexMap.get(mat.end());
44+
for (Rectangle r : toRectangles(parseResult.subList(startIndex, endIndex))) {
45+
retval.add(new DefaultPdfTextLocation(0, r, mat.group(0)));
46+
}
47+
}
48+
49+
/* sort
50+
* even though the return type is Collection<Rectangle>, we apply a sorting algorithm here
51+
* This is to ensure that tests that use this functionality (for instance to generate pdf with
52+
* areas of interest highlighted) will not break when compared.
53+
*/
54+
java.util.Collections.sort(retval, new Comparator<IPdfTextLocation>() {
55+
@Override
56+
public int compare(IPdfTextLocation l1, IPdfTextLocation l2) {
57+
Rectangle o1 = l1.getRectangle();
58+
Rectangle o2 = l2.getRectangle();
59+
if (o1.getY() == o2.getY()) {
60+
return o1.getX() == o2.getX() ? 0 : (o1.getX() < o2.getX() ? -1 : 1);
61+
} else {
62+
return o1.getY() < o2.getY() ? -1 : 1;
63+
}
64+
}
65+
});
66+
67+
return retval;
68+
}
69+
70+
@Override
71+
public void eventOccurred(IEventData data, EventType type) {
72+
if (data instanceof TextRenderInfo) {
73+
parseResult.addAll(toCRI((TextRenderInfo) data));
74+
}
75+
}
76+
77+
@Override
78+
public Set<EventType> getSupportedEvents() {
79+
return null;
80+
}
81+
82+
/**
83+
* Convert {@code TextRenderInfo} to {@code CharacterRenderInfo}
84+
* This method is public and not final so that custom implementations can choose to override it.
85+
* Other implementations of {@code CharacterRenderInfo} may choose to store different properties than
86+
* merely the {@code Rectangle} describing the bounding box. E.g. a custom implementation might choose to
87+
* store {@code Color} information as well, to better match the content surrounding the redaction {@code Rectangle}.
88+
*
89+
* @param tri
90+
* @return
91+
*/
92+
protected List<CharacterRenderInfo> toCRI(TextRenderInfo tri) {
93+
List<CharacterRenderInfo> cris = new ArrayList<>();
94+
for (TextRenderInfo subTri : tri.getCharacterRenderInfos()) {
95+
cris.add(new CharacterRenderInfo(subTri));
96+
}
97+
return cris;
98+
}
99+
100+
/**
101+
* Converts {@code CharacterRenderInfo} objects to {@code Rectangles}
102+
* This method is protected and not final so that custom implementations can choose to override it.
103+
* E.g. other implementations may choose to add padding/margin to the Rectangles.
104+
* This method also offers a convenient access point to the mapping of {@code CharacterRenderInfo} to {@code Rectangle}.
105+
* This mapping enables (custom implementations) to match color of text in redacted Rectangles,
106+
* or match color of background, by the mere virtue of offering access to the {@code CharacterRenderInfo} objects
107+
* that generated the {@code Rectangle}.
108+
*
109+
* @param cris
110+
* @return
111+
*/
112+
protected List<Rectangle> toRectangles(List<CharacterRenderInfo> cris) {
113+
List<Rectangle> retval = new ArrayList<>();
114+
if (cris.isEmpty())
115+
return retval;
116+
117+
int prev = 0;
118+
int curr = 0;
119+
while (curr < cris.size()) {
120+
while (curr < cris.size() && cris.get(curr).sameLine(cris.get(prev))) {
121+
curr++;
122+
}
123+
float x = cris.get(prev).getBoundingBox().getX();
124+
float y = cris.get(prev).getBoundingBox().getY();
125+
float w = cris.get(curr - 1).getBoundingBox().getX() - cris.get(prev).getBoundingBox().getX() + cris.get(curr - 1).getBoundingBox().getWidth();
126+
float h = 0f;
127+
for (CharacterRenderInfo cri : cris.subList(prev, curr)) {
128+
h = Math.max(h, cri.getBoundingBox().getHeight());
129+
}
130+
retval.add(new Rectangle(x, y, w, h));
131+
prev = curr;
132+
}
133+
134+
// return
135+
return retval;
136+
}
137+
138+
}

0 commit comments

Comments
 (0)