Skip to content

Commit a568a26

Browse files
Improve TagStructureReaderTool
Now, instead of just MCIDs, it writes the actual tag content, parsed from the page, into the XML.
1 parent dc82f37 commit a568a26

File tree

5 files changed

+4734
-6474
lines changed

5 files changed

+4734
-6474
lines changed

kernel/src/main/java/com/itextpdf/kernel/parser/GlyphTextEventListener.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
/**
44
* This class expands each {@link TextRenderInfo} for {@link EventType#RENDER_TEXT} event types into
55
* multiple {@link TextRenderInfo} instances, one for each glyph that occurred.
6-
* The only difference from {@link FilteredEventListener} is that this class conveniently implements
6+
* The only difference from {@link GlyphEventListener} is that this class conveniently implements
77
* {@link TextExtractionStrategy} and can therefore be used as a strategy on its own.
88
*/
99
public class GlyphTextEventListener extends GlyphEventListener implements TextExtractionStrategy {

kernel/src/main/java/com/itextpdf/kernel/pdf/tagging/PdfObjRef.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import com.itextpdf.kernel.pdf.PdfDictionary;
44
import com.itextpdf.kernel.pdf.PdfName;
5+
import com.itextpdf.kernel.pdf.PdfObject;
56
import com.itextpdf.kernel.pdf.annot.PdfAnnotation;
67

78
public class PdfObjRef extends PdfMcr<PdfDictionary> {
@@ -33,4 +34,8 @@ public PdfDictionary getPageObject() {
3334
return page;
3435
}
3536

37+
public PdfObject getReferencedObject() {
38+
return ((PdfDictionary) getPdfObject()).get(PdfName.Obj);
39+
}
40+
3641
}

kernel/src/main/java/com/itextpdf/kernel/utils/CompareTool.java

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1070,10 +1070,6 @@ public int compare(File f1, File f2) {
10701070
}
10711071
}
10721072

1073-
//TODO: these helper classes for "compareTagStructures" method should be implemented after implementing of TaggedPdfReaderTool
1074-
//private class CmpTaggedPdfReaderTool extends TaggedPdfReaderTool { ... }
1075-
//private class CmpMarkedContentRenderFilter implements RenderListener { ... }
1076-
10771073
private class CompareResult {
10781074
// LinkedHashMap to retain order. HashMap has different order in Java6/7 and Java8
10791075
protected Map<ObjectPath, String> differences = new LinkedHashMap<>();

kernel/src/main/java/com/itextpdf/kernel/utils/TaggedPdfReaderTool.java

Lines changed: 128 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,35 +1,48 @@
11
package com.itextpdf.kernel.utils;
22

33
import com.itextpdf.kernel.PdfException;
4+
import com.itextpdf.kernel.parser.EventData;
5+
import com.itextpdf.kernel.parser.EventListener;
6+
import com.itextpdf.kernel.parser.EventType;
7+
import com.itextpdf.kernel.parser.LocationTextExtractionStrategy;
8+
import com.itextpdf.kernel.parser.PdfCanvasProcessor;
9+
import com.itextpdf.kernel.parser.SimpleTextExtractionStrategy;
10+
import com.itextpdf.kernel.parser.TextExtractionStrategy;
11+
import com.itextpdf.kernel.parser.TextRenderInfo;
412
import com.itextpdf.kernel.pdf.PdfArray;
513
import com.itextpdf.kernel.pdf.PdfDictionary;
614
import com.itextpdf.kernel.pdf.PdfDocument;
715
import com.itextpdf.kernel.pdf.PdfName;
816
import com.itextpdf.kernel.pdf.PdfObject;
17+
import com.itextpdf.kernel.pdf.PdfPage;
918
import com.itextpdf.kernel.pdf.PdfString;
1019
import com.itextpdf.kernel.pdf.tagging.IPdfStructElem;
1120
import com.itextpdf.kernel.pdf.tagging.PdfMcr;
21+
import com.itextpdf.kernel.pdf.tagging.PdfObjRef;
1222
import com.itextpdf.kernel.pdf.tagging.PdfStructElem;
1323
import com.itextpdf.kernel.pdf.tagging.PdfStructTreeRoot;
1424

1525
import java.io.IOException;
1626
import java.io.OutputStream;
1727
import java.io.OutputStreamWriter;
1828
import java.io.PrintWriter;
29+
import java.util.HashMap;
1930
import java.util.List;
2031
import java.util.Map;
32+
import java.util.Set;
2133

2234
/**
2335
* Converts a tagged PDF document into an XML file.
24-
* TODO Currently resultant xml file contains only pdf structure. See #parseTag method
25-
* TODO Also, this class should be moved to some other package in future
2636
*/
2737
public class TaggedPdfReaderTool {
2838

2939
protected PdfDocument document;
3040
protected PrintWriter out;
3141
protected String rootTag;
3242

43+
// key - page dictionary; value - map from MCID to the text extracted for that MCID on the page
44+
protected Map<PdfDictionary, Map<Integer, String> > parsedTags = new HashMap<>();
45+
3346
/**
 * Creates a reader tool bound to the given document.
 *
 * @param document the document stored in the {@code document} field of this tool
 */
public TaggedPdfReaderTool(PdfDocument document) {
3447
this.document = document;
3548
}
@@ -109,8 +122,6 @@ protected void inspectAttributes(PdfStructElem kid) {
109122
PdfObject attrObj = kid.getAttributes(false);
110123

111124
if (attrObj != null) {
112-
//TODO may be improve attributes handling:
113-
//there may be several attributes objects, and each of them may be followed by a number, which specifies revision
114125
PdfDictionary attrDict;
115126
if (attrObj instanceof PdfArray) {
116127
attrDict = ((PdfArray) attrObj).getAsDictionary(0);
@@ -130,18 +141,32 @@ protected void inspectAttributes(PdfStructElem kid) {
130141

131142
protected void parseTag(PdfMcr kid) {
132143
Integer mcid = kid.getMcid();
133-
PdfDictionary page = kid.getPageObject();
144+
PdfDictionary pageDic = kid.getPageObject();
134145

146+
String tagContent = "";
135147
if (mcid != null) {
136-
//TODO extract content of the tag, when some analog of PdfContentStreamProcessor is implemented
148+
if (!parsedTags.containsKey(pageDic)) {
149+
MarkedContentEventListener listener = new MarkedContentEventListener();
150+
151+
PdfCanvasProcessor processor = new PdfCanvasProcessor(listener);
152+
PdfPage page = document.getCatalog().getPage(pageDic);
153+
processor.processContent(page.getContentBytes(), page.getResources());
154+
155+
parsedTags.put(pageDic, listener.getMcidContent());
156+
}
137157

138-
//TODO also suggest implementing some caching logic, that will parse content stream only once:
139-
// it will extract all page's tags content at same time, saving all this contents in some map
140-
// see CmpTaggedPdfReaderTool in itext5
158+
if (parsedTags.get(pageDic).containsKey(mcid))
159+
tagContent = parsedTags.get(pageDic).get(mcid);
141160

142-
//temporary logic
143-
out.println(String.format("Page %s MCID %s", page.getIndirectReference().toString(), mcid.toString()));
161+
} else {
162+
PdfObjRef objRef = (PdfObjRef) kid;
163+
PdfObject object = objRef.getReferencedObject();
164+
if (object.isDictionary()) {
165+
PdfName subtype = ((PdfDictionary) object).getAsName(PdfName.Subtype);
166+
tagContent = subtype.toString();
167+
}
144168
}
169+
out.print(escapeXML(tagContent, true));
145170
}
146171

147172
protected static String fixTagName(String tag) {
@@ -184,4 +209,96 @@ protected static String fixTagName(String tag) {
184209
}
185210
return sb.toString();
186211
}
212+
213+
/**
214+
* NOTE: copied from itext5 XMLUtils class
215+
*
216+
* Escapes a string with the appropriated XML codes.
217+
* @param s the string to be escaped
218+
* @param onlyASCII codes above 127 will always be escaped with &amp;#nn; if <CODE>true</CODE>
219+
* @return the escaped string
220+
* @since 5.0.6
221+
*/
222+
protected static String escapeXML(final String s, final boolean onlyASCII) {
223+
char cc[] = s.toCharArray();
224+
int len = cc.length;
225+
StringBuffer sb = new StringBuffer();
226+
for (int k = 0; k < len; ++k) {
227+
int c = cc[k];
228+
switch (c) {
229+
case '<':
230+
sb.append("&lt;");
231+
break;
232+
case '>':
233+
sb.append("&gt;");
234+
break;
235+
case '&':
236+
sb.append("&amp;");
237+
break;
238+
case '"':
239+
sb.append("&quot;");
240+
break;
241+
case '\'':
242+
sb.append("&apos;");
243+
break;
244+
default:
245+
if (isValidCharacterValue(c)) {
246+
if (onlyASCII && c > 127)
247+
sb.append("&#").append(c).append(';');
248+
else
249+
sb.append((char)c);
250+
}
251+
}
252+
}
253+
return sb.toString();
254+
}
255+
256+
/**
257+
* Checks if a character value should be escaped/unescaped.
258+
* @param c a character value
259+
* @return true if it's OK to escape or unescape this value
260+
*/
261+
public static boolean isValidCharacterValue(int c) {
262+
return (c == 0x9 || c == 0xA || c == 0xD
263+
|| c >= 0x20 && c <= 0xD7FF
264+
|| c >= 0xE000 && c <= 0xFFFD
265+
|| c >= 0x10000 && c <= 0x10FFFF);
266+
}
267+
268+
private class MarkedContentEventListener implements EventListener {
269+
private Map<Integer, TextExtractionStrategy> contentByMcid = new HashMap<>();
270+
271+
public Map<Integer, String> getMcidContent() {
272+
Map<Integer, String> content = new HashMap<>();
273+
for (int id : contentByMcid.keySet()) {
274+
content.put(id, contentByMcid.get(id).getResultantText());
275+
}
276+
return content;
277+
}
278+
279+
@Override
280+
public void eventOccurred(EventData data, EventType type) {
281+
switch (type) {
282+
case RENDER_TEXT:
283+
TextRenderInfo textInfo = (TextRenderInfo) data;
284+
Integer mcid = textInfo.getMcid();
285+
if (mcid != null) {
286+
TextExtractionStrategy textExtractionStrategy = contentByMcid.get(mcid);
287+
if (textExtractionStrategy == null) {
288+
textExtractionStrategy = new LocationTextExtractionStrategy();
289+
contentByMcid.put(mcid, textExtractionStrategy);
290+
}
291+
textExtractionStrategy.eventOccurred(data, type);
292+
}
293+
break;
294+
default:
295+
break;
296+
}
297+
}
298+
299+
@Override
300+
public Set<EventType> getSupportedEvents() {
301+
return null;
302+
}
303+
}
187304
}

0 commit comments

Comments
 (0)