1
1
package com .itextpdf .kernel .utils ;
2
2
3
3
import com .itextpdf .kernel .PdfException ;
4
+ import com .itextpdf .kernel .parser .EventData ;
5
+ import com .itextpdf .kernel .parser .EventListener ;
6
+ import com .itextpdf .kernel .parser .EventType ;
7
+ import com .itextpdf .kernel .parser .LocationTextExtractionStrategy ;
8
+ import com .itextpdf .kernel .parser .PdfCanvasProcessor ;
9
+ import com .itextpdf .kernel .parser .SimpleTextExtractionStrategy ;
10
+ import com .itextpdf .kernel .parser .TextExtractionStrategy ;
11
+ import com .itextpdf .kernel .parser .TextRenderInfo ;
4
12
import com .itextpdf .kernel .pdf .PdfArray ;
5
13
import com .itextpdf .kernel .pdf .PdfDictionary ;
6
14
import com .itextpdf .kernel .pdf .PdfDocument ;
7
15
import com .itextpdf .kernel .pdf .PdfName ;
8
16
import com .itextpdf .kernel .pdf .PdfObject ;
17
+ import com .itextpdf .kernel .pdf .PdfPage ;
9
18
import com .itextpdf .kernel .pdf .PdfString ;
10
19
import com .itextpdf .kernel .pdf .tagging .IPdfStructElem ;
11
20
import com .itextpdf .kernel .pdf .tagging .PdfMcr ;
21
+ import com .itextpdf .kernel .pdf .tagging .PdfObjRef ;
12
22
import com .itextpdf .kernel .pdf .tagging .PdfStructElem ;
13
23
import com .itextpdf .kernel .pdf .tagging .PdfStructTreeRoot ;
14
24
15
25
import java .io .IOException ;
16
26
import java .io .OutputStream ;
17
27
import java .io .OutputStreamWriter ;
18
28
import java .io .PrintWriter ;
29
+ import java .util .HashMap ;
19
30
import java .util .List ;
20
31
import java .util .Map ;
32
+ import java .util .Set ;
21
33
22
34
/**
23
35
* Converts a tagged PDF document into an XML file.
24
- * TODO Currently resultant xml file contains only pdf structure. See #parseTag method
25
- * TODO Also, this class should be moved to some other package in future
26
36
*/
27
37
public class TaggedPdfReaderTool {
28
38
29
39
protected PdfDocument document ;
30
40
protected PrintWriter out ;
31
41
protected String rootTag ;
32
42
43
+ // key: page dictionary; value: map from each MCID on that page to the text extracted for it
44
+ protected Map <PdfDictionary , Map <Integer , String > > parsedTags = new HashMap <>();
45
+
33
46
/**
 * Creates a reader tool that operates on the given document.
 *
 * @param document the PDF document to work on; the reference is stored as-is
 */
public TaggedPdfReaderTool(PdfDocument document) {
    this.document = document;
}
@@ -109,8 +122,6 @@ protected void inspectAttributes(PdfStructElem kid) {
109
122
PdfObject attrObj = kid .getAttributes (false );
110
123
111
124
if (attrObj != null ) {
112
- //TODO may be improve attributes handling:
113
- //there may be several attributes objects, and each of them may be followed by a number, which specifies revision
114
125
PdfDictionary attrDict ;
115
126
if (attrObj instanceof PdfArray ) {
116
127
attrDict = ((PdfArray ) attrObj ).getAsDictionary (0 );
@@ -130,18 +141,32 @@ protected void inspectAttributes(PdfStructElem kid) {
130
141
131
142
protected void parseTag (PdfMcr kid ) {
132
143
Integer mcid = kid .getMcid ();
133
- PdfDictionary page = kid .getPageObject ();
144
+ PdfDictionary pageDic = kid .getPageObject ();
134
145
146
+ String tagContent = "" ;
135
147
if (mcid != null ) {
136
- //TODO extract content of the tag, when some analog of PdfContentStreamProcessor is implemented
148
+ if (!parsedTags .containsKey (pageDic )) {
149
+ MarkedContentEventListener listener = new MarkedContentEventListener ();
150
+
151
+ PdfCanvasProcessor processor = new PdfCanvasProcessor (listener );
152
+ PdfPage page = document .getCatalog ().getPage (pageDic );
153
+ processor .processContent (page .getContentBytes (), page .getResources ());
154
+
155
+ parsedTags .put (pageDic , listener .getMcidContent ());
156
+ }
137
157
138
- //TODO also suggest implementing some caching logic, that will parse content stream only once:
139
- // it will extract all page's tags content at same time, saving all this contents in some map
140
- // see CmpTaggedPdfReaderTool in itext5
158
+ if (parsedTags .get (pageDic ).containsKey (mcid ))
159
+ tagContent = parsedTags .get (pageDic ).get (mcid );
141
160
142
- //temporary logic
143
- out .println (String .format ("Page %s MCID %s" , page .getIndirectReference ().toString (), mcid .toString ()));
161
+ } else {
162
+ PdfObjRef objRef = (PdfObjRef ) kid ;
163
+ PdfObject object = objRef .getReferencedObject ();
164
+ if (object .isDictionary ()) {
165
+ PdfName subtype = ((PdfDictionary ) object ).getAsName (PdfName .Subtype );
166
+ tagContent = subtype .toString ();
167
+ }
144
168
}
169
+ out .print (escapeXML (tagContent , true ));
145
170
}
146
171
147
172
protected static String fixTagName (String tag ) {
@@ -184,4 +209,96 @@ protected static String fixTagName(String tag) {
184
209
}
185
210
return sb .toString ();
186
211
}
212
+
213
+ /**
214
+ * NOTE: copied from itext5 XMLUtils class
215
+ *
216
+ * Escapes a string with the appropriated XML codes.
217
+ * @param s the string to be escaped
218
+ * @param onlyASCII codes above 127 will always be escaped with &#nn; if <CODE>true</CODE>
219
+ * @return the escaped string
220
+ * @since 5.0.6
221
+ */
222
+ protected static String escapeXML (final String s , final boolean onlyASCII ) {
223
+ char cc [] = s .toCharArray ();
224
+ int len = cc .length ;
225
+ StringBuffer sb = new StringBuffer ();
226
+ for (int k = 0 ; k < len ; ++k ) {
227
+ int c = cc [k ];
228
+ switch (c ) {
229
+ case '<' :
230
+ sb .append ("<" );
231
+ break ;
232
+ case '>' :
233
+ sb .append (">" );
234
+ break ;
235
+ case '&' :
236
+ sb .append ("&" );
237
+ break ;
238
+ case '"' :
239
+ sb .append (""" );
240
+ break ;
241
+ case '\'' :
242
+ sb .append ("'" );
243
+ break ;
244
+ default :
245
+ if (isValidCharacterValue (c )) {
246
+ if (onlyASCII && c > 127 )
247
+ sb .append ("&#" ).append (c ).append (';' );
248
+ else
249
+ sb .append ((char )c );
250
+ }
251
+ }
252
+ }
253
+ return sb .toString ();
254
+ }
255
+
256
+ /**
257
+ * Checks if a character value should be escaped/unescaped.
258
+ * @param c a character value
259
+ * @return true if it's OK to escape or unescape this value
260
+ */
261
+ public static boolean isValidCharacterValue (int c ) {
262
+ return (c == 0x9 || c == 0xA || c == 0xD
263
+ || c >= 0x20 && c <= 0xD7FF
264
+ || c >= 0xE000 && c <= 0xFFFD
265
+ || c >= 0x10000 && c <= 0x10FFFF );
266
+ }
267
+
268
+ private class MarkedContentEventListener implements EventListener {
269
+ private Map <Integer , TextExtractionStrategy > contentByMcid = new HashMap <>();
270
+
271
+ public Map <Integer , String > getMcidContent () {
272
+ Map <Integer , String > content = new HashMap <>();
273
+ for (int id : contentByMcid .keySet ()) {
274
+ content .put (id , contentByMcid .get (id ).getResultantText ());
275
+ }
276
+ return content ;
277
+ }
278
+
279
+ @ Override
280
+ public void eventOccurred (EventData data , EventType type ) {
281
+ switch (type ) {
282
+ case RENDER_TEXT :
283
+ TextRenderInfo textInfo = (TextRenderInfo ) data ;
284
+ Integer mcid = textInfo .getMcid ();
285
+ if (mcid != null ) {
286
+ TextExtractionStrategy textExtractionStrategy = contentByMcid .get (mcid );
287
+ if (textExtractionStrategy == null ) {
288
+ textExtractionStrategy = new LocationTextExtractionStrategy ();
289
+ contentByMcid .put (mcid , textExtractionStrategy );
290
+ }
291
+ textExtractionStrategy .eventOccurred (data , type );
292
+ }
293
+ break ;
294
+ default :
295
+ break ;
296
+ }
297
+ }
298
+
299
+ @ Override
300
+ public Set <EventType > getSupportedEvents () {
301
+ return null ;
302
+ }
303
+ }
187
304
}
0 commit comments