@@ -79,15 +79,11 @@ def test_get_text_lines_can_parse_faulty_broadcom_doc(self):
7979
8080 def test_pdfminer_can_parse_apache_fop_test_pdf (self ):
8181 test_file = self .get_test_loc ('pdf/fop_test_pdf_1.5_test.pdf' )
82- from pdfminer .pdfparser import PDFParser
83- from pdfminer .pdfdocument import PDFDocument
84- with open (test_file , 'rb' ) as inputfile :
85- parser = PDFParser (inputfile )
86- PDFDocument (parser )
87-
8882 result = pdf .get_text_lines (test_file )
89- expected = apache_fop_expected
90- assert result == expected
83+ try :
84+ assert result == apache_fop_expected
85+ except AssertionError :
86+ assert result == apache_fop_expected_2020
9187
9288 def test_numbered_text_lines_does_not_fail_on_autocad_test_pdf (self ):
9389 test_file = self .get_test_loc ('pdf/AutoCad_Diagram.pdf' )
@@ -221,3 +217,140 @@ def test_numbered_text_lines_does_not_fail_on_autocad_test_pdf(self):
221217 b'\n ' ,
222218 b'\x0c '
223219]
220+
221+ apache_fop_expected_2020 = [
222+ b'This is the page header\n ' ,
223+ b'\n ' ,
224+ b'(There\xe2 \x80 \x99 s another page se-\n ' ,
225+ b'quence below.)\n ' ,
226+ b'\n ' ,
227+ b'About Apache FOP\n ' ,
228+ b'It is a print formatter driv-\n ' ,
229+ b'en by XSL formatting ob-\n ' ,
230+ b'jects (XSL-FO) and an out-\n ' ,
231+ b'format-\n ' ,
232+ b'put \n ' ,
233+ b'\n ' ,
234+ b'independent \n ' ,
235+ b'\n ' ,
236+ b'Page 1\n ' ,
237+ b'ter1. FOP has a nice logo:\n ' ,
238+ b'\n ' ,
239+ b'Header 1.1 Header 1.2\n ' ,
240+ b'\n ' ,
241+ b'Cell 1.1\n ' ,
242+ b'\n ' ,
243+ b'Cell 1.2\n ' ,
244+ b'\n ' ,
245+ b'See the FOP website for more information\n ' ,
246+ b'\n ' ,
247+ b'\x0c This is the page header\n ' ,
248+ b'\n ' ,
249+ b'Page 2\n ' ,
250+ b'\n ' ,
251+ b'Cell 2.2\n ' ,
252+ b'\n ' ,
253+ b'Cell 2.1\n ' ,
254+ b'\n ' ,
255+ b'Header 1.1 Header 1.2\n ' ,
256+ b'\n ' ,
257+ b'(XSL-FO) and an output in-\n ' ,
258+ b'dependent formatter. It is a\n ' ,
259+ b'Java application that reads\n ' ,
260+ b'a formatting object (FO)\n ' ,
261+ b'tree and renders the res-\n ' ,
262+ b'ulting pages to a specified\n ' ,
263+ b'output.\n ' ,
264+ b'\n ' ,
265+ b'Apache FOP (Formatting\n ' ,
266+ b'Objects Processor) is a\n ' ,
267+ b'print formatter driven by\n ' ,
268+ b'XSL \n ' ,
269+ b'formatting objects\n ' ,
270+ b'This fo:block element spans all the columns of the docu-\n ' ,
271+ b'ment. This is intended to test the abilities of the text-to-\n ' ,
272+ b'speech program.\n ' ,
273+ b'And now we are back to\n ' ,
274+ b'normal content flowing in\n ' ,
275+ b'\n ' ,
276+ b'\x0c Page 3\n ' ,
277+ b'\n ' ,
278+ b'This is the page header\n ' ,
279+ b'\n ' ,
280+ b'two columns. Let\xe2 \x80 \x99 s start a\n ' ,
281+ b'numbered list:\n ' ,
282+ b'1. Line 1 of item 1\n ' ,
283+ b'Line 2 of item 1\n ' ,
284+ b'Line 3 of item 1\n ' ,
285+ b'2. Line 1 of item 2\n ' ,
286+ b'Line 2 of item 2\n ' ,
287+ b'Line 3 of item 2\n ' ,
288+ b'\n ' ,
289+ b'And now we are going to\n ' ,
290+ b'see how a second page\n ' ,
291+ b'sequence is handled.\n ' ,
292+ b'\n ' ,
293+ b'\x0c This is the page header\n ' ,
294+ b'\n ' ,
295+ b'Apache FOP (Formatting\n ' ,
296+ b'Objects Processor) is a\n ' ,
297+ b'print formatter driven by\n ' ,
298+ b'XSL \n ' ,
299+ b'formatting objects\n ' ,
300+ b'(XSL-FO) and an output\n ' ,
301+ b'independent formatter1. It\n ' ,
302+ b'is a Java application that\n ' ,
303+ b'reads a formatting object\n ' ,
304+ b'(FO) tree and renders the\n ' ,
305+ b'\n ' ,
306+ b'Page 4\n ' ,
307+ b'\n ' ,
308+ b'resulting pages to a spe-\n ' ,
309+ b'cified output.\n ' ,
310+ b'\n ' ,
311+ b'Header 1.1 Header 1.2\n ' ,
312+ b'\n ' ,
313+ b'Cell 1.1\n ' ,
314+ b'Cell 2.1\n ' ,
315+ b'\n ' ,
316+ b'Cell 1.2\n ' ,
317+ b'Cell 2.2\n ' ,
318+ b'\n ' ,
319+ b'Apache FOP (Formatting\n ' ,
320+ b'Objects Processor) est\n ' ,
321+ b'une application de mise en\n ' ,
322+ b'page de documents res-\n ' ,
323+ b'pectant le standard XSL-\n ' ,
324+ b'\n ' ,
325+ b'See the FOP website for more information\n ' ,
326+ b'\n ' ,
327+ b'\x0c This is the page header\n ' ,
328+ b'\n ' ,
329+ b'Page 5\n ' ,
330+ b'\n ' ,
331+ b'FO. \xc3 \x80 partir d\xe2 \x80 \x99 un document\n ' ,
332+ b'va effectue une mise en\n ' ,
333+ b'au format XSL-FO, cette\n ' ,
334+ b'page et renvoie un docu-\n ' ,
335+ b'ment pr\xc3 \xaa t pour impression.\n ' ,
336+ b'application \xc3 \xa9 crite en Ja-\n ' ,
337+ b'This fo:block element spans all the columns of the docu-\n ' ,
338+ b'ment. This is intended to test the abilities of the text-to-\n ' ,
339+ b'speech program.\n ' ,
340+ b'And now we are back to\n ' ,
341+ b'normal content flowing in\n ' ,
342+ b'two columns. Let\xe2 \x80 \x99 s start a\n ' ,
343+ b'numbered list:\n ' ,
344+ b'1. Line 1 of item 1\n ' ,
345+ b'Line 2 of item 1\n ' ,
346+ b'\n ' ,
347+ b'Line 3 of item 1\n ' ,
348+ b'2. Line 1 of item 2\n ' ,
349+ b'Line 2 of item 2\n ' ,
350+ b'Line 3 of item 2\n ' ,
351+ b'\n ' ,
352+ b'The end of the document\n ' ,
353+ b'has now been reached.\n ' ,
354+ b'\n ' ,
355+ b'\x0c ' ,
356+ ]
0 commit comments