@@ -80,9 +80,8 @@ def test_get_text_lines_can_parse_faulty_broadcom_doc(self):
8080 def test_pdfminer_can_parse_apache_fop_test_pdf (self ):
8181 test_file = self .get_test_loc ('pdf/fop_test_pdf_1.5_test.pdf' )
8282 result = pdf .get_text_lines (test_file )
83- if result == apache_fop_expected :
84- return
85- assert result == apache_fop_expected_2020
83+ for expected in apache_fop_expected :
84+ assert expected in result
8685
8786 def test_numbered_text_lines_does_not_fail_on_autocad_test_pdf (self ):
8887 test_file = self .get_test_loc ('pdf/AutoCad_Diagram.pdf' )
@@ -92,264 +91,17 @@ def test_numbered_text_lines_does_not_fail_on_autocad_test_pdf(self):
9291
9392apache_fop_expected = [
9493 b'This is the page header\n ' ,
95- b'(There\xe2 \x80 \x99 s another page se-\n ' ,
96- b'quence below.)\n ' ,
97- b'\n ' ,
9894 b'About Apache FOP\n ' ,
9995 b'It is a print formatter driv-\n ' ,
10096 b'en by XSL formatting ob-\n ' ,
10197 b'jects (XSL-FO) and an out-\n ' ,
102- b'put \n ' ,
103- b'format-\n ' ,
104- b'\n ' ,
105- b'independent \n ' ,
106- b'\n ' ,
107- b'Page 1\n ' ,
108- b'ter1. FOP has a nice logo:\n ' ,
109- b'\n ' ,
110- b'Header 1.1 Header 1.2\n ' ,
111- b'Cell 1.1\n ' ,
112- b'\n ' ,
113- b'Cell 1.2\n ' ,
114- b'\n ' ,
115- b'See the FOP website for more information\n ' ,
116- b'\n ' ,
117- b'\x0c This is the page header\n ' ,
118- b'\n ' ,
119- b'Header 1.1 Header 1.2\n ' ,
120- b'Cell 2.1\n ' ,
121- b'\n ' ,
122- b'Cell 2.2\n ' ,
123- b'\n ' ,
124- b'Page 2\n ' ,
12598 b'(XSL-FO) and an output in-\n ' ,
12699 b'dependent formatter. It is a\n ' ,
127100 b'Java application that reads\n ' ,
128- b'a formatting object (FO)\n ' ,
129- b'tree and renders the res-\n ' ,
130- b'ulting pages to a specified\n ' ,
131- b'output.\n ' ,
132- b'\n ' ,
133- b'Apache FOP (Formatting\n ' ,
134- b'Objects Processor) is a\n ' ,
135- b'print formatter driven by\n ' ,
136- b'XSL \n ' ,
137- b'formatting objects\n ' ,
138- b'This fo:block element spans all the columns of the docu-\n ' ,
139- b'ment. This is intended to test the abilities of the text-to-\n ' ,
140- b'speech program.\n ' ,
141- b'And now we are back to\n ' ,
142- b'normal content flowing in\n ' ,
143- b'\n ' ,
144- b'\x0c Page 3\n ' ,
145- b'\n ' ,
146- b'This is the page header\n ' ,
147- b'two columns. Let\xe2 \x80 \x99 s start a\n ' ,
148- b'numbered list:\n ' ,
149- b'1. Line 1 of item 1\n ' ,
150- b'Line 2 of item 1\n ' ,
151- b'Line 3 of item 1\n ' ,
152- b'2. Line 1 of item 2\n ' ,
153- b'Line 2 of item 2\n ' ,
154- b'Line 3 of item 2\n ' ,
155- b'\n ' ,
156- b'And now we are going to\n ' ,
157- b'see how a second page\n ' ,
158- b'sequence is handled.\n ' ,
159- b'\n ' ,
160- b'\x0c This is the page header\n ' ,
161- b'Apache FOP (Formatting\n ' ,
162- b'Objects Processor) is a\n ' ,
163- b'print formatter driven by\n ' ,
164- b'XSL \n ' ,
165- b'formatting objects\n ' ,
166- b'(XSL-FO) and an output\n ' ,
167- b'independent formatter1. It\n ' ,
168- b'is a Java application that\n ' ,
169- b'reads a formatting object\n ' ,
170- b'(FO) tree and renders the\n ' ,
171- b'\n ' ,
172- b'Page 4\n ' ,
173- b'resulting pages to a spe-\n ' ,
174- b'cified output.\n ' ,
175- b'\n ' ,
176- b'Header 1.1 Header 1.2\n ' ,
177- b'Cell 1.1\n ' ,
178- b'Cell 2.1\n ' ,
179- b'\n ' ,
180- b'Cell 1.2\n ' ,
181- b'Cell 2.2\n ' ,
182- b'\n ' ,
183- b'Apache FOP (Formatting\n ' ,
184- b'Objects Processor) est\n ' ,
185- b'une application de mise en\n ' ,
186- b'page de documents res-\n ' ,
187- b'pectant le standard XSL-\n ' ,
188- b'\n ' ,
189- b'See the FOP website for more information\n ' ,
190- b'\n ' ,
191- b'\x0c This is the page header\n ' ,
192- b'Page 5\n ' ,
193- b'FO. \xc3 \x80 partir d\xe2 \x80 \x99 un document\n ' ,
194- b'va effectue une mise en\n ' ,
195- b'au format XSL-FO, cette\n ' ,
196- b'page et renvoie un docu-\n ' ,
197- b'application \xc3 \xa9 crite en Ja-\n ' ,
198- b'ment pr\xc3 \xaa t pour impression.\n ' ,
199- b'This fo:block element spans all the columns of the docu-\n ' ,
200- b'ment. This is intended to test the abilities of the text-to-\n ' ,
201- b'speech program.\n ' ,
202- b'And now we are back to\n ' ,
203- b'normal content flowing in\n ' ,
204- b'two columns. Let\xe2 \x80 \x99 s start a\n ' ,
205- b'numbered list:\n ' ,
206- b'1. Line 1 of item 1\n ' ,
207- b'Line 2 of item 1\n ' ,
208- b'\n ' ,
209- b'Line 3 of item 1\n ' ,
210- b'2. Line 1 of item 2\n ' ,
211- b'Line 2 of item 2\n ' ,
212- b'Line 3 of item 2\n ' ,
213- b'\n ' ,
214- b'The end of the document\n ' ,
215- b'has now been reached.\n ' ,
216- b'\n ' ,
217- b'\x0c '
218- ]
219-
220- apache_fop_expected_2020 = [
221- b'This is the page header\n ' ,
222- b'\n ' ,
223- b'(There\xe2 \x80 \x99 s another page se-\n ' ,
224- b'quence below.)\n ' ,
225- b'\n ' ,
226- b'About Apache FOP\n ' ,
227- b'It is a print formatter driv-\n ' ,
228- b'en by XSL formatting ob-\n ' ,
229- b'jects (XSL-FO) and an out-\n ' ,
230- b'format-\n ' ,
231- b'put \n ' ,
232- b'\n ' ,
233- b'independent \n ' ,
234- b'\n ' ,
235- b'Page 1\n ' ,
236- b'ter1. FOP has a nice logo:\n ' ,
237- b'\n ' ,
238- b'Header 1.1 Header 1.2\n ' ,
239- b'\n ' ,
240- b'Cell 1.1\n ' ,
241- b'\n ' ,
242- b'Cell 1.2\n ' ,
243- b'\n ' ,
244- b'See the FOP website for more information\n ' ,
245- b'\n ' ,
246- b'\x0c This is the page header\n ' ,
247- b'\n ' ,
248- b'Page 2\n ' ,
249- b'\n ' ,
250- b'Cell 2.2\n ' ,
251- b'\n ' ,
252- b'Cell 2.1\n ' ,
253- b'\n ' ,
254- b'Header 1.1 Header 1.2\n ' ,
255- b'\n ' ,
256- b'(XSL-FO) and an output in-\n ' ,
257- b'dependent formatter. It is a\n ' ,
258- b'Java application that reads\n ' ,
259- b'a formatting object (FO)\n ' ,
260- b'tree and renders the res-\n ' ,
261- b'ulting pages to a specified\n ' ,
262- b'output.\n ' ,
263- b'\n ' ,
264- b'Apache FOP (Formatting\n ' ,
265- b'Objects Processor) is a\n ' ,
266- b'print formatter driven by\n ' ,
267- b'XSL \n ' ,
268- b'formatting objects\n ' ,
269- b'This fo:block element spans all the columns of the docu-\n ' ,
270- b'ment. This is intended to test the abilities of the text-to-\n ' ,
271- b'speech program.\n ' ,
272- b'And now we are back to\n ' ,
273- b'normal content flowing in\n ' ,
274- b'\n ' ,
275- b'\x0c Page 3\n ' ,
276- b'\n ' ,
277- b'This is the page header\n ' ,
278- b'\n ' ,
279- b'two columns. Let\xe2 \x80 \x99 s start a\n ' ,
280- b'numbered list:\n ' ,
281- b'1. Line 1 of item 1\n ' ,
282101 b'Line 2 of item 1\n ' ,
283- b'Line 3 of item 1\n ' ,
284- b'2. Line 1 of item 2\n ' ,
285- b'Line 2 of item 2\n ' ,
286- b'Line 3 of item 2\n ' ,
287- b'\n ' ,
288- b'And now we are going to\n ' ,
289- b'see how a second page\n ' ,
290- b'sequence is handled.\n ' ,
291- b'\n ' ,
292- b'\x0c This is the page header\n ' ,
293- b'\n ' ,
294- b'Apache FOP (Formatting\n ' ,
295- b'Objects Processor) is a\n ' ,
296- b'print formatter driven by\n ' ,
297- b'XSL \n ' ,
298- b'formatting objects\n ' ,
299- b'(XSL-FO) and an output\n ' ,
300- b'independent formatter1. It\n ' ,
301- b'is a Java application that\n ' ,
302- b'reads a formatting object\n ' ,
303- b'(FO) tree and renders the\n ' ,
304- b'\n ' ,
305- b'Page 4\n ' ,
306- b'\n ' ,
307- b'resulting pages to a spe-\n ' ,
308- b'cified output.\n ' ,
309- b'\n ' ,
310- b'Header 1.1 Header 1.2\n ' ,
311- b'\n ' ,
312- b'Cell 1.1\n ' ,
313- b'Cell 2.1\n ' ,
314- b'\n ' ,
315- b'Cell 1.2\n ' ,
316- b'Cell 2.2\n ' ,
317- b'\n ' ,
318102 b'Apache FOP (Formatting\n ' ,
319103 b'Objects Processor) est\n ' ,
320104 b'une application de mise en\n ' ,
321- b'page de documents res-\n ' ,
322- b'pectant le standard XSL-\n ' ,
323- b'\n ' ,
324- b'See the FOP website for more information\n ' ,
325- b'\n ' ,
326- b'\x0c This is the page header\n ' ,
327- b'\n ' ,
328- b'Page 5\n ' ,
329- b'\n ' ,
330- b'FO. \xc3 \x80 partir d\xe2 \x80 \x99 un document\n ' ,
331- b'va effectue une mise en\n ' ,
332- b'au format XSL-FO, cette\n ' ,
333- b'page et renvoie un docu-\n ' ,
334- b'ment pr\xc3 \xaa t pour impression.\n ' ,
335- b'application \xc3 \xa9 crite en Ja-\n ' ,
336- b'This fo:block element spans all the columns of the docu-\n ' ,
337- b'ment. This is intended to test the abilities of the text-to-\n ' ,
338- b'speech program.\n ' ,
339- b'And now we are back to\n ' ,
340- b'normal content flowing in\n ' ,
341- b'two columns. Let\xe2 \x80 \x99 s start a\n ' ,
342- b'numbered list:\n ' ,
343- b'1. Line 1 of item 1\n ' ,
344- b'Line 2 of item 1\n ' ,
345- b'\n ' ,
346- b'Line 3 of item 1\n ' ,
347- b'2. Line 1 of item 2\n ' ,
348- b'Line 2 of item 2\n ' ,
349- b'Line 3 of item 2\n ' ,
350- b'\n ' ,
351105 b'The end of the document\n ' ,
352106 b'has now been reached.\n ' ,
353- b'\n ' ,
354- b'\x0c ' ,
355107]
0 commit comments