@@ -414,3 +414,45 @@ def test_rotated_layout_mode(caplog):
414414 assert not caplog .records , "No warnings should be issued"
415415 assert text , "Text matching the page rotation should be extracted"
416416 assert re .search (r"\r?\n +69\r?\n +UNCLASSIFIED$" , text ), "Contents should be in expected layout"
417+
418+
@pytest.mark.enable_socket
@pytest.mark.filterwarnings("ignore::pypdf.errors.PdfReadWarning")
def test_extract_text__none_objects():
    """Extract text from pages 0 and 8 of the sample PDF.

    There are no assertions on the extracted text: the test passes as long
    as neither ``extract_text()`` call raises. ``PdfReadWarning`` is ignored
    through the ``filterwarnings`` marker.
    """
    pdf_bytes = get_data_from_url(
        "https://github.com/user-attachments/files/18381726/tika-957721.pdf",
        name="tika-957721.pdf",
    )
    reader = PdfReader(BytesIO(pdf_bytes))

    # Same pages, same order as before; only the absence of an exception matters.
    for page_index in (0, 8):
        reader.pages[page_index].extract_text()
428+
429+
@pytest.mark.enable_socket
def test_extract_text__with_visitor_text():
    """Run ``extract_text`` with a no-op ``visitor_text`` callback on two PDFs.

    The first page of each document is extracted with the callback attached.
    Nothing is asserted about the output; the test passes when both
    extractions finish without raising.
    """
    def ignore_text(*args, **kwargs):  # noqa: ANN002, ANN003, ANN202
        # Deliberately does nothing: accepts whatever the extractor passes in.
        pass

    tika_bytes = get_data_from_url(
        "https://github.com/user-attachments/files/18381718/tika-952016.pdf",
        name="tika-952016.pdf",
    )
    tika_reader = PdfReader(BytesIO(tika_bytes))
    tika_reader.pages[0].extract_text(visitor_text=ignore_text)

    # Second document is requested by name only, without a URL.
    paper_bytes = get_data_from_url(name="TextAttack_paper.pdf")
    paper_reader = PdfReader(BytesIO(paper_bytes))
    paper_reader.pages[0].extract_text(visitor_text=ignore_text)
445+
446+
@pytest.mark.enable_socket
def test_extract_text__restore_cm_stack_pop_error():
    """Check that extracting page 10 of the sample PDF raises ``IndexError``.

    The error must carry the message "list index out of range".
    """
    pdf_bytes = get_data_from_url(
        "https://github.com/user-attachments/files/18381737/tika-966635.pdf",
        name="tika-966635.pdf",
    )
    reader = PdfReader(BytesIO(pdf_bytes))
    target_page = reader.pages[10]

    # An earlier error ("pop from empty list") is already omitted, so the
    # message is matched explicitly to pin down which failure this is.
    with pytest.raises(IndexError, match="list index out of range"):
        target_page.extract_text()
0 commit comments