diff --git a/PyPDF2/_page.py b/PyPDF2/_page.py index ed385bb3d..f7346fd44 100644 --- a/PyPDF2/_page.py +++ b/PyPDF2/_page.py @@ -1310,14 +1310,42 @@ def _extract_text( visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None, visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None, visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None, + group_TJ: bool = True, ) -> str: """ See extract_text for most arguments. - Args: - content_key: indicate the default key where to extract data - None = the object; this allow to reuse the function on XObject - default = "/Content" + Arabic, Hebrew,... are extracted in the correct order. If required, a custom RTL range of characters + can be defined; see function set_custom_rtl. + + Additionally, you can provide visitor methods to be informed about all operands and all text-objects. + For example, in some PDF files this can be useful to parse tables. + + :param Tuple[int, ...] orientations: list of orientations text_extraction will look for + default = (0, 90, 180, 270) + note: currently only 0 (Up), 90 (turned Left), 180 (upside Down), 270 (turned Right) + :param float space_width: force default space width + (if not extracted from font; default 200) + :param Optional[str] content_key: indicate the default key where to extract data + None = the object; this allows reusing the function on an XObject + default = "/Content" + :param Optional[Function] visitor_operand_before: function to be called before processing an operand. + It has four arguments: operand, operand-arguments, + current transformation matrix and text matrix. + :param Optional[Function] visitor_operand_after: function to be called after processing an operand. + It has four arguments: operand, operand-arguments, + current transformation matrix and text matrix. + :param Optional[Function] visitor_text: function to be called when extracting some text at some position. 
+ It has five arguments: text, + current transformation matrix, text matrix, font-dictionary and font-size. + The font-dictionary may be None in case of unknown fonts. + If not None, it may e.g. contain the key "/BaseFont" with value "/Arial,Bold". + :param Optional[bool] group_TJ: True for one call of visitor_text at each TJ, + False for calls of visitor_text at each text-fragment of TJ. + :param content_key: indicate the default key where to extract data + None = the object; this allows reusing the function on an XObject + default = "/Content" + :return: a string object. """ text: str = "" output: str = "" @@ -1411,8 +1439,6 @@ def process_operation(operator: bytes, operands: List) -> None: tm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] # tm_prev = tm_matrix output += text - if visitor_text is not None: - visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) # based # if output != "" and output[-1]!="\n": # output += "\n" @@ -1420,8 +1446,6 @@ def process_operation(operator: bytes, operands: List) -> None: return None elif operator == b"ET": output += text - if visitor_text is not None: - visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) text = "" # table 4.7 "Graphics state operators", page 219 # cm_matrix calculation is a reserved for the moment @@ -1453,8 +1477,6 @@ def process_operation(operator: bytes, operands: List) -> None: # rtl_dir = False elif operator == b"cm": output += text - if visitor_text is not None: - visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) text = "" cm_matrix = mult( [ @@ -1478,8 +1500,6 @@ def process_operation(operator: bytes, operands: List) -> None: elif operator == b"Tf": if text != "": output += text # .translate(cmap) - if visitor_text is not None: - visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) text = "" # rtl_dir = False try: @@ -1537,6 +1557,10 @@ def process_operation(operator: bytes, operands: List) -> None: if orientation in orientations: if isinstance(operands[0], str): text += 
operands[0] + if visitor_text is not None: + visitor_text( + operands[0], cm_matrix, tm_matrix, cmap[3], font_size + ) else: t: str = "" tt: bytes = ( @@ -1562,6 +1586,7 @@ def process_operation(operator: bytes, operands: List) -> None: ] ) # "\u0590 - \u08FF \uFB50 - \uFDFF" + tj_text = "" for x in "".join( [cmap[1][x] if x in cmap[1] else x for x in t] ): @@ -1574,7 +1599,7 @@ def process_operation(operator: bytes, operands: List) -> None: or (0x20A0 <= xx and xx <= 0x21FF) # but (numbers) indices/exponents or xx in CUSTOM_RTL_SPECIAL_CHARS # customized.... ): - text = x + text if rtl_dir else text + x + tj_text = x + tj_text if rtl_dir else tj_text + x elif ( # right-to-left characters set (0x0590 <= xx and xx <= 0x08FF) or (0xFB1D <= xx and xx <= 0xFDFF) @@ -1586,21 +1611,22 @@ def process_operation(operator: bytes, operands: List) -> None: rtl_dir = True # print("RTL",text,"*") output += text - if visitor_text is not None: - visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) text = "" - text = x + text + tj_text = x + tj_text else: # left-to-right # print(">",xx,x,end="") if rtl_dir: rtl_dir = False # print("LTR",text,"*") output += text - if visitor_text is not None: - visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) text = "" - text = text + x + tj_text = tj_text + x # fmt: on + text = tj_text + text if rtl_dir else text + tj_text + if visitor_text is not None: + visitor_text( + tj_text, cm_matrix, tm_matrix, cmap[3], font_size + ) else: return None if check_crlf_space: @@ -1620,7 +1646,7 @@ def process_operation(operator: bytes, operands: List) -> None: output += text + "\n" if visitor_text is not None: visitor_text( - text + "\n", + "\n", cm_matrix, tm_matrix, cmap[3], @@ -1633,13 +1659,21 @@ def process_operation(operator: bytes, operands: List) -> None: ): if (output + text)[-1] != " ": text += " " + if visitor_text is not None: + visitor_text( + " ", + cm_matrix, + tm_matrix, + cmap[3], + font_size, + ) elif orientation == 180: if 
delta_y > 0.8 * f: if (output + text)[-1] != "\n": output += text + "\n" if visitor_text is not None: visitor_text( - text + "\n", + "\n", cm_matrix, tm_matrix, cmap[3], @@ -1652,13 +1686,21 @@ def process_operation(operator: bytes, operands: List) -> None: ): if (output + text)[-1] != " ": text += " " + if visitor_text is not None: + visitor_text( + " ", + cm_matrix, + tm_matrix, + cmap[3], + font_size, + ) elif orientation == 90: if delta_x > 0.8 * f: if (output + text)[-1] != "\n": output += text + "\n" if visitor_text is not None: visitor_text( - text + "\n", + "\n", cm_matrix, tm_matrix, cmap[3], @@ -1671,13 +1713,21 @@ def process_operation(operator: bytes, operands: List) -> None: ): if (output + text)[-1] != " ": text += " " + if visitor_text is not None: + visitor_text( + " ", + cm_matrix, + tm_matrix, + cmap[3], + font_size, + ) elif orientation == 270: if delta_x < -0.8 * f: if (output + text)[-1] != "\n": output += text + "\n" if visitor_text is not None: visitor_text( - text + "\n", + "\n", cm_matrix, tm_matrix, cmap[3], @@ -1690,6 +1740,14 @@ def process_operation(operator: bytes, operands: List) -> None: ): if (output + text)[-1] != " ": text += " " + if visitor_text is not None: + visitor_text( + " ", + cm_matrix, + tm_matrix, + cmap[3], + font_size, + ) except Exception: pass @@ -1709,6 +1767,28 @@ def process_operation(operator: bytes, operands: List) -> None: process_operation(b"TL", [-operands[1]]) process_operation(b"Td", operands) elif operator == b"TJ": + if visitor_text is not None and group_TJ: + # To prevent sending letters instead of words we + # override the visitor temporarily. 
+ visitor_text_before = visitor_text + tm_matrix_before = [ + tm_matrix[0], + tm_matrix[1], + tm_matrix[2], + tm_matrix[3], + tm_matrix[4], + tm_matrix[5], + ] + text_TJ = [] + + def visitor_text(text, cm_matrix, tm_matrix, font_dict, font_size): + # TODO cases where the current inserting order is kept + if rtl_dir: + # right-to-left + text_TJ.insert(0, text) + else: + text_TJ.append(text) + for op in operands[0]: if isinstance(op, (str, bytes)): process_operation(b"Tj", [op]) @@ -1719,10 +1799,17 @@ def process_operation(operator: bytes, operands: List) -> None: and (text[-1] != " ") ): process_operation(b"Tj", [" "]) + if visitor_text is not None and group_TJ: + visitor_text = visitor_text_before + visitor_text( + "".join(text_TJ), + cm_matrix, + tm_matrix_before, + cmap[3], + font_size, + ) elif operator == b"Do": output += text - if visitor_text is not None: - visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) try: if output[-1] != "\n": output += "\n" @@ -1741,10 +1828,9 @@ def process_operation(operator: bytes, operands: List) -> None: visitor_operand_before, visitor_operand_after, visitor_text, + group_TJ, ) output += text - if visitor_text is not None: - visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) except Exception: logger_warning( f" impossible to decode XFormObject {operands[0]}", @@ -1757,8 +1843,6 @@ def process_operation(operator: bytes, operands: List) -> None: if visitor_operand_after is not None: visitor_operand_after(operator, operands, cm_matrix, tm_matrix) output += text # just in case of - if text != "" and visitor_text is not None: - visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) return output def extract_text( @@ -1771,6 +1855,7 @@ def extract_text( visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None, visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None, visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None, + group_TJ: bool = True, ) 
-> str: """ Locate all text drawing commands, in the order they are provided in the @@ -1804,10 +1889,12 @@ def extract_text( It has four arguments: operand, operand-arguments, current transformation matrix and text matrix. visitor_text: function to be called when extracting some text at some position. - It has five arguments: text, current transformation matrix, - text matrix, font-dictionary and font-size. + It has five arguments: text, + current transformation matrix, text matrix, font-dictionary and font-size. The font-dictionary may be None in case of unknown fonts. If not None it may e.g. contain key "/BaseFont" with value "/Arial,Bold". + group_TJ: True for one call of visitor_text at each TJ, + False for calls of visitor_text at each text-fragment of TJ. Returns: The extracted text @@ -1857,6 +1944,7 @@ def extract_text( visitor_operand_before, visitor_operand_after, visitor_text, + group_TJ, ) def extract_xform_text( @@ -1867,12 +1955,15 @@ def extract_xform_text( visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None, visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None, visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None, + group_TJ: bool = True, ) -> str: """ Extract text from an XObject. Args: space_width: force default space width (if not extracted from font (default 200) + group_TJ: True for one call of visitor_text at each TJ, + False for calls of visitor_text at each text-fragment of TJ. 
Returns: The extracted text diff --git a/tests/test_page.py b/tests/test_page.py index ee4af14a0..98ec7948b 100644 --- a/tests/test_page.py +++ b/tests/test_page.py @@ -1,5 +1,6 @@ import json import os +import re from copy import deepcopy from io import BytesIO from pathlib import Path @@ -455,7 +456,7 @@ def print_op_b(op, args, cm_matrix, tm_matrix): rectangles.append(r) def print_visi(text, cm_matrix, tm_matrix, font_dict, font_size): - if text.strip() != "": + if text != "": if logger.isEnabledFor(logging.DEBUG): logger.debug(f"at {cm_matrix}, {tm_matrix}, font size={font_size}") texts.append( @@ -481,7 +482,7 @@ def extract_table( It is expected that each cell is marked by a rectangle-object. It is expected that the page contains one table only. - It is expected that the table contains at least 3 columns and 2 rows. + It is expected that the table contains at least 2 columns and 2 rows. A list of rows is returned. Each row contains a list of cells. @@ -533,8 +534,8 @@ def extract_table( curr_y = None curr_row = None for r in rectangles_filtered: - if col2count[r.x] < 3 or row2count[r.y] < 2: - # We expect at least 3 columns and 2 rows. + if col2count[r.x] < 2 or row2count[r.y] < 2: + # We expect at least 2 columns and 2 rows. continue if curr_y is None or r.y != curr_y: # next row @@ -556,7 +557,8 @@ def extract_table( def extract_cell_text(cell_texts: List[PositionedText]) -> str: """Joins the text-objects of a cell.""" - return ("".join(t.text for t in cell_texts)).strip() + text_raw = "".join(t.text for t in cell_texts) + return re.sub(r" +\n", "\n", text_raw.strip()) # Test 1: We test the analysis of page 7 "2.1 LRS model". 
reader = PdfReader(RESOURCE_ROOT / "GeoBase_NHNC1_Data_Model_UML_EN.pdf") @@ -576,12 +578,16 @@ def ignore_large_rectangles(r): for t in texts: for r in rectangles: if r.contains(t.x, t.y): - texts = rectangle2texts.setdefault(r, []) - texts.append(t.text.strip()) + rtexts = rectangle2texts.setdefault(r, []) + if t.text != "": + rtexts.append(t.text) break # Five boxes and the figure-description below. - assert len(rectangle2texts) == 6 - box_texts = [" ".join(texts) for texts in rectangle2texts.values()] + assert len(rectangle2texts) == 11 + box_texts = [ + re.sub(" *\n", " ", "".join(texts).strip()) + for texts in rectangle2texts.values() + ] assert "Hydro Network" in box_texts assert "Hydro Events" in box_texts assert "Metadata" in box_texts @@ -606,10 +612,10 @@ def filter_first_table(r): assert extract_cell_text(rows[0][2]) == "Description" assert extract_cell_text(rows[1][0]) == "September 2002" # The line break between "English review;" - # and "Remove" is not detected. + # and "Remove" is detected. assert ( extract_cell_text(rows[6][2]) - == "English review;Remove the UML model for the Segmented view." + == "English review;\nRemove the UML model for the Segmented view." ) assert extract_cell_text(rows[7][2]) == "Update from the March Workshop comments." @@ -647,6 +653,16 @@ def visitor_td(op, args, cm, tm): assert list_Td[2] == (210.0, 210.0) assert list_Td[3] == (410.0, 210.0) + # Test 3b: check extract_visitor in Sample_Td-matrix.pdf + # + (texts, rectangles) = extract_text_and_rectangles(page_td_model) + rows = extract_table(texts, rectangles) + assert len(rows) == 2 + assert extract_cell_text(rows[0][0]) == "Hello PDF!" + assert extract_cell_text(rows[0][1]) == "Hello PDF 200 0 Td!" + assert extract_cell_text(rows[1][0]) == "Hello PDF 2 1!" + assert extract_cell_text(rows[1][1]) == "Hello PDF 10 7!" + @pytest.mark.parametrize( ("pdf_path", "password", "embedded", "unembedded"),