5151
5252from ._cmap import (
5353 build_char_map ,
54- unknown_char_map ,
5554)
5655from ._protocols import PdfCommonDocProtocol
5756from ._text_extraction import (
58- OrientationNotFoundError ,
5957 _layout_mode ,
60- crlf_space_check ,
61- mult ,
6258)
6359from ._text_extraction ._text_extractor import TextExtraction
6460from ._utils import (
@@ -1657,7 +1653,7 @@ def _debug_for_extract(self) -> str: # pragma: no cover
16571653 out += "No Font\n "
16581654 return out
16591655
1660- def _extract_text ( # noqa: C901, PLR0915 # Will be fixed soon.
1656+ def _extract_text (
16611657 self ,
16621658 obj : Any ,
16631659 pdf : Any ,
@@ -1678,9 +1674,6 @@ def _extract_text( # noqa: C901, PLR0915 # Will be fixed soon.
16781674
16791675 """
16801676 extractor = TextExtraction ()
1681- text : str = ""
1682- output : str = ""
1683- rtl_dir : bool = False # right-to-left
16841677 cmaps : Dict [
16851678 str ,
16861679 Tuple [
@@ -1707,14 +1700,6 @@ def _extract_text( # noqa: C901, PLR0915 # Will be fixed soon.
17071700 cmaps [f ] = build_char_map (f , space_width , obj )
17081701 except TypeError :
17091702 pass
1710- cmap : Tuple [
1711- Union [str , Dict [int , str ]], Dict [str , str ], str , Optional [DictionaryObject ]
1712- ] = (
1713- "charmap" ,
1714- {},
1715- "NotInitialized" ,
1716- None ,
1717- ) # (encoding, CMAP, font resource name, font)
17181703
17191704 try :
17201705 content = (
@@ -1728,245 +1713,57 @@ def _extract_text( # noqa: C901, PLR0915 # Will be fixed soon.
17281713 # are strings where the byte->string encoding was unknown, so adding
17291714 # them to the text here would be gibberish.
17301715
1731- cm_matrix : List [float ] = [1.0 , 0.0 , 0.0 , 1.0 , 0.0 , 0.0 ]
1732- tm_matrix : List [float ] = [1.0 , 0.0 , 0.0 , 1.0 , 0.0 , 0.0 ]
1733- cm_stack = []
1734-
1735- # Store the last modified matrices; can be an intermediate position
1736- cm_prev : List [float ] = [1.0 , 0.0 , 0.0 , 1.0 , 0.0 , 0.0 ]
1737- tm_prev : List [float ] = [1.0 , 0.0 , 0.0 , 1.0 , 0.0 , 0.0 ]
1738-
1739- # Store the position at the beginning of building the text
1740- memo_cm : List [float ] = [1.0 , 0.0 , 0.0 , 1.0 , 0.0 , 0.0 ]
1741- memo_tm : List [float ] = [1.0 , 0.0 , 0.0 , 1.0 , 0.0 , 0.0 ]
1742-
1743- char_scale = 1.0
1744- space_scale = 1.0
1745- _space_width : float = 500.0 # will be set correctly at first Tf
1746- _actual_str_size : Dict [str , float ] = {
1747- "str_widths" : 0.0 , "space_width" : 0.0 , "str_height" : 0.0
1748- } # will be set to string length calculation result
1749- TL = 0.0
1750- font_size = 12.0 # init just in case of
1751-
1752- def compute_str_widths (str_widths : float ) -> float :
1753- return str_widths / 1000
1754-
1755- def process_operation (operator : bytes , operands : List [Any ]) -> None :
1756- nonlocal cm_matrix , tm_matrix , cm_stack , cm_prev , tm_prev , memo_cm , memo_tm
1757- nonlocal char_scale , space_scale , _space_width , TL , font_size , cmap
1758- nonlocal orientations , rtl_dir , visitor_text , output , text , _actual_str_size
1759-
1760- str_widths : float = 0.0
1761-
1762- # Table 5.4 page 405
1763- if operator == b"BT" : # Begin Text
1764- tm_matrix = [1.0 , 0.0 , 0.0 , 1.0 , 0.0 , 0.0 ]
1765- # Flush text:
1766- output += text
1767- if visitor_text is not None :
1768- visitor_text (text , memo_cm , memo_tm , cmap [3 ], font_size )
1769- text = ""
1770- memo_cm = cm_matrix .copy ()
1771- memo_tm = tm_matrix .copy ()
1772- return
1773- if operator == b"ET" : # End Text
1774- # Flush text:
1775- output += text
1776- if visitor_text is not None :
1777- visitor_text (text , memo_cm , memo_tm , cmap [3 ], font_size )
1778- text = ""
1779- memo_cm = cm_matrix .copy ()
1780- memo_tm = tm_matrix .copy ()
1781-
1782- # Table 4.7 "Graphics state operators", page 219
1783- # cm_matrix calculation is reserved for later
1784- elif operator == b"q" : # Save graphics state
1785- cm_stack .append (
1786- (
1787- cm_matrix ,
1788- cmap ,
1789- font_size ,
1790- char_scale ,
1791- space_scale ,
1792- _space_width ,
1793- TL ,
1794- )
1795- )
1796- elif operator == b"Q" : # Restore graphics state
1797- try :
1798- (
1799- cm_matrix ,
1800- cmap ,
1801- font_size ,
1802- char_scale ,
1803- space_scale ,
1804- _space_width ,
1805- TL ,
1806- ) = cm_stack .pop ()
1807- except Exception :
1808- cm_matrix = [1.0 , 0.0 , 0.0 , 1.0 , 0.0 , 0.0 ]
1809- elif operator == b"cm" : # Modify current matrix
1810- output += text
1811- if visitor_text is not None :
1812- visitor_text (text , memo_cm , memo_tm , cmap [3 ], font_size )
1813- text = ""
1814- try :
1815- cm_matrix = mult (
1816- [float (operand ) for operand in operands [:6 ]],
1817- cm_matrix
1818- )
1819- except Exception :
1820- cm_matrix = [1.0 , 0.0 , 0.0 , 1.0 , 0.0 , 0.0 ]
1821- memo_cm = cm_matrix .copy ()
1822- memo_tm = tm_matrix .copy ()
1823-
1824- # Table 5.2 page 398
1825- elif operator == b"Tz" : # Set horizontal text scaling
1826- char_scale = float (operands [0 ]) / 100 if operands else 1.0
1827- elif operator == b"Tw" : # Set word spacing
1828- space_scale = 1.0 + float (operands [0 ] if operands else 0.0 )
1829- elif operator == b"TL" : # Set Text Leading
1830- scale_x = math .sqrt (tm_matrix [0 ]** 2 + tm_matrix [2 ]** 2 )
1831- TL = float (operands [0 ] if operands else 0.0 ) * font_size * scale_x
1832- elif operator == b"Tf" : # Set font size
1833- if text != "" :
1834- output += text # .translate(cmap)
1835- if visitor_text is not None :
1836- visitor_text (text , memo_cm , memo_tm , cmap [3 ], font_size )
1837- text = ""
1838- memo_cm = cm_matrix .copy ()
1839- memo_tm = tm_matrix .copy ()
1840- try :
1841- # char_map_tuple: font_type,
1842- # float(sp_width / 2),
1843- # encoding,
1844- # map_dict,
1845- # font_dict (describes the font)
1846- char_map_tuple = cmaps [operands [0 ]]
1847- # current cmap: encoding,
1848- # map_dict,
1849- # font resource name (internal name, not the real font name),
1850- # font_dict
1851- cmap = (
1852- char_map_tuple [2 ],
1853- char_map_tuple [3 ],
1854- operands [0 ],
1855- char_map_tuple [4 ],
1856- )
1857- _space_width = char_map_tuple [1 ]
1858- except KeyError : # font not found
1859- cmap = (
1860- unknown_char_map [2 ],
1861- unknown_char_map [3 ],
1862- f"???{ operands [0 ]} " ,
1863- None ,
1864- )
1865- _space_width = unknown_char_map [1 ]
1866- try :
1867- font_size = float (operands [1 ])
1868- except Exception :
1869- pass # keep previous size
1870- # Table 5.5 page 406
1871- elif operator == b"Td" : # Move text position
1872- # A special case is a translating only tm:
1873- # tm = [1, 0, 0, 1, e, f]
1874- # i.e. tm[4] += tx, tm[5] += ty.
1875- tx , ty = float (operands [0 ]), float (operands [1 ])
1876- tm_matrix [4 ] += tx * tm_matrix [0 ] + ty * tm_matrix [2 ]
1877- tm_matrix [5 ] += tx * tm_matrix [1 ] + ty * tm_matrix [3 ]
1878- str_widths = compute_str_widths (_actual_str_size ["str_widths" ])
1879- _actual_str_size ["str_widths" ] = 0.0
1880- elif operator == b"Tm" : # Set text matrix
1881- tm_matrix = [float (operand ) for operand in operands [:6 ]]
1882- str_widths = compute_str_widths (_actual_str_size ["str_widths" ])
1883- _actual_str_size ["str_widths" ] = 0.0
1884- elif operator == b"T*" : # Move to next line
1885- tm_matrix [4 ] -= TL * tm_matrix [2 ]
1886- tm_matrix [5 ] -= TL * tm_matrix [3 ]
1887- str_widths = compute_str_widths (_actual_str_size ["str_widths" ])
1888- _actual_str_size ["str_widths" ] = 0.0
1889- elif operator == b"Tj" : # Show text
1890- text , rtl_dir , _actual_str_size = extractor ._handle_tj (
1891- text ,
1892- operands ,
1893- cm_matrix ,
1894- tm_matrix ,
1895- cmap ,
1896- orientations ,
1897- font_size ,
1898- rtl_dir ,
1899- visitor_text ,
1900- _space_width ,
1901- _actual_str_size ,
1902- )
1903- else :
1904- return
1905-
1906- if operator in {b"Td" , b"Tm" , b"T*" , b"Tj" }:
1907- try :
1908- text , output , cm_prev , tm_prev = crlf_space_check (
1909- text ,
1910- (cm_prev , tm_prev ),
1911- (cm_matrix , tm_matrix ),
1912- (memo_cm , memo_tm ),
1913- cmap ,
1914- orientations ,
1915- output ,
1916- font_size ,
1917- visitor_text ,
1918- str_widths ,
1919- compute_str_widths (_actual_str_size ["space_width" ]),
1920- _actual_str_size ["str_height" ]
1921- )
1922- if text == "" :
1923- memo_cm = cm_matrix .copy ()
1924- memo_tm = tm_matrix .copy ()
1925- except OrientationNotFoundError :
1926- return
1716+ # Initialize the extractor with the necessary parameters
1717+ extractor .initialize_extraction (orientations , visitor_text , cmaps )
19271718
19281719 for operands , operator in content .operations :
19291720 if visitor_operand_before is not None :
1930- visitor_operand_before (operator , operands , cm_matrix , tm_matrix )
1721+ visitor_operand_before (operator , operands , extractor . cm_matrix , extractor . tm_matrix )
19311722 # Multiple operators are handled here
19321723 if operator == b"'" :
1933- process_operation (b"T*" , [])
1934- process_operation (b"Tj" , operands )
1724+ extractor . process_operation (b"T*" , [])
1725+ extractor . process_operation (b"Tj" , operands )
19351726 elif operator == b'"' :
1936- process_operation (b"Tw" , [operands [0 ]])
1937- process_operation (b"Tc" , [operands [1 ]])
1938- process_operation (b"T*" , [])
1939- process_operation (b"Tj" , operands [2 :])
1727+ extractor . process_operation (b"Tw" , [operands [0 ]])
1728+ extractor . process_operation (b"Tc" , [operands [1 ]])
1729+ extractor . process_operation (b"T*" , [])
1730+ extractor . process_operation (b"Tj" , operands [2 :])
19401731 elif operator == b"TJ" :
19411732 # A real gap may be slightly narrower than the font's space width, so a TJ offset of at least 95% of that width is treated as a space.
1942- _confirm_space_width = _space_width * 0.95
1733+ _confirm_space_width = extractor . _space_width * 0.95
19431734 if operands :
19441735 for op in operands [0 ]:
19451736 if isinstance (op , (str , bytes )):
1946- process_operation (b"Tj" , [op ])
1737+ extractor . process_operation (b"Tj" , [op ])
19471738 if isinstance (op , (int , float , NumberObject , FloatObject )) and (
19481739 abs (float (op )) >= _confirm_space_width
1949- and text
1950- and text [- 1 ] != " "
1740+ and extractor . text
1741+ and extractor . text [- 1 ] != " "
19511742 ):
1952- process_operation (b"Tj" , [" " ])
1743+ extractor . process_operation (b"Tj" , [" " ])
19531744 elif operator == b"TD" :
1954- process_operation (b"TL" , [- operands [1 ]])
1955- process_operation (b"Td" , operands )
1745+ extractor . process_operation (b"TL" , [- operands [1 ]])
1746+ extractor . process_operation (b"Td" , operands )
19561747 elif operator == b"Do" :
1957- output += text
1748+ extractor . output += extractor . text
19581749 if visitor_text is not None :
1959- visitor_text (text , memo_cm , memo_tm , cmap [3 ], font_size )
1750+ visitor_text (
1751+ extractor .text ,
1752+ extractor .memo_cm ,
1753+ extractor .memo_tm ,
1754+ extractor .cmap [3 ],
1755+ extractor .font_size ,
1756+ )
19601757 try :
1961- if output [- 1 ] != "\n " :
1962- output += "\n "
1758+ if extractor . output [- 1 ] != "\n " :
1759+ extractor . output += "\n "
19631760 if visitor_text is not None :
19641761 visitor_text (
19651762 "\n " ,
1966- memo_cm ,
1967- memo_tm ,
1968- cmap [3 ],
1969- font_size ,
1763+ extractor . memo_cm ,
1764+ extractor . memo_tm ,
1765+ extractor . cmap [3 ],
1766+ extractor . font_size ,
19701767 )
19711768 except IndexError :
19721769 pass
@@ -1981,32 +1778,38 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
19811778 visitor_operand_after ,
19821779 visitor_text ,
19831780 )
1984- output += text
1781+ extractor . output += text
19851782 if visitor_text is not None :
19861783 visitor_text (
19871784 text ,
1988- memo_cm ,
1989- memo_tm ,
1990- cmap [3 ],
1991- font_size ,
1785+ extractor . memo_cm ,
1786+ extractor . memo_tm ,
1787+ extractor . cmap [3 ],
1788+ extractor . font_size ,
19921789 )
19931790 except Exception as exception :
19941791 logger_warning (
19951792 f"Impossible to decode XFormObject { operands [0 ]} : { exception } " ,
19961793 __name__ ,
19971794 )
19981795 finally :
1999- text = ""
2000- memo_cm = cm_matrix .copy ()
2001- memo_tm = tm_matrix .copy ()
1796+ extractor . text = ""
1797+ extractor . memo_cm = extractor . cm_matrix .copy ()
1798+ extractor . memo_tm = extractor . tm_matrix .copy ()
20021799 else :
2003- process_operation (operator , operands )
1800+ extractor . process_operation (operator , operands )
20041801 if visitor_operand_after is not None :
2005- visitor_operand_after (operator , operands , cm_matrix , tm_matrix )
2006- output += text # just in case
2007- if text != "" and visitor_text is not None :
2008- visitor_text (text , memo_cm , memo_tm , cmap [3 ], font_size )
2009- return output
1802+ visitor_operand_after (operator , operands , extractor .cm_matrix , extractor .tm_matrix )
1803+ extractor .output += extractor .text # just in case
1804+ if extractor .text != "" and visitor_text is not None :
1805+ visitor_text (
1806+ extractor .text ,
1807+ extractor .memo_cm ,
1808+ extractor .memo_tm ,
1809+ extractor .cmap [3 ],
1810+ extractor .font_size ,
1811+ )
1812+ return extractor .output
20101813
20111814 def _layout_mode_fonts (self ) -> Dict [str , _layout_mode .Font ]:
20121815 """
0 commit comments