# Copyright (c) 2006, Mathieu Fenniak
# Copyright (c) 2007, Ashish Kulkarni <[email protected]>
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
# * The name of the author may not be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
29+
30+ from typing import Tuple , Union , Dict , Optional , Any , List , Callable
31+
32+ from .._cmap import get_actual_str_key , build_font_width_map , compute_font_width
33+ from ..generic import DictionaryObject , TextStringObject
34+ from . import get_text_operands , get_display_str
35+
class TextExtraction:
    """
    A class to handle PDF text extraction operations.

    This class encapsulates all the state and operations needed for extracting
    text from PDF content streams, replacing the nested functions and nonlocal
    variables in the original implementation.

    The only instance state is ``_font_width_maps``, a per-font-name cache of
    width information that is filled lazily by ``_get_actual_font_widths``.
    """

    def __init__(self) -> None:
        # font name -> (width map, key used for the space character,
        #               width charged for one space character)
        self._font_width_maps: Dict[str, Tuple[Dict[Any, float], str, float]] = {}

    def _get_actual_font_widths(
        self,
        cmap: Tuple[
            Union[str, Dict[int, str]], Dict[str, str], str, Optional["DictionaryObject"]
        ],
        text_operands: str,
        font_size: float,
        space_width: float,
    ) -> Tuple[float, float, float]:
        """
        Measure ``text_operands`` with the width information cached for its font.

        The font is identified by ``cmap[2]``. On the first call for a given
        font name a cache entry is built and stored in ``_font_width_maps``:

        * if ``cmap[3]`` is ``None``, the width map only holds a ``"default"``
          entry of ``space_width * 2`` and the space key is a literal ``" "``;
        * otherwise the space key is obtained from ``get_actual_str_key`` and
          the width map from ``build_font_width_map`` (again with
          ``space_width * 2`` as its second argument).

        A computed space width of ``0`` is replaced by ``space_width``. Later
        calls for the same font name reuse the cached entry as-is, whatever
        ``space_width`` they pass.

        Args:
            cmap: Tuple whose items 0 and 1 are forwarded to
                ``get_actual_str_key``, item 2 is the font name used as cache
                key, and item 3 is forwarded to ``build_font_width_map``
                (or ``None``).
            text_operands: The characters to measure; a falsy value measures
                as zero width.
            font_size: Factor applied to every returned width.
            space_width: Unscaled space width supplied by the caller.

        Returns:
            ``(summed character widths * font_size, space_width * font_size,
            font_size)``. Note that the second item is derived from the
            ``space_width`` argument, not from the cached space width.
        """
        font_name: str = cmap[2]
        cache_entry = self._font_width_maps.get(font_name)
        if cache_entry is None:
            font_resource = cmap[3]
            if font_resource is None:
                # No font dictionary available: only a default width is known.
                space_key = " "
                space_advance: float = space_width
                width_map: Dict[Any, float] = {"default": space_advance * 2}
            else:
                space_key = get_actual_str_key(" ", cmap[0], cmap[1])
                width_map = build_font_width_map(font_resource, space_width * 2)
                space_advance = compute_font_width(width_map, space_key)
            if space_advance == 0:
                # Fall back to the caller-supplied width when none was found.
                space_advance = space_width
            cache_entry = (width_map, space_key, space_advance)
            self._font_width_maps[font_name] = cache_entry
        width_map, space_key, space_advance = cache_entry

        # Spaces are charged the cached space width; every other character is
        # looked up through compute_font_width.
        total_width: float = sum(
            space_advance if char == space_key else compute_font_width(width_map, char)
            for char in (text_operands or "")
        )
        return (total_width * font_size, space_width * font_size, font_size)

    def _handle_tj(
        self,
        text: str,
        operands: List[Union[str, "TextStringObject"]],
        cm_matrix: List[float],
        tm_matrix: List[float],
        cmap: Tuple[
            Union[str, Dict[int, str]], Dict[str, str], str, Optional["DictionaryObject"]
        ],
        orientations: Tuple[int, ...],
        font_size: float,
        rtl_dir: bool,
        visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
        space_width: float,
        actual_str_size: Dict[str, float],
    ) -> Tuple[str, bool, Dict[str, float]]:
        """
        Append the text of one text-showing operation and record its size.

        NOTE(review): the name suggests this handles the PDF ``Tj`` operator;
        confirm against the caller.

        ``get_text_operands`` yields the operand text plus a flag. When the
        flag is truthy the operand text is appended to ``text`` directly;
        otherwise ``get_display_str`` produces the new ``text`` and
        ``rtl_dir``. In both cases the operand text is then measured with
        ``_get_actual_font_widths``.

        Args:
            text: Text accumulated so far.
            operands: Operands forwarded to ``get_text_operands``.
            cm_matrix: Forwarded to ``get_text_operands`` and ``get_display_str``.
            tm_matrix: Text matrix, forwarded to the same helpers.
            cmap: Character-map tuple, see ``_get_actual_font_widths``.
            orientations: Forwarded to ``get_text_operands``.
            font_size: Font size used for display and for scaling widths.
            rtl_dir: Current right-to-left flag.
            visitor_text: Optional callback forwarded to ``get_display_str``.
            space_width: Unscaled space width for the width computation.
            actual_str_size: Dict updated in place: ``"space_width"`` and
                ``"str_height"`` are overwritten and ``"str_widths"`` is
                incremented (the key must already exist).

        Returns:
            ``(text, rtl_dir, actual_str_size)`` where ``actual_str_size`` is
            the same dict object that was passed in.
        """
        text_operands, is_str_operands = get_text_operands(
            operands, cm_matrix, tm_matrix, cmap, orientations
        )
        if not is_str_operands:
            text, rtl_dir = get_display_str(
                text,
                cm_matrix,
                tm_matrix,  # text matrix
                cmap,
                text_operands,
                font_size,
                rtl_dir,
                visitor_text,
            )
        else:
            text += text_operands

        measured_width, scaled_space_width, str_height = self._get_actual_font_widths(
            cmap, text_operands, font_size, space_width
        )
        actual_str_size["space_width"] = scaled_space_width
        actual_str_size["str_height"] = str_height
        actual_str_size["str_widths"] += measured_width

        return text, rtl_dir, actual_str_size
0 commit comments