@@ -95,6 +95,22 @@ def __init__(self) -> None:
9595 self .visitor_text : Optional [Callable [[Any , Any , Any , Any , Any ], None ]] = None
9696 self .cmaps : Dict [str , Tuple [str , float , Union [str , Dict [int , str ]], Dict [str , str ], DictionaryObject ]] = {}
9797
98+ self .operation_handlers = {
99+ b"BT" : self ._handle_bt ,
100+ b"ET" : self ._handle_et ,
101+ b"q" : self ._handle_save_graphics_state ,
102+ b"Q" : self ._handle_restore_graphics_state ,
103+ b"cm" : self ._handle_cm ,
104+ b"Tz" : self ._handle_tz ,
105+ b"Tw" : self ._handle_tw ,
106+ b"TL" : self ._handle_tl ,
107+ b"Tf" : self ._handle_tf ,
108+ b"Td" : self ._handle_td ,
109+ b"Tm" : self ._handle_tm ,
110+ b"T*" : self ._handle_t_star ,
111+ b"Tj" : self ._handle_tj_operation ,
112+ }
113+
98114 def initialize_extraction (
99115 self ,
100116 orientations : Tuple [int , ...] = (0 , 90 , 180 , 270 ),
@@ -117,173 +133,36 @@ def compute_str_widths(self, str_widths: float) -> float:
117133 return str_widths / 1000
118134
def process_operation(self, operator: bytes, operands: List[Any]) -> None:
    """
    Dispatch one content-stream operator to its registered handler.

    The handler is looked up in ``self.operation_handlers``. Operators
    without a registered handler are ignored. For the text positioning
    operators ``Td``, ``Tm``, ``T*`` and ``Tj`` the value returned by the
    handler (``None`` is treated as 0.0) is forwarded to
    ``_post_process_text_operation``.

    Args:
        operator: The operator name as raw bytes, e.g. ``b"Tj"``.
        operands: The operands that belong to the operator.

    """
    handler = self.operation_handlers.get(operator)
    if handler is None:
        # Unknown operator: nothing to do. Returning here also guarantees
        # that ``str_widths`` is always bound before it is read below.
        return
    str_widths = handler(operands)

    # Post-process operations that affect text positioning
    if operator in {b"Td", b"Tm", b"T*", b"Tj"}:
        self._post_process_text_operation(str_widths or 0.0)
143+
def _post_process_text_operation(self, str_widths: float) -> None:
    """
    Run the shared follow-up step for text positioning operations.

    The current text buffer, the previous/current/memoized matrix pairs and
    the size information are handed to ``crlf_space_check``; the text,
    output and previous matrices it returns are stored back on the
    instance. If the text buffer is empty afterwards, the memoized matrices
    are refreshed from the current ones. An ``OrientationNotFoundError``
    ends the step without further changes.

    Args:
        str_widths: Width value produced by the operator handler.

    """
    try:
        previous = (self.cm_prev, self.tm_prev)
        current = (self.cm_matrix, self.tm_matrix)
        memo = (self.memo_cm, self.memo_tm)
        space_width = self.compute_str_widths(self._actual_str_size["space_width"])
        str_height = self._actual_str_size["str_height"]
        checked = crlf_space_check(
            self.text,
            previous,
            current,
            memo,
            self.cmap,
            self.orientations,
            self.output,
            self.font_size,
            self.visitor_text,
            str_widths,
            space_width,
            str_height,
        )
        self.text, self.output, self.cm_prev, self.tm_prev = checked
        if self.text == "":
            # The buffer was emitted: remember where the next text starts.
            self.memo_cm = self.cm_matrix.copy()
            self.memo_tm = self.tm_matrix.copy()
    except OrientationNotFoundError:
        return
287166
288167 def _get_actual_font_widths (
289168 self ,
@@ -357,3 +236,165 @@ def _handle_tj(
357236 actual_str_size ["str_widths" ] += font_widths
358237
359238 return text , rtl_dir , actual_str_size
239+
def _flush_text(self) -> None:
    """
    Move the buffered text to the output and reset the buffer.

    The buffered text is appended to ``self.output`` and, when a text
    visitor is set, reported to it together with the memoized matrices,
    ``self.cmap[3]`` and the font size. Afterwards the buffer is emptied and
    the memoized matrices become copies of the current ones.
    """
    pending = self.text
    self.output += pending
    visitor = self.visitor_text
    if visitor is not None:
        visitor(pending, self.memo_cm, self.memo_tm, self.cmap[3], self.font_size)
    self.text = ""
    self.memo_cm, self.memo_tm = self.cm_matrix.copy(), self.tm_matrix.copy()
248+
249+ # Operation handlers
250+
def _handle_bt(self, operands: List[Any]) -> None:
    """
    Handle BT (Begin Text) operation - Table 5.4 page 405.

    Resets the text matrix to the identity matrix and then flushes the
    buffered text. ``operands`` is not used.
    """
    identity = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
    # Reset before flushing: the flush snapshots the current text matrix.
    self.tm_matrix = identity
    self._flush_text()
255+
def _handle_et(self, operands: List[Any]) -> None:
    """
    Handle ET (End Text) operation - Table 5.4 page 405.

    Flushes the buffered text via ``_flush_text``. ``operands`` is not used.
    """
    self._flush_text()
259+
def _handle_save_graphics_state(self, operands: List[Any]) -> None:
    """
    Handle q (Save graphics state) operation - Table 4.7 page 219.

    Pushes a snapshot tuple of the current matrix, cmap, font size, character
    scale, space scale, space width and text leading onto ``self.cm_stack``.
    The values are stored as they are (no copies). ``operands`` is not used.
    """
    snapshot = (
        self.cm_matrix,
        self.cmap,
        self.font_size,
        self.char_scale,
        self.space_scale,
        self._space_width,
        self.TL,
    )
    self.cm_stack.append(snapshot)
273+
def _handle_restore_graphics_state(self, operands: List[Any]) -> None:
    """
    Handle Q (Restore graphics state) operation - Table 4.7 page 219.

    Pops the most recent snapshot from ``self.cm_stack`` and restores the
    current matrix, cmap, font size, character scale, space scale, space
    width and text leading from it. If that fails for any reason (for
    example because the stack is empty), only the current matrix is reset
    to the identity matrix. ``operands`` is not used.
    """
    try:
        saved = self.cm_stack.pop()
        (
            self.cm_matrix,
            self.cmap,
            self.font_size,
            self.char_scale,
            self.space_scale,
            self._space_width,
            self.TL,
        ) = saved
    except Exception:
        # Unbalanced Q or unusable snapshot: fall back to the identity matrix.
        self.cm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
288+
def _handle_cm(self, operands: List[Any]) -> None:
    """
    Handle cm (Modify current matrix) operation - Table 4.7 page 219.

    The buffered text is emitted first (and reported to the text visitor, if
    any) using the matrices memoized so far. Then the first six operands are
    converted to floats and combined with the current matrix through
    ``mult``; if that fails, the current matrix is reset to the identity
    matrix. Finally the memoized matrices are refreshed from the current
    ones.
    """
    pending = self.text
    self.output += pending
    visitor = self.visitor_text
    if visitor is not None:
        visitor(pending, self.memo_cm, self.memo_tm, self.cmap[3], self.font_size)
    self.text = ""
    try:
        updated = mult([float(value) for value in operands[:6]], self.cm_matrix)
    except Exception:
        updated = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
    self.cm_matrix = updated
    # Snapshot only after the matrix change so the memo reflects the new state.
    self.memo_cm = self.cm_matrix.copy()
    self.memo_tm = self.tm_matrix.copy()
301+
def _handle_tz(self, operands: List[Any]) -> None:
    """
    Handle Tz (Set horizontal text scaling) operation - Table 5.2 page 398.

    Stores the first operand divided by 100 as ``self.char_scale``; without
    operands the scale is set to 1.0.
    """
    if operands:
        self.char_scale = float(operands[0]) / 100
    else:
        self.char_scale = 1.0
305+
def _handle_tw(self, operands: List[Any]) -> None:
    """
    Handle Tw (Set word spacing) operation - Table 5.2 page 398.

    Stores ``1.0`` plus the first operand (0.0 when there are no operands)
    as ``self.space_scale``.
    """
    word_spacing = operands[0] if operands else 0.0
    self.space_scale = 1.0 + float(word_spacing)
309+
def _handle_tl(self, operands: List[Any]) -> None:
    """
    Handle TL (Set Text Leading) operation - Table 5.2 page 398.

    Stores the first operand (0.0 when there are no operands) multiplied by
    the font size and by a scale factor derived from elements 0 and 2 of the
    text matrix as ``self.TL``.
    """
    first, third = self.tm_matrix[0], self.tm_matrix[2]
    scale_x = math.sqrt(first ** 2 + third ** 2)
    leading = operands[0] if operands else 0.0
    self.TL = float(leading) * self.font_size * scale_x
314+
def _handle_tf(self, operands: List[Any]) -> None:
    """
    Handle Tf (Set text font and size) operation - Table 5.2 page 398.

    Buffered text, if any, is flushed first. The font resource name in
    ``operands[0]`` is then looked up in ``self.cmaps`` to select the
    current cmap and space width; when the name is unknown, the values of
    ``unknown_char_map`` are used and the name is stored prefixed with
    ``???``. Finally ``operands[1]`` is taken as the new font size; if it
    cannot be converted, the previous size is kept.
    """
    if self.text != "":
        # Pending text was written with the previous font: emit it first.
        self._flush_text()
    try:
        # Import here to avoid circular imports
        from .._cmap import unknown_char_map  # noqa: PLC0415

        # Entry layout: (font_type, float(sp_width / 2), encoding,
        #                map_dict, font_dict describing the font)
        entry = self.cmaps[operands[0]]
        # Current cmap layout: (encoding, map_dict,
        #                       font resource name (internal name, not the
        #                       real font name), font_dict)
        self.cmap = (entry[2], entry[3], operands[0], entry[4])
        self._space_width = entry[1]
    except KeyError:  # font not found
        self.cmap = (
            unknown_char_map[2],
            unknown_char_map[3],
            f"???{operands[0]}",
            None,
        )
        self._space_width = unknown_char_map[1]
    try:
        self.font_size = float(operands[1])
    except Exception:
        pass  # keep previous size
357+
def _handle_td(self, operands: List[Any]) -> float:
    """
    Handle Td (Move text position) operation - Table 5.5 page 406.

    Translates the text matrix by the offsets in ``operands[0]`` and
    ``operands[1]``, resets the accumulated string width and returns the
    width accumulated so far, converted by ``compute_str_widths``.
    """
    tx = float(operands[0])
    ty = float(operands[1])
    matrix = self.tm_matrix
    # For a translating-only matrix [1, 0, 0, 1, e, f] this reduces to
    # e += tx and f += ty.
    matrix[4] += tx * matrix[0] + ty * matrix[2]
    matrix[5] += tx * matrix[1] + ty * matrix[3]
    size_info = self._actual_str_size
    accumulated = self.compute_str_widths(size_info["str_widths"])
    size_info["str_widths"] = 0.0
    return accumulated
369+
def _handle_tm(self, operands: List[Any]) -> float:
    """
    Handle Tm (Set text matrix) operation - Table 5.5 page 406.

    Replaces the text matrix with the first six operands converted to
    floats, resets the accumulated string width and returns the width
    accumulated so far, converted by ``compute_str_widths``.
    """
    self.tm_matrix = list(map(float, operands[:6]))
    size_info = self._actual_str_size
    accumulated = self.compute_str_widths(size_info["str_widths"])
    size_info["str_widths"] = 0.0
    return accumulated
376+
def _handle_t_star(self, operands: List[Any]) -> float:
    """
    Handle T* (Move to next line) operation - Table 5.5 page 406.

    Moves the text matrix translation by the text leading ``self.TL``,
    resets the accumulated string width and returns the width accumulated
    so far, converted by ``compute_str_widths``. ``operands`` is not used.
    """
    matrix = self.tm_matrix
    leading = self.TL
    matrix[4] -= leading * matrix[2]
    matrix[5] -= leading * matrix[3]
    size_info = self._actual_str_size
    accumulated = self.compute_str_widths(size_info["str_widths"])
    size_info["str_widths"] = 0.0
    return accumulated
384+
def _handle_tj_operation(self, operands: List[Any]) -> float:
    """
    Handle Tj (Show text) operation - Table 5.5 page 406.

    Delegates to ``_handle_tj`` with the current extraction state and stores
    the returned text, right-to-left flag and string size information back
    on the instance.

    Returns:
        Always 0.0; str_widths will be handled in post-processing.

    """
    result = self._handle_tj(
        self.text,
        operands,
        self.cm_matrix,
        self.tm_matrix,
        self.cmap,
        self.orientations,
        self.font_size,
        self.rtl_dir,
        self.visitor_text,
        self._space_width,
        self._actual_str_size,
    )
    self.text, self.rtl_dir, self._actual_str_size = result
    return 0.0
0 commit comments