@@ -2042,6 +2042,7 @@ def remove_objects_from_page(
20422042 self ,
20432043 page : Union [PageObject , DictionaryObject ],
20442044 to_delete : Union [ObjectDeletionFlag , Iterable [ObjectDeletionFlag ]],
2045+ text_filters : Optional [Dict [str , Any ]] = None
20452046 ) -> None :
20462047 """
20472048 Remove objects specified by ``to_delete`` from the given page.
@@ -2050,6 +2051,10 @@ def remove_objects_from_page(
20502051 page: Page object to clean up.
20512052 to_delete: Objects to be deleted; can be a ``ObjectDeletionFlag``
20522053 or a list of ObjectDeletionFlag
2054+ text_filters: Properties of text to be deleted, if applicable. Optional.
2055+ This is a Python dictionary with the following properties:
2056+
2057+ * font_ids: List of font IDs (such as /F1 or /T1_0) to be deleted.
20532058
20542059 """
20552060 if isinstance (to_delete , (list , tuple )):
@@ -2083,11 +2088,24 @@ def remove_objects_from_page(
20832088 if to_delete & ObjectDeletionFlag .TEXT :
20842089 jump_operators = [b"Tj" , b"TJ" , b"'" , b'"' ]
20852090
2086- def clean (content : ContentStream , images : List [str ], forms : List [str ]) -> None :
2091+ def clean (
2092+ content : ContentStream ,
2093+ images : List [str ],
2094+ forms : List [str ],
2095+ text_filters : Optional [Dict [str , Any ]] = None
2096+ ) -> None :
20872097 nonlocal jump_operators , to_delete
2098+
2099+ font_id = None
2100+ font_ids_to_delete = []
2101+ if text_filters and to_delete & ObjectDeletionFlag .TEXT :
2102+ font_ids_to_delete = text_filters .get ("font_ids" , [])
2103+
20882104 i = 0
20892105 while i < len (content .operations ):
20902106 operands , operator = content .operations [i ]
2107+ if operator == b"Tf" :
2108+ font_id = operands [0 ]
20912109 if (
20922110 (
20932111 operator == b"INLINE IMAGE"
@@ -2100,7 +2118,13 @@ def clean(content: ContentStream, images: List[str], forms: List[str]) -> None:
21002118 and (operands [0 ] in images )
21012119 )
21022120 ):
2103- del content .operations [i ]
2121+ if (
2122+ not to_delete & ObjectDeletionFlag .TEXT
2123+ or (not font_ids_to_delete or font_id in font_ids_to_delete )
2124+ ):
2125+ del content .operations [i ]
2126+ else :
2127+ i += 1
21042128 else :
21052129 i += 1
21062130 content .get_data () # this ensures ._data is rebuilt from the .operations
@@ -2173,7 +2197,7 @@ def clean_forms(
21732197 e = ContentStream (elt , self )
21742198 e .update (elt .items ())
21752199 elt = e
2176- clean (elt , images , forms ) # clean the content
2200+ clean (elt , images , forms , text_filters ) # clean the content
21772201 return images , forms
21782202
21792203 if not isinstance (page , PageObject ):
@@ -2183,7 +2207,7 @@ def clean_forms(
21832207
21842208 images , forms = clean_forms (page , [])
21852209
2186- clean (content , images , forms )
2210+ clean (content , images , forms , text_filters )
21872211 page .replace_contents (content )
21882212
21892213 def remove_images (
@@ -2210,10 +2234,29 @@ def remove_images(
22102234 for page in self .pages :
22112235 self .remove_objects_from_page (page , i )
22122236
def remove_text(self, font_names: Optional[List[str]] = None) -> None:
    """
    Remove text from the PDF.

    Args:
        font_names: List of font names to remove, such as "Helvetica-Bold".
            Optional. If not specified (or empty), all text will be removed.
            Names are compared against each font's ``/BaseFont`` entry after
            stripping the leading ``/`` and any subset tag (``ABCDEF+``); a
            leading ``/`` on the supplied names is tolerated as well.
    """
    if not font_names:
        # No filter requested: strip every text-showing operator.
        for page in self.pages:
            self.remove_objects_from_page(page, ObjectDeletionFlag.TEXT)
        return

    # Build the lookup once; accept both "Helvetica" and "/Helvetica".
    wanted = {name.lstrip("/") for name in font_names}

    for page in self.pages:
        fonts = page.get("/Resources", {}).get("/Font", {})
        font_ids = [
            font_id
            for font_id, font_info in fonts.items()
            # "/ABCDEF+Name" -> "Name" and "/Name" -> "Name"
            if font_info.get("/BaseFont", "").split("+")[-1].lstrip("/") in wanted
        ]

        if not font_ids:
            # remove_objects_from_page treats an empty ``font_ids`` list as
            # "no filter" and would delete *all* text on the page, so a page
            # without any of the requested fonts must be left untouched.
            continue

        text_filters = {
            "font_ids": font_ids,
        }
        self.remove_objects_from_page(
            page, ObjectDeletionFlag.TEXT, text_filters=text_filters
        )
22172260
22182261 def add_uri (
22192262 self ,
0 commit comments