Skip to content

Commit 24b81eb

Browse files
ENH: Allow filtering by font when removing text (#3216)
1 parent 4716589 commit 24b81eb

File tree

2 files changed

+66
-7
lines changed

2 files changed

+66
-7
lines changed

pypdf/_writer.py

Lines changed: 50 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2042,6 +2042,7 @@ def remove_objects_from_page(
20422042
self,
20432043
page: Union[PageObject, DictionaryObject],
20442044
to_delete: Union[ObjectDeletionFlag, Iterable[ObjectDeletionFlag]],
2045+
text_filters: Optional[Dict[str, Any]] = None
20452046
) -> None:
20462047
"""
20472048
Remove objects specified by ``to_delete`` from the given page.
@@ -2050,6 +2051,10 @@ def remove_objects_from_page(
20502051
page: Page object to clean up.
20512052
to_delete: Objects to be deleted; can be a ``ObjectDeletionFlag``
20522053
or a list of ObjectDeletionFlag
2054+
text_filters: Properties of text to be deleted, if applicable. Optional.
2055+
This is a Python dictionary with the following properties:
2056+
2057+
* font_ids: List of font IDs (such as /F1 or /T1_0) to be deleted.
20532058
20542059
"""
20552060
if isinstance(to_delete, (list, tuple)):
@@ -2083,11 +2088,24 @@ def remove_objects_from_page(
20832088
if to_delete & ObjectDeletionFlag.TEXT:
20842089
jump_operators = [b"Tj", b"TJ", b"'", b'"']
20852090

2086-
def clean(content: ContentStream, images: List[str], forms: List[str]) -> None:
2091+
def clean(
2092+
content: ContentStream,
2093+
images: List[str],
2094+
forms: List[str],
2095+
text_filters: Optional[Dict[str, Any]] = None
2096+
) -> None:
20872097
nonlocal jump_operators, to_delete
2098+
2099+
font_id = None
2100+
font_ids_to_delete = []
2101+
if text_filters and to_delete & ObjectDeletionFlag.TEXT:
2102+
font_ids_to_delete = text_filters.get("font_ids", [])
2103+
20882104
i = 0
20892105
while i < len(content.operations):
20902106
operands, operator = content.operations[i]
2107+
if operator == b"Tf":
2108+
font_id = operands[0]
20912109
if (
20922110
(
20932111
operator == b"INLINE IMAGE"
@@ -2100,7 +2118,13 @@ def clean(content: ContentStream, images: List[str], forms: List[str]) -> None:
21002118
and (operands[0] in images)
21012119
)
21022120
):
2103-
del content.operations[i]
2121+
if (
2122+
not to_delete & ObjectDeletionFlag.TEXT
2123+
or (not font_ids_to_delete or font_id in font_ids_to_delete)
2124+
):
2125+
del content.operations[i]
2126+
else:
2127+
i += 1
21042128
else:
21052129
i += 1
21062130
content.get_data() # this ensures ._data is rebuilt from the .operations
@@ -2173,7 +2197,7 @@ def clean_forms(
21732197
e = ContentStream(elt, self)
21742198
e.update(elt.items())
21752199
elt = e
2176-
clean(elt, images, forms) # clean the content
2200+
clean(elt, images, forms, text_filters) # clean the content
21772201
return images, forms
21782202

21792203
if not isinstance(page, PageObject):
@@ -2183,7 +2207,7 @@ def clean_forms(
21832207

21842208
images, forms = clean_forms(page, [])
21852209

2186-
clean(content, images, forms)
2210+
clean(content, images, forms, text_filters)
21872211
page.replace_contents(content)
21882212

21892213
def remove_images(
@@ -2210,10 +2234,29 @@ def remove_images(
22102234
for page in self.pages:
22112235
self.remove_objects_from_page(page, i)
22122236

2213-
def remove_text(self) -> None:
2214-
"""Remove text from this output."""
2237+
def remove_text(self, font_names: Optional[List[str]] = None) -> None:
    """
    Remove text from the PDF.

    Args:
        font_names: List of font names to remove, such as "Helvetica-Bold".
            Optional. If not specified (or empty), all text will be removed.
            Names are compared against each font's ``/BaseFont`` entry with
            the leading ``/`` and any subset tag (the part up to ``+``, as
            in ``/ABCDEF+Helvetica-Bold``) stripped. Only fonts listed in
            the page's own ``/Resources`` ``/Font`` dictionary are
            inspected. Pages on which none of the requested fonts is found
            are left untouched.
    """
    if not font_names:
        # No filter requested: strip every text-showing operator.
        for page in self.pages:
            self.remove_objects_from_page(page, ObjectDeletionFlag.TEXT)
        return

    # Normalise the requested names the same way as /BaseFont values below,
    # so both "Helvetica-Bold" and "/Helvetica-Bold" are accepted.
    wanted_names = {name.lstrip("/") for name in font_names}

    for page in self.pages:
        fonts = page.get("/Resources", {}).get("/Font", {})
        # Content streams refer to fonts by resource ID (such as /F1 or
        # /T1_0), so translate the requested font names into those IDs.
        font_ids = [
            font_id
            for font_id, font_info in fonts.items()
            if font_info.get("/BaseFont", "").lstrip("/").split("+")[-1]
            in wanted_names
        ]
        if not font_ids:
            # remove_objects_from_page treats an empty "font_ids" list as
            # "no filter" and would delete ALL text on this page; a filter
            # that matches nothing must remove nothing instead.
            continue

        self.remove_objects_from_page(
            page,
            ObjectDeletionFlag.TEXT,
            text_filters={"font_ids": font_ids},
        )
22172260

22182261
def add_uri(
22192262
self,

tests/test_writer.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1386,6 +1386,22 @@ def test_new_removes():
13861386
assert b"Chap" not in bb
13871387
assert b" TJ" not in bb
13881388

1389+
writer = PdfWriter()
1390+
writer.clone_document_from_reader(reader)
1391+
b = BytesIO()
1392+
writer.write(b)
1393+
reader = PdfReader(b)
1394+
text = reader.pages[0].extract_text()
1395+
assert "Arbeitsschritt" in text
1396+
assert "Modelltechnik" in text
1397+
writer.remove_text(font_names=["LiberationSans-Bold"])
1398+
b = BytesIO()
1399+
writer.write(b)
1400+
reader = PdfReader(b)
1401+
text = reader.pages[0].extract_text()
1402+
assert "Arbeitsschritt" not in text
1403+
assert "Modelltechnik" in text
1404+
13891405
url = "https://github.com/py-pdf/pypdf/files/10832029/tt2.pdf"
13901406
name = "GeoBaseWithComments.pdf"
13911407
reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))

0 commit comments

Comments
 (0)