Skip to content

Commit ef81d5a

Browse files
ENH: Allow deleting embedded files (#3461)
Closes #3378.
1 parent c2b3bb1 commit ef81d5a

File tree

3 files changed

+58
-5
lines changed

3 files changed

+58
-5
lines changed

docs/user/handle-attachments.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,3 +71,16 @@ embedded_file.write("output.pdf")
7171

7272
The same functionality is available if you iterate over the attachments of a writer
7373
using `writer.attachment_list`.
74+
75+
## Delete Attachments
76+
77+
To delete an existing attachment, use the following code:
78+
79+
```python
80+
from pypdf import PdfWriter
81+
82+
writer = PdfWriter(clone_from="example.pdf")
83+
attachment = writer.add_attachment(filename="test.txt", data=b"Hello World!")
84+
attachment.delete()
85+
assert list(writer.attachment_list) == []
86+
```

pypdf/generic/_files.py

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from pypdf.constants import CatalogAttributes as CA
88
from pypdf.constants import FileSpecificationDictionaryEntries
99
from pypdf.constants import PageAttributes as PG
10-
from pypdf.errors import PdfReadError
10+
from pypdf.errors import PdfReadError, PyPdfError
1111
from pypdf.generic import (
1212
ArrayObject,
1313
ByteStringObject,
@@ -36,14 +36,16 @@ class EmbeddedFile:
3636
3737
Further information on embedded files can be found in section 7.11 of the PDF 2.0 specification.
3838
"""
39-
def __init__(self, name: str, pdf_object: DictionaryObject) -> None:
39+
def __init__(self, name: str, pdf_object: DictionaryObject, parent: ArrayObject | None = None) -> None:
4040
"""
4141
Args:
4242
name: The (primary) name as provided in the name tree.
4343
pdf_object: The corresponding PDF object to allow retrieving further data.
44+
parent: The parent list.
4445
"""
4546
self._name = name
4647
self.pdf_object = pdf_object
48+
self._parent = parent
4749

4850
@property
4951
def name(self) -> str:
@@ -105,7 +107,7 @@ def _create_new(cls, writer: PdfWriter, name: str, content: str | bytes) -> Embe
105107
names_array.extend([create_string_object(name), filespec])
106108

107109
# Return an EmbeddedFile instance
108-
return cls(name=name, pdf_object=filespec)
110+
return cls(name=name, pdf_object=filespec, parent=names_array)
109111

110112
@property
111113
def alternative_name(self) -> str | None:
@@ -276,6 +278,17 @@ def checksum(self, value: ByteStringObject | None) -> None:
276278
else:
277279
params[NameObject("/CheckSum")] = value
278280

281+
def delete(self) -> None:
    """
    Delete the file from the document.

    Removes the name and the file specification entry belonging to this
    file from the parent names array and replaces ``pdf_object`` with an
    empty dictionary afterwards, so this instance no longer refers to the
    removed file.

    Raises:
        PyPdfError: If no parent array has been provided, if the file
            cannot be found inside the parent array, or if the file entry
            has no preceding name entry.
    """
    # Compare against None explicitly: an empty array is still a valid parent
    # and should report "not found" instead of "parent required".
    if self._parent is None:
        raise PyPdfError("Parent required to delete file from document.")

    # The names array may contain the file specification either directly or
    # as an indirect reference, while `pdf_object` is the resolved dictionary.
    # Try the dictionary first, then its indirect reference (if it has one).
    candidates = [self.pdf_object]
    reference = getattr(self.pdf_object, "indirect_reference", None)
    if reference is not None:
        candidates.append(reference)

    for candidate in candidates:
        if candidate in self._parent:
            index = self._parent.index(candidate)
            break
    else:
        raise PyPdfError("File not found in parent object.")

    # Entries are stored as (name, file specification) pairs. Without this
    # guard, `index - 1` would wrap around to the last array element.
    if index == 0:
        raise PyPdfError("File has no preceding name in parent object.")

    self._parent.pop(index)  # Reference.
    self._parent.pop(index - 1)  # Name.
    self.pdf_object = DictionaryObject()  # Invalidate.
291+
279292
def __repr__(self) -> str:
280293
return f"<{self.__class__.__name__} name={self.name!r}>"
281294

@@ -296,7 +309,7 @@ def _load_from_names(cls, names: ArrayObject) -> Generator[EmbeddedFile]:
296309
# Skip plain strings and retrieve them as `direct_name` by index.
297310
file_dictionary = name.get_object()
298311
direct_name = names[i - 1].get_object()
299-
yield EmbeddedFile(name=direct_name, pdf_object=file_dictionary)
312+
yield EmbeddedFile(name=direct_name, pdf_object=file_dictionary, parent=names)
300313

301314
@classmethod
302315
def _load(cls, catalog: DictionaryObject) -> Generator[EmbeddedFile]:

tests/generic/test_files.py

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
import pytest
99

1010
from pypdf import PdfReader, PdfWriter
11-
from pypdf.errors import PdfReadError
11+
from pypdf.errors import PdfReadError, PyPdfError
1212
from pypdf.generic import (
1313
ByteStringObject,
1414
DictionaryObject,
@@ -394,3 +394,30 @@ def test_embedded_file_null_object_handling():
394394
assert embedded_file.subtype is None
395395
assert embedded_file.size is None
396396
assert embedded_file.checksum is None
397+
398+
399+
def test_embedded_file__delete_without_parent():
400+
attachment = EmbeddedFile(name="test.txt", pdf_object=DictionaryObject())
401+
with pytest.raises(PyPdfError, match=r"^Parent required to delete file from document\.$"):
402+
attachment.delete()
403+
404+
405+
def test_embedded_file__delete_known():
406+
writer = PdfWriter()
407+
writer.add_blank_page(100, 100)
408+
attachment = writer.add_attachment("test.txt", b"content")
409+
writer.add_attachment("test2.txt", b"content2")
410+
411+
attachments = list(writer.attachment_list)
412+
assert len(attachments) == 2
413+
attachment.delete()
414+
with pytest.raises(PdfReadError, match=r"^/EF entry not found: {}$"):
415+
_ = attachment.content
416+
417+
attachments = list(writer.attachment_list)
418+
assert len(attachments) == 1
419+
assert attachments[0].name == "test2.txt"
420+
421+
# Delete second time.
422+
with pytest.raises(PyPdfError, match=r"^File not found in parent object\.$"):
423+
attachment.delete()

0 commit comments

Comments
 (0)