diff --git a/docs/user/metadata.md b/docs/user/metadata.md index 1b94fdb8d..0b4ff37db 100644 --- a/docs/user/metadata.md +++ b/docs/user/metadata.md @@ -121,6 +121,141 @@ if meta: print(meta.xmp_create_date) ``` +## Creating XMP metadata + +You can create XMP metadata easily using the `XmpInformation.create()` method: + +```python +from pypdf import PdfWriter +from pypdf.xmp import XmpInformation + +# Create a new XMP metadata object +xmp = XmpInformation.create() + +# Set metadata fields +xmp.dc_title = {"x-default": "My Document Title"} +xmp.dc_creator = ["Author One", "Author Two"] +xmp.dc_description = {"x-default": "Document description"} +xmp.dc_subject = ["keyword1", "keyword2", "keyword3"] +xmp.pdf_producer = "pypdf" + +# Create a writer and add the metadata +writer = PdfWriter() +writer.add_blank_page(612, 792) # Add a page +writer.xmp_metadata = xmp +writer.write("output.pdf") +``` + +## Setting XMP metadata fields + +The `XmpInformation` class provides property-based access for all supported metadata fields: + +### Dublin Core fields + +```python +from datetime import datetime +from pypdf.xmp import XmpInformation + +xmp = XmpInformation.create() + +# Single value fields +xmp.dc_coverage = "Global coverage" +xmp.dc_format = "application/pdf" +xmp.dc_identifier = "unique-id-123" +xmp.dc_source = "Original Source" + +# Array fields (bags - unordered) +xmp.dc_contributor = ["Contributor One", "Contributor Two"] +xmp.dc_language = ["en", "fr", "de"] +xmp.dc_publisher = ["Publisher One"] +xmp.dc_relation = ["Related Doc 1", "Related Doc 2"] +xmp.dc_subject = ["keyword1", "keyword2"] +xmp.dc_type = ["Document", "Text"] + +# Sequence fields (ordered arrays) +xmp.dc_creator = ["Primary Author", "Secondary Author"] +xmp.dc_date = [datetime.now()] + +# Language alternative fields +xmp.dc_title = {"x-default": "Title", "en": "English Title", "fr": "Titre français"} +xmp.dc_description = {"x-default": "Description", "en": "English Description"} +xmp.dc_rights = 
{"x-default": "All rights reserved"} +``` + +### XMP fields + +```python +from datetime import datetime + +# Date fields accept both datetime objects and strings +xmp.xmp_create_date = datetime.now() +xmp.xmp_modify_date = "2023-12-25T10:30:45Z" +xmp.xmp_metadata_date = datetime.now() + +# Text field +xmp.xmp_creator_tool = "pypdf" +``` + +### PDF fields + +```python +xmp.pdf_keywords = "keyword1, keyword2, keyword3" +xmp.pdf_pdfversion = "1.4" +xmp.pdf_producer = "pypdf" +``` + +### XMP Media Management fields + +```python +xmp.xmpmm_document_id = "uuid:12345678-1234-1234-1234-123456789abc" +xmp.xmpmm_instance_id = "uuid:87654321-4321-4321-4321-cba987654321" +``` + +### PDF/A fields + +```python +xmp.pdfaid_part = "1" +xmp.pdfaid_conformance = "B" +``` + +### Clearing metadata fields + +You can clear any field by assigning `None`: + +```python +xmp.dc_title = None +xmp.dc_creator = None +xmp.pdf_producer = None +``` + +### Incrementally updating XMP metadata fields + +When modifying existing XMP metadata, it is often necessary to add or update individual entries while preserving existing values. The XMP properties return standard Python data structures that can be manipulated directly: + +```python +from pypdf.xmp import XmpInformation + +xmp = XmpInformation.create() + +# Language alternative fields return dictionaries +title = xmp.dc_title or {} +title["en"] = "English Title" +title["fr"] = "Titre français" +xmp.dc_title = title + +# Bag fields (unordered collections) return lists +subjects = xmp.dc_subject or [] +subjects.append("new_keyword") +xmp.dc_subject = subjects + +# Sequence fields (ordered collections) return lists +creators = xmp.dc_creator or [] +creators.append("New Author") +xmp.dc_creator = creators +``` + +This approach provides direct control over the data structures while maintaining the property-based interface. + ## Modifying XMP metadata Modifying XMP metadata is a bit more complicated. 
diff --git a/pypdf/errors.py b/pypdf/errors.py index 64b5da14b..8fec6e13a 100644 --- a/pypdf/errors.py +++ b/pypdf/errors.py @@ -68,3 +68,7 @@ class EmptyImageDataError(PyPdfError): class LimitReachedError(PyPdfError): """Raised when a limit is reached.""" + + +class XmpDocumentError(PyPdfError, RuntimeError): + """Raised when the XMP XML document context is invalid or missing.""" diff --git a/pypdf/xmp.py b/pypdf/xmp.py index c45f2d183..6d5d544d5 100644 --- a/pypdf/xmp.py +++ b/pypdf/xmp.py @@ -8,6 +8,7 @@ import decimal import re from collections.abc import Iterator +from io import StringIO from typing import ( Any, Callable, @@ -21,7 +22,7 @@ from ._protocols import XmpInformationProtocol from ._utils import StreamType, deprecate_with_replacement, deprecation_no_replacement -from .errors import PdfReadError +from .errors import PdfReadError, XmpDocumentError from .generic import ContentStream, PdfObject RDF_NAMESPACE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#" @@ -53,6 +54,16 @@ # PDF/A PDFAID_NAMESPACE = "http://www.aiim.org/pdfa/ns/id/" +# Internal mapping of namespace URI → prefix +_NAMESPACE_PREFIX_MAP = { + DC_NAMESPACE: "dc", + XMP_NAMESPACE: "xmp", + PDF_NAMESPACE: "pdf", + XMPMM_NAMESPACE: "xmpMM", + PDFAID_NAMESPACE: "pdfaid", + PDFX_NAMESPACE: "pdfx", +} + iso8601 = re.compile( """ (?P[0-9]{4}) @@ -75,6 +86,22 @@ K = TypeVar("K") +# Minimal XMP template +_MINIMAL_XMP = f""" + + + + + + +""" + def _identity(value: K) -> K: return value @@ -108,7 +135,7 @@ def _converter_date(value: str) -> datetime.datetime: def _generic_get( - element: XmlElement, self: "XmpInformation", list_type: str, converter: Callable[[Any], Any] = _identity + element: XmlElement, self: Any, list_type: str, converter: Callable[[Any], Any] = _identity ) -> Optional[list[str]]: containers = element.getElementsByTagNameNS(RDF_NAMESPACE, list_type) retval: list[Any] = [] @@ -122,10 +149,103 @@ def _generic_get( return None -def _getter_bag( - namespace: str, name: str -) -> 
Callable[["XmpInformation"], Optional[list[str]]]: - def get(self: "XmpInformation") -> Optional[list[str]]: +class XmpInformation(XmpInformationProtocol, PdfObject): + """ + An object that represents Extensible Metadata Platform (XMP) metadata. + Usually accessed by :py:attr:`xmp_metadata()`. + + Raises: + PdfReadError: if XML is invalid + + """ + + def __init__(self, stream: ContentStream) -> None: + self.stream = stream + try: + data = self.stream.get_data() + doc_root: Document = parseString(data) # noqa: S318 + except (AttributeError, ExpatError) as e: + raise PdfReadError(f"XML in XmpInformation was invalid: {e}") + self.rdf_root: XmlElement = doc_root.getElementsByTagNameNS( + RDF_NAMESPACE, "RDF" + )[0] + self.cache: dict[Any, Any] = {} + + @classmethod + def create(cls) -> "XmpInformation": + """ + Create a new XmpInformation object with minimal structure. + + Returns: + A new XmpInformation instance with empty metadata fields. + """ + stream = ContentStream(None, None) + stream.set_data(_MINIMAL_XMP.encode("utf-8")) + return cls(stream) + + def write_to_stream( + self, stream: StreamType, encryption_key: Union[None, str, bytes] = None + ) -> None: + deprecate_with_replacement( + "XmpInformation.write_to_stream", + "PdfWriter.xmp_metadata", + "6.0.0" + ) + if encryption_key is not None: # deprecated + deprecation_no_replacement( + "the encryption_key parameter of write_to_stream", "5.0.0" + ) + self.stream.write_to_stream(stream) + + def get_element(self, about_uri: str, namespace: str, name: str) -> Iterator[Any]: + for desc in self.rdf_root.getElementsByTagNameNS(RDF_NAMESPACE, "Description"): + if desc.getAttributeNS(RDF_NAMESPACE, "about") == about_uri: + attr = desc.getAttributeNodeNS(namespace, name) + if attr is not None: + yield attr + yield from desc.getElementsByTagNameNS(namespace, name) + + def get_nodes_in_namespace(self, about_uri: str, namespace: str) -> Iterator[Any]: + for desc in self.rdf_root.getElementsByTagNameNS(RDF_NAMESPACE, 
"Description"): + if desc.getAttributeNS(RDF_NAMESPACE, "about") == about_uri: + for i in range(desc.attributes.length): + attr = desc.attributes.item(i) + if attr and attr.namespaceURI == namespace: + yield attr + for child in desc.childNodes: + if child.namespaceURI == namespace: + yield child + + def _get_text(self, element: XmlElement) -> str: + text = "" + for child in element.childNodes: + if child.nodeType == child.TEXT_NODE: + text += child.data + return text + + def _get_single_value( + self, + namespace: str, + name: str, + converter: Callable[[str], Any] = _identity, + ) -> Optional[Any]: + cached = self.cache.get(namespace, {}).get(name) + if cached: + return cached + value = None + for element in self.get_element("", namespace, name): + if element.nodeType == element.ATTRIBUTE_NODE: + value = element.nodeValue + else: + value = self._get_text(element) + break + if value is not None: + value = converter(value) + ns_cache = self.cache.setdefault(namespace, {}) + ns_cache[name] = value + return value + + def _getter_bag(self, namespace: str, name: str) -> Optional[list[str]]: cached = self.cache.get(namespace, {}).get(name) if cached: return cached @@ -140,17 +260,16 @@ def get(self: "XmpInformation") -> Optional[list[str]]: ns_cache[name] = retval return retval - return get - - -def _getter_seq( - namespace: str, name: str, converter: Callable[[Any], Any] = _identity -) -> Callable[["XmpInformation"], Optional[list[Any]]]: - def get(self: "XmpInformation") -> Optional[list[Any]]: + def _get_seq_values( + self, + namespace: str, + name: str, + converter: Callable[[Any], Any] = _identity, + ) -> Optional[list[Any]]: cached = self.cache.get(namespace, {}).get(name) if cached: return cached - retval = [] + retval: list[Any] = [] for element in self.get_element("", namespace, name): if (seqs := _generic_get(element, self, list_type="Seq", converter=converter)) is not None: retval.extend(seqs) @@ -170,17 +289,11 @@ def get(self: "XmpInformation") -> 
Optional[list[Any]]: ns_cache[name] = retval return retval - return get - - -def _getter_langalt( - namespace: str, name: str -) -> Callable[["XmpInformation"], Optional[dict[Any, Any]]]: - def get(self: "XmpInformation") -> Optional[dict[Any, Any]]: + def _get_langalt_values(self, namespace: str, name: str) -> Optional[dict[Any, Any]]: cached = self.cache.get(namespace, {}).get(name) if cached: return cached - retval = {} + retval: dict[Any, Any] = {} for element in self.get_element("", namespace, name): alts = element.getElementsByTagNameNS(RDF_NAMESPACE, "Alt") if len(alts): @@ -194,204 +307,260 @@ def get(self: "XmpInformation") -> Optional[dict[Any, Any]]: ns_cache[name] = retval return retval - return get + @property + def dc_contributor(self) -> Optional[list[str]]: + """Contributors to the resource (other than the authors).""" + return self._getter_bag(DC_NAMESPACE, "contributor") + @dc_contributor.setter + def dc_contributor(self, values: Optional[list[str]]) -> None: + self._set_bag_values(DC_NAMESPACE, "contributor", values) -def _getter_single( - namespace: str, name: str, converter: Callable[[str], Any] = _identity -) -> Callable[["XmpInformation"], Optional[Any]]: - def get(self: "XmpInformation") -> Optional[Any]: - cached = self.cache.get(namespace, {}).get(name) - if cached: - return cached - value = None - for element in self.get_element("", namespace, name): - if element.nodeType == element.ATTRIBUTE_NODE: - value = element.nodeValue - else: - value = self._get_text(element) - break - if value is not None: - value = converter(value) - ns_cache = self.cache.setdefault(namespace, {}) - ns_cache[name] = value - return value + @property + def dc_coverage(self) -> Optional[str]: + """Text describing the extent or scope of the resource.""" + return self._get_single_value(DC_NAMESPACE, "coverage") - return get + @dc_coverage.setter + def dc_coverage(self, value: Optional[str]) -> None: + self._set_single_value(DC_NAMESPACE, "coverage", value) + 
@property + def dc_creator(self) -> Optional[list[str]]: + """A sorted array of names of the authors of the resource, listed in order of precedence.""" + return self._get_seq_values(DC_NAMESPACE, "creator") -class XmpInformation(XmpInformationProtocol, PdfObject): - """ - An object that represents Extensible Metadata Platform (XMP) metadata. - Usually accessed by :py:attr:`xmp_metadata()`. + @dc_creator.setter + def dc_creator(self, values: Optional[list[str]]) -> None: + self._set_seq_values(DC_NAMESPACE, "creator", values) - Raises: - PdfReadError: if XML is invalid + @property + def dc_date(self) -> Optional[list[datetime.datetime]]: + """A sorted array of dates of significance to the resource. The dates and times are in UTC.""" + return self._get_seq_values(DC_NAMESPACE, "date", _converter_date) + + @dc_date.setter + def dc_date(self, values: Optional[list[Union[str, datetime.datetime]]]) -> None: + if values is None: + self._set_seq_values(DC_NAMESPACE, "date", None) + else: + date_strings = [] + for value in values: + if isinstance(value, datetime.datetime): + date_strings.append(value.strftime("%Y-%m-%dT%H:%M:%S.%fZ")) + else: + date_strings.append(str(value)) + self._set_seq_values(DC_NAMESPACE, "date", date_strings) - """ + @property + def dc_description(self) -> Optional[dict[str, str]]: + """A language-keyed dictionary of textual descriptions of the content of the resource.""" + return self._get_langalt_values(DC_NAMESPACE, "description") - def __init__(self, stream: ContentStream) -> None: - self.stream = stream - try: - data = self.stream.get_data() - doc_root: Document = parseString(data) # noqa: S318 - except (AttributeError, ExpatError) as e: - raise PdfReadError(f"XML in XmpInformation was invalid: {e}") - self.rdf_root: XmlElement = doc_root.getElementsByTagNameNS( - RDF_NAMESPACE, "RDF" - )[0] - self.cache: dict[Any, Any] = {} + @dc_description.setter + def dc_description(self, values: Optional[dict[str, str]]) -> None: + 
self._set_langalt_values(DC_NAMESPACE, "description", values) - def write_to_stream( - self, stream: StreamType, encryption_key: Union[None, str, bytes] = None - ) -> None: - deprecate_with_replacement( - "XmpInformation.write_to_stream", - "PdfWriter.xmp_metadata", - "6.0.0" - ) - if encryption_key is not None: # deprecated - deprecation_no_replacement( - "the encryption_key parameter of write_to_stream", "5.0.0" - ) - self.stream.write_to_stream(stream) + @property + def dc_format(self) -> Optional[str]: + """The mime-type of the resource.""" + return self._get_single_value(DC_NAMESPACE, "format") - def get_element(self, about_uri: str, namespace: str, name: str) -> Iterator[Any]: - for desc in self.rdf_root.getElementsByTagNameNS(RDF_NAMESPACE, "Description"): - if desc.getAttributeNS(RDF_NAMESPACE, "about") == about_uri: - attr = desc.getAttributeNodeNS(namespace, name) - if attr is not None: - yield attr - yield from desc.getElementsByTagNameNS(namespace, name) + @dc_format.setter + def dc_format(self, value: Optional[str]) -> None: + self._set_single_value(DC_NAMESPACE, "format", value) - def get_nodes_in_namespace(self, about_uri: str, namespace: str) -> Iterator[Any]: - for desc in self.rdf_root.getElementsByTagNameNS(RDF_NAMESPACE, "Description"): - if desc.getAttributeNS(RDF_NAMESPACE, "about") == about_uri: - for i in range(desc.attributes.length): - attr = desc.attributes.item(i) - if attr and attr.namespaceURI == namespace: - yield attr - for child in desc.childNodes: - if child.namespaceURI == namespace: - yield child + @property + def dc_identifier(self) -> Optional[str]: + """Unique identifier of the resource.""" + return self._get_single_value(DC_NAMESPACE, "identifier") - def _get_text(self, element: XmlElement) -> str: - text = "" - for child in element.childNodes: - if child.nodeType == child.TEXT_NODE: - text += child.data - return text + @dc_identifier.setter + def dc_identifier(self, value: Optional[str]) -> None: + 
self._set_single_value(DC_NAMESPACE, "identifier", value) - dc_contributor = property(_getter_bag(DC_NAMESPACE, "contributor")) - """ - Contributors to the resource (other than the authors). + @property + def dc_language(self) -> Optional[list[str]]: + """An unordered array specifying the languages used in the resource.""" + return self._getter_bag(DC_NAMESPACE, "language") - An unsorted array of names. - """ + @dc_language.setter + def dc_language(self, values: Optional[list[str]]) -> None: + self._set_bag_values(DC_NAMESPACE, "language", values) - dc_coverage = property(_getter_single(DC_NAMESPACE, "coverage")) - """Text describing the extent or scope of the resource.""" + @property + def dc_publisher(self) -> Optional[list[str]]: + """An unordered array of publisher names.""" + return self._getter_bag(DC_NAMESPACE, "publisher") - dc_creator = property(_getter_seq(DC_NAMESPACE, "creator")) - """A sorted array of names of the authors of the resource, listed in order - of precedence.""" + @dc_publisher.setter + def dc_publisher(self, values: Optional[list[str]]) -> None: + self._set_bag_values(DC_NAMESPACE, "publisher", values) - dc_date = property(_getter_seq(DC_NAMESPACE, "date", _converter_date)) - """ - A sorted array of dates (datetime.datetime instances) of significance to - the resource. + @property + def dc_relation(self) -> Optional[list[str]]: + """An unordered array of text descriptions of relationships to other documents.""" + return self._getter_bag(DC_NAMESPACE, "relation") - The dates and times are in UTC. 
- """ + @dc_relation.setter + def dc_relation(self, values: Optional[list[str]]) -> None: + self._set_bag_values(DC_NAMESPACE, "relation", values) - dc_description = property(_getter_langalt(DC_NAMESPACE, "description")) - """A language-keyed dictionary of textual descriptions of the content of the - resource.""" + @property + def dc_rights(self) -> Optional[dict[str, str]]: + """A language-keyed dictionary of textual descriptions of the rights the user has to this resource.""" + return self._get_langalt_values(DC_NAMESPACE, "rights") - dc_format = property(_getter_single(DC_NAMESPACE, "format")) - """The mime-type of the resource.""" + @dc_rights.setter + def dc_rights(self, values: Optional[dict[str, str]]) -> None: + self._set_langalt_values(DC_NAMESPACE, "rights", values) - dc_identifier = property(_getter_single(DC_NAMESPACE, "identifier")) - """Unique identifier of the resource.""" + @property + def dc_source(self) -> Optional[str]: + """Unique identifier of the work from which this resource was derived.""" + return self._get_single_value(DC_NAMESPACE, "source") - dc_language = property(_getter_bag(DC_NAMESPACE, "language")) - """An unordered array specifying the languages used in the resource.""" + @dc_source.setter + def dc_source(self, value: Optional[str]) -> None: + self._set_single_value(DC_NAMESPACE, "source", value) - dc_publisher = property(_getter_bag(DC_NAMESPACE, "publisher")) - """An unordered array of publisher names.""" + @property + def dc_subject(self) -> Optional[list[str]]: + """An unordered array of descriptive phrases or keywords that specify the topic of the content.""" + return self._getter_bag(DC_NAMESPACE, "subject") - dc_relation = property(_getter_bag(DC_NAMESPACE, "relation")) - """An unordered array of text descriptions of relationships to other - documents.""" + @dc_subject.setter + def dc_subject(self, values: Optional[list[str]]) -> None: + self._set_bag_values(DC_NAMESPACE, "subject", values) - dc_rights = 
property(_getter_langalt(DC_NAMESPACE, "rights")) - """A language-keyed dictionary of textual descriptions of the rights the - user has to this resource.""" + @property + def dc_title(self) -> Optional[dict[str, str]]: + """A language-keyed dictionary of the title of the resource.""" + return self._get_langalt_values(DC_NAMESPACE, "title") - dc_source = property(_getter_single(DC_NAMESPACE, "source")) - """Unique identifier of the work from which this resource was derived.""" + @dc_title.setter + def dc_title(self, values: Optional[dict[str, str]]) -> None: + self._set_langalt_values(DC_NAMESPACE, "title", values) - dc_subject = property(_getter_bag(DC_NAMESPACE, "subject")) - """An unordered array of descriptive phrases or keywords that specify the - topic of the content of the resource.""" + @property + def dc_type(self) -> Optional[list[str]]: + """An unordered array of textual descriptions of the document type.""" + return self._getter_bag(DC_NAMESPACE, "type") - dc_title = property(_getter_langalt(DC_NAMESPACE, "title")) - """A language-keyed dictionary of the title of the resource.""" + @dc_type.setter + def dc_type(self, values: Optional[list[str]]) -> None: + self._set_bag_values(DC_NAMESPACE, "type", values) - dc_type = property(_getter_bag(DC_NAMESPACE, "type")) - """An unordered array of textual descriptions of the document type.""" + @property + def pdf_keywords(self) -> Optional[str]: + """An unformatted text string representing document keywords.""" + return self._get_single_value(PDF_NAMESPACE, "Keywords") - pdf_keywords = property(_getter_single(PDF_NAMESPACE, "Keywords")) - """An unformatted text string representing document keywords.""" + @pdf_keywords.setter + def pdf_keywords(self, value: Optional[str]) -> None: + self._set_single_value(PDF_NAMESPACE, "Keywords", value) - pdf_pdfversion = property(_getter_single(PDF_NAMESPACE, "PDFVersion")) - """The PDF file version, for example 1.0 or 1.3.""" + @property + def pdf_pdfversion(self) -> 
Optional[str]: + """The PDF file version, for example 1.0 or 1.3.""" + return self._get_single_value(PDF_NAMESPACE, "PDFVersion") - pdf_producer = property(_getter_single(PDF_NAMESPACE, "Producer")) - """The name of the tool that saved the document as a PDF.""" + @pdf_pdfversion.setter + def pdf_pdfversion(self, value: Optional[str]) -> None: + self._set_single_value(PDF_NAMESPACE, "PDFVersion", value) - xmp_create_date = property( - _getter_single(XMP_NAMESPACE, "CreateDate", _converter_date) - ) - """ - The date and time the resource was originally created. + @property + def pdf_producer(self) -> Optional[str]: + """The name of the tool that saved the document as a PDF.""" + return self._get_single_value(PDF_NAMESPACE, "Producer") - The date and time are returned as a UTC datetime.datetime object. - """ + @pdf_producer.setter + def pdf_producer(self, value: Optional[str]) -> None: + self._set_single_value(PDF_NAMESPACE, "Producer", value) - xmp_modify_date = property( - _getter_single(XMP_NAMESPACE, "ModifyDate", _converter_date) - ) - """ - The date and time the resource was last modified. + @property + def xmp_create_date(self) -> Optional[datetime.datetime]: + """The date and time the resource was originally created. Returned as a UTC datetime object.""" + return self._get_single_value(XMP_NAMESPACE, "CreateDate", _converter_date) + + @xmp_create_date.setter + def xmp_create_date(self, value: Optional[datetime.datetime]) -> None: + if value: + date_str = value.strftime("%Y-%m-%dT%H:%M:%S.%fZ") + self._set_single_value(XMP_NAMESPACE, "CreateDate", date_str) + else: + self._set_single_value(XMP_NAMESPACE, "CreateDate", None) - The date and time are returned as a UTC datetime.datetime object. - """ + @property + def xmp_modify_date(self) -> Optional[datetime.datetime]: + """The date and time the resource was last modified. 
Returned as a UTC datetime object.""" + return self._get_single_value(XMP_NAMESPACE, "ModifyDate", _converter_date) + + @xmp_modify_date.setter + def xmp_modify_date(self, value: Optional[datetime.datetime]) -> None: + if value: + date_str = value.strftime("%Y-%m-%dT%H:%M:%S.%fZ") + self._set_single_value(XMP_NAMESPACE, "ModifyDate", date_str) + else: + self._set_single_value(XMP_NAMESPACE, "ModifyDate", None) - xmp_metadata_date = property( - _getter_single(XMP_NAMESPACE, "MetadataDate", _converter_date) - ) - """ - The date and time that any metadata for this resource was last changed. + @property + def xmp_metadata_date(self) -> Optional[datetime.datetime]: + """The date and time that any metadata for this resource was last changed. Returned as a UTC datetime object.""" + return self._get_single_value(XMP_NAMESPACE, "MetadataDate", _converter_date) + + @xmp_metadata_date.setter + def xmp_metadata_date(self, value: Optional[datetime.datetime]) -> None: + if value: + date_str = value.strftime("%Y-%m-%dT%H:%M:%S.%fZ") + self._set_single_value(XMP_NAMESPACE, "MetadataDate", date_str) + else: + self._set_single_value(XMP_NAMESPACE, "MetadataDate", None) - The date and time are returned as a UTC datetime.datetime object. 
- """ + @property + def xmp_creator_tool(self) -> Optional[str]: + """The name of the first known tool used to create the resource.""" + return self._get_single_value(XMP_NAMESPACE, "CreatorTool") - xmp_creator_tool = property(_getter_single(XMP_NAMESPACE, "CreatorTool")) - """The name of the first known tool used to create the resource.""" + @xmp_creator_tool.setter + def xmp_creator_tool(self, value: Optional[str]) -> None: + self._set_single_value(XMP_NAMESPACE, "CreatorTool", value) - xmpmm_document_id = property(_getter_single(XMPMM_NAMESPACE, "DocumentID")) - """The common identifier for all versions and renditions of this resource.""" + @property + def xmpmm_document_id(self) -> Optional[str]: + """The common identifier for all versions and renditions of this resource.""" + return self._get_single_value(XMPMM_NAMESPACE, "DocumentID") - xmpmm_instance_id = property(_getter_single(XMPMM_NAMESPACE, "InstanceID")) - """An identifier for a specific incarnation of a document, updated each - time a file is saved.""" + @xmpmm_document_id.setter + def xmpmm_document_id(self, value: Optional[str]) -> None: + self._set_single_value(XMPMM_NAMESPACE, "DocumentID", value) - pdfaid_part = property(_getter_single(PDFAID_NAMESPACE, "part")) - """The part of the PDF/A standard that the document conforms to (e.g., 1, 2, 3).""" + @property + def xmpmm_instance_id(self) -> Optional[str]: + """An identifier for a specific incarnation of a document, updated each time a file is saved.""" + return self._get_single_value(XMPMM_NAMESPACE, "InstanceID") + + @xmpmm_instance_id.setter + def xmpmm_instance_id(self, value: Optional[str]) -> None: + self._set_single_value(XMPMM_NAMESPACE, "InstanceID", value) + + @property + def pdfaid_part(self) -> Optional[str]: + """The part of the PDF/A standard that the document conforms to (e.g., 1, 2, 3).""" + return self._get_single_value(PDFAID_NAMESPACE, "part") + + @pdfaid_part.setter + def pdfaid_part(self, value: Optional[str]) -> None: + 
self._set_single_value(PDFAID_NAMESPACE, "part", value) + + @property + def pdfaid_conformance(self) -> Optional[str]: + """The conformance level within the PDF/A standard (e.g., 'A', 'B', 'U').""" + return self._get_single_value(PDFAID_NAMESPACE, "conformance") - pdfaid_conformance = property(_getter_single(PDFAID_NAMESPACE, "conformance")) - """The conformance level within the PDF/A standard (e.g., 'A', 'B', 'U').""" + @pdfaid_conformance.setter + def pdfaid_conformance(self, value: Optional[str]) -> None: + self._set_single_value(PDFAID_NAMESPACE, "conformance", value) @property def custom_properties(self) -> dict[Any, Any]: @@ -423,3 +592,149 @@ def custom_properties(self) -> dict[Any, Any]: value = self._get_text(node) self._custom_properties[key] = value return self._custom_properties + + def _get_or_create_description(self, about_uri: str = "") -> XmlElement: + """Get or create an rdf:Description element with the given about URI.""" + for desc in self.rdf_root.getElementsByTagNameNS(RDF_NAMESPACE, "Description"): + if desc.getAttributeNS(RDF_NAMESPACE, "about") == about_uri: + return desc + + doc = self.rdf_root.ownerDocument + if doc is None: + raise XmpDocumentError("XMP Document is None") + desc = doc.createElementNS(RDF_NAMESPACE, "rdf:Description") + desc.setAttributeNS(RDF_NAMESPACE, "rdf:about", about_uri) + self.rdf_root.appendChild(desc) + return desc + + def _clear_cache_entry(self, namespace: str, name: str) -> None: + """Remove a cached value for a given namespace/name if present.""" + ns_cache = self.cache.get(namespace) + if ns_cache and name in ns_cache: + del ns_cache[name] + + def _set_single_value(self, namespace: str, name: str, value: Optional[str]) -> None: + """Set or remove a single metadata value.""" + self._clear_cache_entry(namespace, name) + desc = self._get_or_create_description() + + existing_elements = list(desc.getElementsByTagNameNS(namespace, name)) + for elem in existing_elements: + desc.removeChild(elem) + + if 
existing_attr := desc.getAttributeNodeNS(namespace, name): + desc.removeAttributeNode(existing_attr) + + if value is not None: + doc = self.rdf_root.ownerDocument + if doc is None: + raise XmpDocumentError("XMP Document is None") + prefix = self._get_namespace_prefix(namespace) + elem = doc.createElementNS(namespace, f"{prefix}:{name}") + text_node = doc.createTextNode(str(value)) + elem.appendChild(text_node) + desc.appendChild(elem) + + self._update_stream() + + def _set_bag_values(self, namespace: str, name: str, values: Optional[list[str]]) -> None: + """Set or remove bag values (unordered array).""" + self._clear_cache_entry(namespace, name) + desc = self._get_or_create_description() + + existing_elements = list(desc.getElementsByTagNameNS(namespace, name)) + for elem in existing_elements: + desc.removeChild(elem) + + if values: + doc = self.rdf_root.ownerDocument + if doc is None: + raise XmpDocumentError("XMP Document is None") + prefix = self._get_namespace_prefix(namespace) + elem = doc.createElementNS(namespace, f"{prefix}:{name}") + bag = doc.createElementNS(RDF_NAMESPACE, "rdf:Bag") + + for value in values: + li = doc.createElementNS(RDF_NAMESPACE, "rdf:li") + text_node = doc.createTextNode(str(value)) + li.appendChild(text_node) + bag.appendChild(li) + + elem.appendChild(bag) + desc.appendChild(elem) + + self._update_stream() + + def _set_seq_values(self, namespace: str, name: str, values: Optional[list[str]]) -> None: + """Set or remove sequence values (ordered array).""" + self._clear_cache_entry(namespace, name) + desc = self._get_or_create_description() + + existing_elements = list(desc.getElementsByTagNameNS(namespace, name)) + for elem in existing_elements: + desc.removeChild(elem) + + if values: + doc = self.rdf_root.ownerDocument + if doc is None: + raise XmpDocumentError("XMP Document is None") + prefix = self._get_namespace_prefix(namespace) + elem = doc.createElementNS(namespace, f"{prefix}:{name}") + seq = doc.createElementNS(RDF_NAMESPACE, 
"rdf:Seq") + + for value in values: + li = doc.createElementNS(RDF_NAMESPACE, "rdf:li") + text_node = doc.createTextNode(str(value)) + li.appendChild(text_node) + seq.appendChild(li) + + elem.appendChild(seq) + desc.appendChild(elem) + + self._update_stream() + + def _set_langalt_values(self, namespace: str, name: str, values: Optional[dict[str, str]]) -> None: + """Set or remove language alternative values.""" + self._clear_cache_entry(namespace, name) + desc = self._get_or_create_description() + + existing_elements = list(desc.getElementsByTagNameNS(namespace, name)) + for elem in existing_elements: + desc.removeChild(elem) + + if values: + doc = self.rdf_root.ownerDocument + if doc is None: + raise XmpDocumentError("XMP Document is None") + prefix = self._get_namespace_prefix(namespace) + elem = doc.createElementNS(namespace, f"{prefix}:{name}") + alt = doc.createElementNS(RDF_NAMESPACE, "rdf:Alt") + + for lang, value in values.items(): + li = doc.createElementNS(RDF_NAMESPACE, "rdf:li") + li.setAttribute("xml:lang", lang) + text_node = doc.createTextNode(str(value)) + li.appendChild(text_node) + alt.appendChild(li) + + elem.appendChild(alt) + desc.appendChild(elem) + + self._update_stream() + + def _get_namespace_prefix(self, namespace: str) -> str: + """Get the appropriate namespace prefix for a given namespace URI.""" + return _NAMESPACE_PREFIX_MAP.get(namespace, "unknown") + + def _update_stream(self) -> None: + """Update the stream with the current XML content.""" + doc = self.rdf_root.ownerDocument + if doc is None: + raise XmpDocumentError("XMP Document is None") + + buffer = StringIO() + for child in doc.childNodes: + child.writexml(buffer, "", "", "") + xml_text = buffer.getvalue() + xml_text = xml_text.replace('', '') + self.stream.set_data(xml_text.encode("utf-8")) diff --git a/tests/test_xmp.py b/tests/test_xmp.py index 593994f1a..53134afe1 100644 --- a/tests/test_xmp.py +++ b/tests/test_xmp.py @@ -251,31 +251,6 @@ def 
def _remove_all_descriptions(xmp):
    """Detach every rdf:Description element found under the RDF root of *xmp*."""
    root = xmp.rdf_root
    for node in list(root.getElementsByTagNameNS(pypdf.xmp.RDF_NAMESPACE, "Description")):
        root.removeChild(node)


def _check_text_fields(xmp, samples):
    """Assign each text value, read it back, then clear the field with None."""
    for field, text in samples:
        setattr(xmp, field, text)
        assert getattr(xmp, field) == text
        setattr(xmp, field, None)
        assert getattr(xmp, field) is None


def _check_collection_field(xmp, field, payload, empty):
    """Assign a list/dict value, read it back, then clear it with None."""
    setattr(xmp, field, payload)
    assert getattr(xmp, field) == payload
    setattr(xmp, field, None)
    assert getattr(xmp, field) in (None, empty)


def test_xmp_information__create():
    """A freshly created XmpInformation object exposes no metadata values."""
    fresh = XmpInformation.create()

    assert fresh is not None
    assert fresh.dc_title == {}
    assert fresh.dc_creator in (None, [])
    assert fresh.dc_description in (None, {})
    for field in ("xmp_create_date", "pdf_producer"):
        assert getattr(fresh, field) is None


def test_xmp_information_set_dc_title():
    """dc:title stores a language alternative mapping and can be cleared."""
    _check_collection_field(
        XmpInformation.create(),
        "dc_title",
        {"x-default": "Test Title", "en": "Test Title EN"},
        {},
    )


def test_xmp_information_set_dc_creator():
    """dc:creator stores a list of authors and can be cleared."""
    _check_collection_field(
        XmpInformation.create(),
        "dc_creator",
        ["Author One", "Author Two"],
        [],
    )


def test_xmp_information_set_dc_description():
    """dc:description stores a language alternative mapping and can be cleared."""
    _check_collection_field(
        XmpInformation.create(),
        "dc_description",
        {"x-default": "Test Description", "en": "Test Description EN"},
        {},
    )


def test_xmp_information_set_dc_subject():
    """dc:subject stores a list of keywords and can be cleared."""
    _check_collection_field(
        XmpInformation.create(),
        "dc_subject",
        ["keyword1", "keyword2", "keyword3"],
        [],
    )


def test_xmp_information_set_dc_date():
    """dc:date accepts datetime objects as well as date strings."""
    xmp = XmpInformation.create()

    for entry in (datetime(2023, 12, 25, 10, 30, 45), "2023-12-25T10:30:45.000000Z"):
        xmp.dc_date = [entry]
        assert len(xmp.dc_date) == 1

    xmp.dc_date = None
    assert xmp.dc_date in (None, [])


def test_xmp_information_set_single_fields():
    """Single-value Dublin Core fields can be set, read back and cleared."""
    _check_text_fields(
        XmpInformation.create(),
        (
            ("dc_coverage", "Global coverage"),
            ("dc_format", "application/pdf"),
            ("dc_identifier", "unique-id-123"),
            ("dc_source", "Original Source"),
        ),
    )


def test_xmp_information_set_bag_fields():
    """Bag (unordered array) fields can be set, read back and cleared."""
    xmp = XmpInformation.create()
    bag_samples = (
        ("dc_contributor", ["Contributor One", "Contributor Two"]),
        ("dc_language", ["en", "fr", "de"]),
        ("dc_publisher", ["Publisher One", "Publisher Two"]),
        ("dc_relation", ["Related Doc 1", "Related Doc 2"]),
        ("dc_type", ["Document", "Text"]),
    )

    for field, items in bag_samples:
        _check_collection_field(xmp, field, items, [])


def test_xmp_information_set_dc_rights():
    """dc:rights stores a language alternative mapping and can be cleared."""
    _check_collection_field(
        XmpInformation.create(),
        "dc_rights",
        {"x-default": "All rights reserved", "en": "All rights reserved EN"},
        {},
    )


def test_xmp_information_set_pdf_fields():
    """PDF namespace fields can be set, read back and cleared."""
    _check_text_fields(
        XmpInformation.create(),
        (
            ("pdf_keywords", "keyword1, keyword2, keyword3"),
            ("pdf_pdfversion", "1.4"),
            ("pdf_producer", "pypdf"),
        ),
    )


def test_xmp_information_set_xmp_date_fields():
    """XMP date fields return datetime objects and can be cleared."""
    xmp = XmpInformation.create()
    moment = datetime(2023, 12, 25, 10, 30, 45)

    for field in ("xmp_create_date", "xmp_modify_date", "xmp_metadata_date"):
        setattr(xmp, field, moment)
        assert isinstance(getattr(xmp, field), datetime)
        setattr(xmp, field, None)
        assert getattr(xmp, field) is None


def test_xmp_information_set_xmp_creator_tool():
    """xmp:CreatorTool can be set, read back and cleared."""
    _check_text_fields(XmpInformation.create(), (("xmp_creator_tool", "pypdf"),))


def test_xmp_information_set_xmpmm_fields():
    """XMP Media Management identifiers can be set, read back and cleared."""
    _check_text_fields(
        XmpInformation.create(),
        (
            ("xmpmm_document_id", "uuid:12345678-1234-1234-1234-123456789abc"),
            ("xmpmm_instance_id", "uuid:87654321-4321-4321-4321-cba987654321"),
        ),
    )


def test_xmp_information_set_pdfaid_fields():
    """PDF/A identification fields can be set, read back and cleared."""
    _check_text_fields(
        XmpInformation.create(),
        (
            ("pdfaid_part", "1"),
            ("pdfaid_conformance", "B"),
        ),
    )


def test_xmp_information_create_with_writer():
    """Metadata built with create() survives a write/read round trip."""
    metadata = XmpInformation.create()
    metadata.dc_title = {"x-default": "Created with pypdf"}
    metadata.dc_creator = ["pypdf user"]
    metadata.pdf_producer = "pypdf library"

    writer = PdfWriter()
    writer.add_blank_page(612, 792)
    writer.xmp_metadata = metadata

    buffer = BytesIO()
    writer.write(buffer)

    restored = PdfReader(BytesIO(buffer.getvalue())).xmp_metadata
    assert restored is not None
    assert restored.dc_title == {"x-default": "Created with pypdf"}
    assert restored.dc_creator == ["pypdf user"]
    assert restored.pdf_producer == "pypdf library"


def test_xmp_information_namespace_prefix():
    """_get_namespace_prefix maps known URIs and falls back to "unknown"."""
    xmp = XmpInformation.create()
    expected_prefixes = (
        (pypdf.xmp.DC_NAMESPACE, "dc"),
        (pypdf.xmp.XMP_NAMESPACE, "xmp"),
        (pypdf.xmp.PDF_NAMESPACE, "pdf"),
        (pypdf.xmp.XMPMM_NAMESPACE, "xmpMM"),
        (pypdf.xmp.PDFAID_NAMESPACE, "pdfaid"),
        (pypdf.xmp.PDFX_NAMESPACE, "pdfx"),
        ("unknown://namespace", "unknown"),
    )

    for uri, prefix in expected_prefixes:
        assert xmp._get_namespace_prefix(uri) == prefix


def test_xmp_information_owner_document_none_errors():
    """Writing without an owner document raises "XMP Document is None"."""
    xmp = XmpInformation.create()
    original_owner = xmp.rdf_root.ownerDocument

    def expect_document_error():
        return pytest.raises(RuntimeError, match="XMP Document is None")

    try:
        # Without any description, one would have to be created.
        _remove_all_descriptions(xmp)
        xmp.rdf_root.ownerDocument = None

        with expect_document_error():
            xmp._get_or_create_description()

        with expect_document_error():
            xmp._update_stream()

        # Property setters, again starting without any description.
        property_cases = (
            ("dc_coverage", "test coverage"),
            ("dc_contributor", ["contributor"]),
            ("dc_creator", ["creator"]),
            ("dc_title", {"x-default": "title"}),
        )
        for field, value in property_cases:
            xmp.rdf_root.ownerDocument = original_owner
            _remove_all_descriptions(xmp)
            xmp.rdf_root.ownerDocument = None

            with expect_document_error():
                setattr(xmp, field, value)

        # Low-level setters with a description that already exists.
        xmp.rdf_root.ownerDocument = original_owner
        xmp._get_or_create_description().setAttribute("test-attr", "test-value")
        xmp.rdf_root.ownerDocument = None

        with expect_document_error():
            xmp._set_single_value("test-namespace", "test-attr", "new-value")

        setter_cases = (
            ("_set_bag_values", ["value"]),
            ("_set_seq_values", ["value"]),
            ("_set_langalt_values", {"x-default": "value"}),
        )
        for setter_name, payload in setter_cases:
            xmp.rdf_root.ownerDocument = original_owner
            xmp._get_or_create_description()
            xmp.rdf_root.ownerDocument = None

            with expect_document_error():
                getattr(xmp, setter_name)("test-namespace", "test-name", payload)

    finally:
        xmp.rdf_root.ownerDocument = original_owner


def test_xmp_information_remove_existing_attribute():
    """A single-value field can be overwritten and then cleared."""
    xmp = XmpInformation.create()

    for text in ("initial coverage", "updated coverage"):
        xmp.dc_coverage = text
        assert xmp.dc_coverage == text

    xmp.dc_coverage = None
    assert xmp.dc_coverage is None


def test_xmp_information_edge_case_coverage():
    """Empty containers and None both read back as empty containers."""
    xmp = XmpInformation.create()
    fields = (
        ("dc_contributor", list),
        ("dc_creator", list),
        ("dc_title", dict),
    )

    for field, make_empty in fields:
        setattr(xmp, field, make_empty())
        assert getattr(xmp, field) == make_empty()

    for field, make_empty in fields:
        setattr(xmp, field, None)
        assert getattr(xmp, field) == make_empty()


def test_xmp_information_create_new_description():
    """A description created on demand carries the requested rdf:about value."""
    xmp = XmpInformation.create()
    _remove_all_descriptions(xmp)

    created = xmp._get_or_create_description("test-uri")

    assert created.getAttributeNS(pypdf.xmp.RDF_NAMESPACE, "about") == "test-uri"
    assert created.tagName == "rdf:Description"
    assert created.namespaceURI == pypdf.xmp.RDF_NAMESPACE


def test_xmp_information_attribute_handling():
    """Each kind of setter works after all descriptions were removed."""
    xmp = XmpInformation.create()
    _remove_all_descriptions(xmp)

    assignments = (
        ("dc_coverage", "test coverage"),
        ("dc_contributor", ["contributor1", "contributor2"]),
        ("dc_creator", ["creator1", "creator2"]),
        ("dc_title", {"x-default": "Test Title", "en": "Test Title EN"}),
        ("dc_format", "application/pdf"),
        ("dc_format", "text/plain"),
    )
    for field, value in assignments:
        setattr(xmp, field, value)
        assert getattr(xmp, field) == value


def test_xmp_information_complete_coverage():
    """Setters keep working when descriptions are removed and recreated."""
    xmp = XmpInformation.create()
    _remove_all_descriptions(xmp)

    xmp._get_or_create_description().setAttribute("test", "value")
    for text in ("original", "modified"):
        xmp.dc_source = text
    assert xmp.dc_source == "modified"

    _remove_all_descriptions(xmp)

    rebuilt = (
        ("dc_contributor", ["test1"]),
        ("dc_creator", ["test2"]),
        ("dc_title", {"x-default": "test3"}),
    )
    for field, value in rebuilt:
        setattr(xmp, field, value)
    for field, value in rebuilt:
        assert getattr(xmp, field) == value


def test_xmp_information_attribute_removal():
    """Setting dc:format yields the new value when an attribute form existed."""
    xmp = XmpInformation.create()

    description = xmp._get_or_create_description()
    description.setAttributeNS(pypdf.xmp.DC_NAMESPACE, "dc:format", "application/pdf")

    xmp.dc_format = "text/plain"
    assert xmp.dc_format == "text/plain"