diff --git a/docling/backend/abstract_backend.py b/docling/backend/abstract_backend.py
index 491330b36..d6f52c33e 100644
--- a/docling/backend/abstract_backend.py
+++ b/docling/backend/abstract_backend.py
@@ -38,6 +38,14 @@ def unload(self):
     def supported_formats(cls) -> Set["InputFormat"]:
         pass
 
+    def extract_metadata(self) -> Dict[str, Any]:
+        """Return metadata gathered by this backend as a plain dict.
+
+        Deliberately not abstract: backends that do not override this
+        method inherit the default and return an empty dict.
+        """
+        return {}
+
 
 class PaginatedDocumentBackend(AbstractDocumentBackend):
     """DeclarativeDocumentBackend.
diff --git a/docling/backend/xml/uspto_backend.py b/docling/backend/xml/uspto_backend.py
index 268b80ade..25099a036 100644
--- a/docling/backend/xml/uspto_backend.py
+++ b/docling/backend/xml/uspto_backend.py
@@ -147,6 +147,11 @@ def convert(self) -> DoclingDocument:
                 f"Cannot convert doc (hash={self.document_hash}, "
                 f"name={self.file.name}) because the backend failed to init."
             )
+
+    @override
+    def extract_metadata(self) -> Dict[str, Any]:
+        """Return an empty dict; this backend extracts no metadata yet."""
+        return {}
 
 
 class PatentUspto(ABC):
diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py
index 7955ff9df..b3cce5d55 100644
--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@@ -207,6 +207,7 @@ class ConversionResult(BaseModel):
     confidence: ConfidenceReport = Field(default_factory=ConfidenceReport)
 
     document: DoclingDocument = _EMPTY_DOCLING_DOC
+    metadata: Dict[str, Any] = Field(default_factory=dict)
 
     @property
     @deprecated("Use document instead.")
diff --git a/docling/pipeline/simple_pipeline.py b/docling/pipeline/simple_pipeline.py
index 0e3f1b6f9..7b12dfd6b 100644
--- a/docling/pipeline/simple_pipeline.py
+++ b/docling/pipeline/simple_pipeline.py
@@ -38,6 +38,7 @@ def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
         # a DoclingDocument straight.
         with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
             conv_res.document = conv_res.input._backend.convert()
+            conv_res.metadata = conv_res.input._backend.extract_metadata()
         return conv_res
 
     def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus: