|
| 1 | +# |
| 2 | +# Copyright IBM Corp. 2024 - 2024 |
| 3 | +# SPDX-License-Identifier: MIT |
| 4 | +# |
| 5 | + |
| 6 | +"""Simple metadata extractor module.""" |
| 7 | + |
| 8 | + |
| 9 | +from enum import Enum |
| 10 | +from typing import Any |
| 11 | + |
| 12 | +from docling_core.transforms.metadata_extractor import BaseMetadataExtractor |
| 13 | +from docling_core.types import Document as DLDocument |
| 14 | + |
| 15 | + |
class SimpleMetadataExtractor(BaseMetadataExtractor):
    """Simple metadata extractor class.

    Builds a metadata dict containing the document hash and, when
    ``include_origin`` is enabled, the document origin. The same keys are
    reported as excluded from both embedding and LLM generation.
    """

    class _Keys(str, Enum):
        """Names of the metadata keys emitted by this extractor."""

        DL_DOC_HASH = "dl_doc_hash"
        ORIGIN = "origin"

    # When True, the origin is added to the metadata and to the excluded keys.
    include_origin: bool = False

    def get_metadata(
        self, doc: DLDocument, origin: str, *args: Any, **kwargs: Any
    ) -> dict[str, Any]:
        """Extract metadata for the given document.

        Args:
            doc (DLDocument): document to extract metadata for
            origin (str): the document origin

        Returns:
            dict[str, Any]: the extracted metadata
        """
        # Use the plain string values as keys: str() of a (str, Enum) member
        # yields the qualified member name rather than the key text, so enum
        # members must not leak into the returned dict.
        meta: dict[str, Any] = {
            self._Keys.DL_DOC_HASH.value: doc.file_info.document_hash,
        }
        if self.include_origin:
            meta[self._Keys.ORIGIN.value] = origin
        return meta

    def get_excluded_embed_metadata_keys(self) -> list[str]:
        """Get metadata keys to exclude from embedding.

        Returns:
            list[str]: the metadata to exclude
        """
        # Mirror get_metadata: only list ORIGIN when it is actually emitted,
        # and return plain strings to match the declared return type.
        excl_keys: list[str] = [self._Keys.DL_DOC_HASH.value]
        if self.include_origin:
            excl_keys.append(self._Keys.ORIGIN.value)
        return excl_keys

    def get_excluded_llm_metadata_keys(self) -> list[str]:
        """Get metadata keys to exclude from LLM generation.

        Returns:
            list[str]: the metadata to exclude
        """
        # The LLM exclusions are the same as the embedding exclusions.
        return self.get_excluded_embed_metadata_keys()
0 commit comments