feat: adding function to detect unmapped CID characters in PDFMinerToDocument (#8992)

davidsbatista · web-flow · commit c037052581a1 · 2025-03-06T15:44:06.000Z
* adding function to detect unmapped CID characters

* adding release notes

* adding test for logs
diff --git a/haystack/components/converters/pdfminer.py b/haystack/components/converters/pdfminer.py
@@ -4,6 +4,7 @@
 
 import io
 import os
+import re
 from pathlib import Path
 from typing import Any, Dict, Iterator, List, Optional, Union
 
@@ -18,6 +19,8 @@
 
 logger = logging.getLogger(__name__)
 
+CID_PATTERN = r"\(cid:\d+\)"  # regex pattern to detect CID characters
+
 
 @component
 class PDFMinerToDocument:
@@ -97,6 +100,7 @@ def __init__(  # pylint: disable=too-many-positional-arguments
             all_texts=all_texts,
         )
         self.store_full_path = store_full_path
+        self.cid_pattern = re.compile(CID_PATTERN)
 
     @staticmethod
     def _converter(lt_page_objs: Iterator) -> str:
@@ -126,6 +130,32 @@ def _converter(lt_page_objs: Iterator) -> str:
 
         return delimited_pages
 
+    def detect_undecoded_cid_characters(self, text: str) -> Dict[str, Any]:
+        """
+        Measure how much of `text` consists of undecoded CID sequences such as `(cid:123)`.
+
+        A PDF font may include a ToUnicode map (mapping from character code to Unicode) to support operations like
+        searching strings or copy & paste in a PDF viewer. If that map is not available the text extractor cannot
+        decode the CID characters and returns them as is, e.g. when the PDF uses non-standard fonts.
+
+        see: https://pdfminersix.readthedocs.io/en/latest/faq.html#why-are-there-cid-x-values-in-the-textual-output
+
+        :param text: The text to check for undecoded CID characters.
+        :returns:
+            A dictionary with the keys:
+            - `total_chars`: length of `text`.
+            - `cid_chars`: number of characters that belong to `(cid:N)` sequences.
+            - `percentage`: share of `cid_chars` in `total_chars`, rounded to two decimals (0 for empty text).
+        """
+        # Call findall on the pattern compiled in __init__ instead of routing it through the module-level helper.
+        matches = self.cid_pattern.findall(text)
+        total_chars = len(text)
+        cid_chars = sum(len(match) for match in matches)
+        # Guard against division by zero for empty text.
+        percentage = (cid_chars / total_chars * 100) if total_chars > 0 else 0
+
+        return {"total_chars": total_chars, "cid_chars": cid_chars, "percentage": round(percentage, 2)}
+
     @component.output_types(documents=List[Document])
     def run(
         self,
@@ -178,6 +208,19 @@ def run(
 
             if not self.store_full_path and (file_path := bytestream.meta.get("file_path")):
                 merged_metadata["file_path"] = os.path.basename(file_path)
+
+            analysis = self.detect_undecoded_cid_characters(text)
+
+            if analysis["percentage"] > 0:
+                logger.warning(
+                    "Detected {cid_chars} undecoded CID characters in {total_chars} characters"
+                    " ({percentage}%) in {source}.",
+                    cid_chars=analysis["cid_chars"],
+                    total_chars=analysis["total_chars"],
+                    percentage=analysis["percentage"],
+                    source=source,
+                )
+
             document = Document(content=text, meta=merged_metadata)
             documents.append(document)
 
diff --git a/releasenotes/notes/adding-CID-detection-PDFMinerToDocument-0195a929d64cd502.yaml b/releasenotes/notes/adding-CID-detection-PDFMinerToDocument-0195a929d64cd502.yaml
@@ -0,0 +1,5 @@
+---
+enhancements:
+  - |
+    `PDFMinerToDocument` now detects undecoded CID characters (e.g. `(cid:123)`) in the extracted text and logs a warning
+    with their count and percentage, helping users spot extraction quality issues in PDFs with non-standard fonts.
diff --git a/test/components/converters/test_pdfminer_to_document.py b/test/components/converters/test_pdfminer_to_document.py
@@ -2,6 +2,7 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 import logging
+from unittest.mock import patch
 
 import pytest
 
@@ -185,3 +186,56 @@ def test_run_detect_paragraphs_to_be_used_in_split_passage(self, test_files_path
             "structure, allowing structure to emerge according to the \nneeds of the users.[1] \n\n"
         )
         assert docs["documents"][6].content == expected
+
+    def test_detect_undecoded_cid_characters(self):
+        """
+        Check the statistics reported by `detect_undecoded_cid_characters` for several kinds of input.
+        """
+        converter = PDFMinerToDocument()
+
+        # Plain text: nothing should be counted as CID
+        plain = "This is a normal text without any CID characters."
+        stats = converter.detect_undecoded_cid_characters(plain)
+        assert stats["total_chars"] == len(plain)
+        assert stats["cid_chars"] == 0
+        assert stats["percentage"] == 0
+
+        # Two CID sequences embedded in regular text
+        mixed = "Some text with (cid:123) and (cid:456) characters"
+        stats = converter.detect_undecoded_cid_characters(mixed)
+        assert stats["total_chars"] == len(mixed)
+        assert stats["cid_chars"] == len("(cid:123)") + len("(cid:456)")  # 18 characters total
+        assert stats["percentage"] == round((18 / len(mixed)) * 100, 2)
+
+        # Back-to-back CID sequences make up the entire text
+        only_cid = "(cid:123)(cid:456)(cid:789)"
+        stats = converter.detect_undecoded_cid_characters(only_cid)
+        assert stats["total_chars"] == len(only_cid)
+        assert stats["cid_chars"] == len(only_cid)
+        assert stats["percentage"] == 100.0
+
+        # Empty input: all counters are zero
+        empty = ""
+        stats = converter.detect_undecoded_cid_characters(empty)
+        assert stats["total_chars"] == 0
+        assert stats["cid_chars"] == 0
+        assert stats["percentage"] == 0
+
+    def test_pdfminer_logs_warning_for_cid_characters(self, caplog, monkeypatch):
+        """
+        Check that `run` logs a warning when the converted text contains undecoded CID characters.
+        """
+        cid_text = "This is text with (cid:123) and (cid:456) characters"
+        fake_pdf = ByteStream(data=b"fake", meta={"file_path": "test.pdf"})
+
+        # Stub out page extraction and page-to-text conversion so that no real PDF content is needed
+        pages_patch = patch("haystack.components.converters.pdfminer.extract_pages", return_value=["mocked page"])
+        text_patch = patch.object(PDFMinerToDocument, "_converter", return_value=cid_text)
+
+        with pages_patch, text_patch, caplog.at_level(logging.WARNING):
+            converter = PDFMinerToDocument()
+            converter.run(sources=[fake_pdf])
+
+            # 18 of the 52 characters in the stubbed text belong to the two CID sequences
+            expected_message = "Detected 18 undecoded CID characters in 52 characters (34.62%)"
+            assert expected_message in caplog.text