Skip to content

Commit c037052

Browse files
feat: adding function to detect unmapped CID characters in PDFMinerToDocument (#8992)
* adding function to detect unmapped CID characters * adding release notes * adding test for logs
1 parent 4c9d08a commit c037052

File tree

3 files changed

+102
-0
lines changed

3 files changed

+102
-0
lines changed

haystack/components/converters/pdfminer.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
import io
66
import os
7+
import re
78
from pathlib import Path
89
from typing import Any, Dict, Iterator, List, Optional, Union
910

@@ -18,6 +19,8 @@
1819

1920
logger = logging.getLogger(__name__)
2021

22+
CID_PATTERN = r"\(cid:\d+\)" # regex matching "(cid:<digits>)" placeholders, i.e. characters left undecoded in the extracted text
23+
2124

2225
@component
2326
class PDFMinerToDocument:
@@ -97,6 +100,7 @@ def __init__( # pylint: disable=too-many-positional-arguments
97100
all_texts=all_texts,
98101
)
99102
self.store_full_path = store_full_path
103+
self.cid_pattern = re.compile(CID_PATTERN)
100104

101105
@staticmethod
102106
def _converter(lt_page_objs: Iterator) -> str:
@@ -126,6 +130,32 @@ def _converter(lt_page_objs: Iterator) -> str:
126130

127131
return delimited_pages
128132

133+
def detect_undecoded_cid_characters(self, text: str) -> Dict[str, Any]:
    """
    Measure how much of `text` consists of undecoded CID placeholders such as `(cid:123)`.

    This is useful to detect if the text extractor is not able to extract the text correctly, e.g. if the PDF uses
    non-standard fonts.

    A PDF font may include a ToUnicode map (mapping from character code to Unicode) to support operations like
    searching strings or copy & paste in a PDF viewer. This map immediately provides the mapping the text extractor
    needs. If that map is not available the text extractor cannot decode the CID characters and will return them
    as is.

    see: https://pdfminersix.readthedocs.io/en/latest/faq.html#why-are-there-cid-x-values-in-the-textual-output

    :param text: The text to check for undecoded CID characters.
    :returns:
        A dictionary with the following keys:
        - `total_chars`: the length of `text`.
        - `cid_chars`: the number of characters that belong to `(cid:N)` placeholders, i.e. the summed length of
          all matches (not the number of placeholders).
        - `percentage`: `cid_chars` as a percentage of `total_chars`, rounded to two decimals (`0.0` for empty
          text).
    """
    total_chars = len(text)

    # Only the width of each match is needed, so iterate lazily over match objects
    # instead of materializing a list of matched strings.
    cid_chars = sum(match.end() - match.start() for match in self.cid_pattern.finditer(text))

    # Guard against division by zero on empty input and keep the value a float in every case.
    percentage = round(cid_chars / total_chars * 100, 2) if total_chars else 0.0

    return {"total_chars": total_chars, "cid_chars": cid_chars, "percentage": percentage}
158+
129159
@component.output_types(documents=List[Document])
130160
def run(
131161
self,
@@ -178,6 +208,19 @@ def run(
178208

179209
if not self.store_full_path and (file_path := bytestream.meta.get("file_path")):
180210
merged_metadata["file_path"] = os.path.basename(file_path)
211+
212+
analysis = self.detect_undecoded_cid_characters(text)
213+
214+
if analysis["percentage"] > 0:
215+
logger.warning(
216+
"Detected {cid_chars} undecoded CID characters in {total_chars} characters"
217+
" ({percentage}%) in {source}.",
218+
cid_chars=analysis["cid_chars"],
219+
total_chars=analysis["total_chars"],
220+
percentage=analysis["percentage"],
221+
source=source,
222+
)
223+
181224
document = Document(content=text, meta=merged_metadata)
182225
documents.append(document)
183226

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
enhancements:
3+
- |
4+
Added `PDFMinerToDocument` functionality to detect and report undecoded CID characters in PDF text extraction, helping users identify potential
5+
text extraction quality issues when processing PDFs with non-standard fonts.

test/components/converters/test_pdfminer_to_document.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
#
33
# SPDX-License-Identifier: Apache-2.0
44
import logging
5+
from unittest.mock import patch
56

67
import pytest
78

@@ -185,3 +186,56 @@ def test_run_detect_paragraphs_to_be_used_in_split_passage(self, test_files_path
185186
"structure, allowing structure to emerge according to the \nneeds of the users.[1] \n\n"
186187
)
187188
assert docs["documents"][6].content == expected
189+
190+
def test_detect_undecoded_cid_characters(self):
    """
    Check the CID statistics reported for plain, mixed, CID-only and empty input.
    """
    detector = PDFMinerToDocument()

    plain = "This is a normal text without any CID characters."
    mixed = "Some text with (cid:123) and (cid:456) characters"
    only_cids = "(cid:123)(cid:456)(cid:789)"

    # Each scenario: (input text, expected CID character count, expected percentage)
    scenarios = [
        # Text with no CID characters
        (plain, 0, 0),
        # Text with CID characters: 18 characters total
        (mixed, len("(cid:123)") + len("(cid:456)"), round((18 / len(mixed)) * 100, 2)),
        # Text with multiple consecutive CID characters
        (only_cids, len("(cid:123)(cid:456)(cid:789)"), 100.0),
        # Empty text
        ("", 0, 0),
    ]

    for sample, expected_cid_chars, expected_percentage in scenarios:
        stats = detector.detect_undecoded_cid_characters(sample)
        assert stats["total_chars"] == len(sample)
        assert stats["cid_chars"] == expected_cid_chars
        assert stats["percentage"] == expected_percentage
223+
224+
def test_pdfminer_logs_warning_for_cid_characters(self, caplog):
    """
    Test if the component correctly logs a warning when undecoded CID characters are detected.

    Both the page extraction and the page-to-text conversion are patched, so no real PDF is parsed: the converter
    is fed a fixed string containing two CID placeholders (18 of its 52 characters).
    """
    test_data = ByteStream(data=b"fake", meta={"file_path": "test.pdf"})

    # Constant return values are enough here; no side-effect callables are needed.
    patched_extract_pages = patch(
        "haystack.components.converters.pdfminer.extract_pages", return_value=["mocked page"]
    )
    patched_converter = patch.object(
        PDFMinerToDocument, "_converter", return_value="This is text with (cid:123) and (cid:456) characters"
    )

    with patched_extract_pages, patched_converter, caplog.at_level(logging.WARNING):
        converter = PDFMinerToDocument()
        converter.run(sources=[test_data])
        assert "Detected 18 undecoded CID characters in 52 characters (34.62%)" in caplog.text

0 commit comments

Comments
 (0)