|
2 | 2 | # |
3 | 3 | # SPDX-License-Identifier: Apache-2.0 |
4 | 4 | import logging |
| 5 | +from unittest.mock import patch |
5 | 6 |
|
6 | 7 | import pytest |
7 | 8 |
|
@@ -185,3 +186,56 @@ def test_run_detect_paragraphs_to_be_used_in_split_passage(self, test_files_path |
185 | 186 | "structure, allowing structure to emerge according to the \nneeds of the users.[1] \n\n" |
186 | 187 | ) |
187 | 188 | assert docs["documents"][6].content == expected |
| 189 | + |
def test_detect_undecoded_cid_characters(self):
    """
    Check the statistics returned by `detect_undecoded_cid_characters`.

    Covers text without CID markers, text mixing regular characters with CID markers,
    text made up only of CID markers, and the empty string.
    """
    converter = PDFMinerToDocument()

    plain = "This is a normal text without any CID characters."
    mixed = "Some text with (cid:123) and (cid:456) characters"
    only_cid = "(cid:123)(cid:456)(cid:789)"
    empty = ""

    # Each case: (input text, expected CID character count, expected percentage).
    cases = [
        (plain, 0, 0),
        # Two 9-character markers -> 18 CID characters in total.
        (mixed, len("(cid:123)") + len("(cid:456)"), round((18 / len(mixed)) * 100, 2)),
        (only_cid, len(only_cid), 100.0),
        (empty, 0, 0),
    ]

    for text, expected_cid_chars, expected_percentage in cases:
        stats = converter.detect_undecoded_cid_characters(text)
        assert stats["total_chars"] == len(text)
        assert stats["cid_chars"] == expected_cid_chars
        assert stats["percentage"] == expected_percentage
| 223 | + |
def test_pdfminer_logs_warning_for_cid_characters(self, caplog, monkeypatch):
    """
    Check that running the converter logs a warning when the converted text contains undecoded CID characters.

    `extract_pages` and `PDFMinerToDocument._converter` are patched with stand-ins that return canned values.
    """
    # NOTE(review): the `monkeypatch` fixture is requested but never used in this test.
    source = ByteStream(data=b"fake", meta={"file_path": "test.pdf"})

    def fake_extract_pages(*args, **kwargs):
        return ["mocked page"]

    def fake_converter(*args, **kwargs):
        return "This is text with (cid:123) and (cid:456) characters"

    pages_patch = patch("haystack.components.converters.pdfminer.extract_pages", side_effect=fake_extract_pages)
    converter_patch = patch.object(PDFMinerToDocument, "_converter", side_effect=fake_converter)

    # Context managers are entered left to right: both patches first, then log capturing.
    with pages_patch, converter_patch, caplog.at_level(logging.WARNING):
        PDFMinerToDocument().run(sources=[source])
        assert "Detected 18 undecoded CID characters in 52 characters (34.62%)" in caplog.text
0 commit comments