Skip to content

Commit 2b1a401

Browse files
authored
Merge branch 'main' into pprados/fix_password
2 parents 09f7c40 + 2f06d5a commit 2b1a401

File tree

8 files changed

+336
-7
lines changed

8 files changed

+336
-7
lines changed

.gitignore

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -190,7 +190,7 @@ tags
190190
# Persistent undo
191191
[._]*.un~
192192

193-
.DS_Store
193+
*.DS_Store
194194

195195
# Ruff cache
196196
.ruff_cache/

CHANGELOG.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,15 @@
1+
## 0.16.10-dev0
2+
3+
### Enhancements
4+
5+
- **Enhance quote standardization tests** with additional Unicode scenarios.
6+
7+
### Features
8+
9+
### Fixes
10+
11+
- **Fix original file doctype detection** from cct converted file paths for metrics calculation.
12+
113
## 0.16.9
214

315
### Enhancements

Makefile

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,10 @@ test-extra-pypandoc:
198198
test-extra-xlsx:
199199
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest test_unstructured/partition/test_xlsx.py
200200

201+
.PHONY: test-text-extraction-evaluate
202+
test-text-extraction-evaluate:
203+
PYTHONPATH=. CI=$(CI) ${PYTHON} -m pytest test_unstructured/metrics/test_text_extraction.py
204+
201205
## check: runs linters (includes tests)
202206
.PHONY: check
203207
check: check-ruff check-black check-flake8 check-version

test_unstructured/metrics/test_evaluate.py

Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import pathlib
33
import shutil
44
from pathlib import Path
5+
from unittest.mock import MagicMock, patch
56

67
import numpy as np
78
import pandas as pd
@@ -57,6 +58,52 @@
5758
)
5859

5960

61+
@pytest.fixture
def mock_dependencies():
    """Patch the collaborators used by the metrics calculators and yield the mocks.

    All patches stay active while the requesting test runs. The yielded dict
    maps a ``mock_*`` key to the corresponding mock, each already configured
    with a canned ``return_value`` or ``side_effect``.
    """
    with patch(
        "unstructured.metrics.evaluate.calculate_accuracy"
    ) as accuracy, patch(
        "unstructured.metrics.evaluate.calculate_percent_missing_text"
    ) as percent_missing_text, patch.object(
        TextExtractionMetricsCalculator, "_get_ccts"
    ) as get_ccts, patch(
        "unstructured.metrics.evaluate.get_element_type_frequency"
    ) as element_type_frequency, patch(
        "unstructured.metrics.evaluate.calculate_element_type_percent_match"
    ) as element_type_percent_match, patch(
        "unstructured.metrics.evaluate._read_text_file"
    ) as read_text_file, patch.object(
        Path, "exists"
    ) as path_exists, patch(
        "unstructured.metrics.evaluate.TableEvalProcessor.from_json_files"
    ) as table_eval_processor_from_json_files, patch.object(
        TableStructureMetricsCalculator, "supported_metric_names"
    ) as supported_metric_names:
        # Fixed scalar results for the metric helpers.
        accuracy.return_value = 0.5
        percent_missing_text.return_value = 0.5
        element_type_percent_match.return_value = 0.5

        # Canned inputs; each side_effect list supplies one value per call, in order.
        get_ccts.return_value = ["output_cct", "source_cct"]
        element_type_frequency.side_effect = [{"ele1": 1}, {"ele2": 3}]
        read_text_file.side_effect = ["output_text", "source_text"]

        supported_metric_names.return_value = ["table_level_acc"]
        # Path.exists is patched on the class, so every Path reports True here.
        path_exists.return_value = True

        yield {
            "mock_calculate_accuracy": accuracy,
            "mock_calculate_percent_missing_text": percent_missing_text,
            "mock_get_ccts": get_ccts,
            "mock_get_element_type_frequency": element_type_frequency,
            "mock_read_text_file": read_text_file,
            "mock_calculate_element_type_percent_match": element_type_percent_match,
            "mock_table_eval_processor_from_json_files": table_eval_processor_from_json_files,
            "mock_supported_metric_names": supported_metric_names,
            "mock_path_exists": path_exists,
        }
105+
106+
60107
@pytest.fixture()
61108
def _cleanup_after_test():
62109
"""Fixture for removing side-effects of running tests in this file."""
@@ -139,6 +186,114 @@ def test_process_document_returns_the_correct_amount_of_values(
139186
assert len(output_list) == expected_length
140187

141188

189+
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.usefixtures("_cleanup_after_test", "mock_dependencies")
@pytest.mark.parametrize(
    ("calculator_class", "output_dirname", "source_dirname", "path", "kwargs"),
    [
        (
            TextExtractionMetricsCalculator,
            UNSTRUCTURED_CCT_DIRNAME,
            GOLD_CCT_DIRNAME,
            Path("2310.03502text_to_image_synthesis1-7.pdf.txt"),
            {"document_type": "txt"},
        ),
    ],
)
def test_TextExtractionMetricsCalculator_process_document_returns_the_correct_doctype(
    mock_dependencies, calculator_class, output_dirname, source_dirname, path, kwargs
):
    """The row for a ``*.pdf.txt`` document reports ``.pdf`` as its doctype.

    Also checks that the accuracy, percent-missing-text and ``_get_ccts`` mocks
    are each called exactly once while the document is processed.
    """
    testing_root = Path(TESTING_FILE_DIR)
    calculator = calculator_class(
        documents_dir=testing_root / output_dirname,
        ground_truths_dir=testing_root / source_dirname,
        **kwargs,
    )

    row = calculator._process_document(path)

    # Index 1 of the returned row is the doctype.
    assert row[1] == ".pdf"
    for mock_name in (
        "mock_calculate_accuracy",
        "mock_calculate_percent_missing_text",
        "mock_get_ccts",
    ):
        assert mock_dependencies[mock_name].call_count == 1
218+
219+
220+
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.usefixtures("_cleanup_after_test", "mock_dependencies")
@pytest.mark.parametrize(
    ("calculator_class", "output_dirname", "source_dirname", "path", "kwargs"),
    [
        (
            TableStructureMetricsCalculator,
            UNSTRUCTURED_TABLE_STRUCTURE_DIRNAME,
            GOLD_TABLE_STRUCTURE_DIRNAME,
            Path("tablib-627mTABLES-2310.07875-p7.pdf.json"),
            {},
        ),
    ],
)
def test_TableStructureMetricsCalculator_process_document_returns_the_correct_doctype(
    mock_dependencies, calculator_class, output_dirname, source_dirname, path, kwargs
):
    """The row for a ``*.pdf.json`` table document reports ``.pdf`` as its doctype.

    The table evaluation processor is mocked, so no real table comparison runs;
    the test also checks that the processor factory is invoked exactly once.
    """
    output_dir = Path(TESTING_FILE_DIR) / output_dirname
    source_dir = Path(TESTING_FILE_DIR) / source_dirname
    calculator = calculator_class(documents_dir=output_dir, ground_truths_dir=source_dir, **kwargs)
    # Pin the calculator's private paths explicitly so the document under test
    # is the only ground truth it knows about.
    calculator._ground_truths_dir = source_dir
    calculator._documents_dir = output_dir
    calculator._ground_truth_paths = [source_dir / path]

    # Canned evaluation report. The attribute must be spelled exactly like the
    # metric name the mock_dependencies fixture supplies ("table_level_acc");
    # a misspelled attribute on a MagicMock is silently accepted, which would
    # leave the real metric attribute as an auto-created mock instead of 0.83.
    mock_report = MagicMock()
    mock_report.total_predicted_tables = 3
    mock_report.table_level_acc = 0.83
    mock_table_eval_processor_from_json_files = mock_dependencies[
        "mock_table_eval_processor_from_json_files"
    ]
    mock_table_eval_processor_from_json_files.return_value.process_file.return_value = mock_report

    output_list = calculator._process_document(path)

    # Index 1 of the returned row is the doctype.
    assert output_list[1] == ".pdf"
    assert mock_table_eval_processor_from_json_files.call_count == 1
262+
263+
264+
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.usefixtures("_cleanup_after_test", "mock_dependencies")
@pytest.mark.parametrize(
    ("calculator_class", "output_dirname", "source_dirname", "path", "kwargs"),
    [
        (
            ElementTypeMetricsCalculator,
            UNSTRUCTURED_OUTPUT_DIRNAME,
            GOLD_ELEMENT_TYPE_DIRNAME,
            Path("IRS-form.1987.pdf.json"),
            {},
        ),
    ],
)
def test_ElementTypeMetricsCalculator_process_document_returns_the_correct_doctype(
    mock_dependencies, calculator_class, output_dirname, source_dirname, path, kwargs
):
    """The row for a ``*.pdf.json`` element document reports ``.pdf`` as its doctype.

    Also checks how often each mocked helper is called: two file reads, two
    element-type frequency computations and one percent-match calculation.
    """
    testing_root = Path(TESTING_FILE_DIR)
    calculator = calculator_class(
        documents_dir=testing_root / output_dirname,
        ground_truths_dir=testing_root / source_dirname,
        **kwargs,
    )

    row = calculator._process_document(path)

    # Index 1 of the returned row is the doctype.
    assert row[1] == ".pdf"

    expected_call_counts = (
        ("mock_read_text_file", 2),
        ("mock_get_element_type_frequency", 2),
        ("mock_calculate_element_type_percent_match", 1),
    )
    for mock_name, expected_calls in expected_call_counts:
        assert mock_dependencies[mock_name].call_count == expected_calls
296+
142297
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
143298
@pytest.mark.usefixtures("_cleanup_after_test")
144299
def test_text_extraction_evaluation_type_txt():

test_unstructured/metrics/test_text_extraction.py

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -340,6 +340,74 @@ def test_prepare_string(text, expected):
340340
assert text_extraction.prepare_str(text) == text
341341

342342

343+
@pytest.mark.parametrize(
    ("input_text", "expected_output"),
    [
        # ASCII double and single quotes together in one sentence
        (
            "She said \"Hello\" and then whispered 'Goodbye' before leaving.",
            "She said \"Hello\" and then whispered 'Goodbye' before leaving.",
        ),
        # Double low-9 opening quote
        (
            "„To be, or not to be, that is the question\" - Shakespeare's famous quote.",
            '"To be, or not to be, that is the question" - Shakespeare\'s famous quote.',
        ),
        # Double angle quotes wrapping ASCII double quotes
        (
            '«When he said "life is beautiful," I believed him» wrote Maria.',
            '"When he said "life is beautiful," I believed him" wrote Maria.',
        ),
        # Heavy double ornament quotes around dialogue
        (
            "❝Do you remember when we first met?❞ she asked with a smile.",
            '"Do you remember when we first met?" she asked with a smile.',
        ),
        # Double prime quotation marks around text with punctuation
        (
            "〝The meeting starts at 10:00, don't be late!〟 announced the manager.",
            '"The meeting starts at 10:00, don\'t be late!" announced the manager.',
        ),
        # Corner brackets wrapping ASCII double quotes
        (
            '「He told me "This is important" yesterday」, she explained.',
            "'He told me \"This is important\" yesterday', she explained.",
        ),
        # White corner brackets around several sentences
        (
            "『The sun was setting. The birds were singing. It was peaceful.』",
            "'The sun was setting. The birds were singing. It was peaceful.'",
        ),
        # Vertical corner brackets around digits and symbols
        ("﹂Meeting #123 @ 15:00 - Don't forget!﹁", "'Meeting #123 @ 15:00 - Don't forget!'"),
        # Many different quote styles in a single string
        (
            '「Hello」, ❝World❞, "Test", \'Example\', „Quote", «Final»',
            '\'Hello\', "World", "Test", \'Example\', "Quote", "Final"',
        ),
        # ASCII apostrophes only; the text is expected back unchanged
        ("It's John's book, isn't it?", "It's John's book, isn't it?"),
        # Single angle quotes wrapping an apostrophe and ASCII double quotes
        (
            '‹Testing the system\'s capability for "quoted" text›',
            "'Testing the system's capability for \"quoted\" text'",
        ),
        # Heavy single ornament quotes around several sentences
        (
            "❛First sentence. Second sentence. Third sentence.❜",
            "'First sentence. Second sentence. Third sentence.'",
        ),
        # Several quote styles spread through one sentence
        (
            '「Chapter 1」: ❝The Beginning❞ - „A new story" begins «today».',
            '\'Chapter 1\': "The Beginning" - "A new story" begins "today".',
        ),
    ],
)
def test_standardize_quotes(input_text, expected_output):
    """Each Unicode quote style in the input comes back as the expected ASCII quote."""
    standardized = text_extraction.standardize_quotes(input_text)
    assert standardized == expected_output
409+
410+
343411
@pytest.mark.parametrize(
344412
("output_text", "source_text", "expected_percentage"),
345413
[

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.16.9" # pragma: no cover
1+
__version__ = "0.16.10-dev0" # pragma: no cover

unstructured/metrics/evaluate.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -248,7 +248,7 @@ def default_agg_tsv_name(self):
248248
def _process_document(self, doc: Path) -> Optional[list]:
249249
doc_path = Path(doc)
250250
out_filename = doc_path.stem
251-
doctype = Path(out_filename).suffix[1:]
251+
doctype = Path(out_filename).suffix
252252
src_gt_filename = out_filename + ".json"
253253
connector = doc_path.parts[-2] if len(doc_path.parts) > 1 else None
254254

@@ -407,7 +407,7 @@ def _validate_inputs(self):
407407

408408
def _process_document(self, doc: Path) -> Optional[list]:
409409
filename = doc.stem
410-
doctype = doc.suffixes[0]
410+
doctype = doc.suffixes[-2]
411411
connector = doc.parts[0] if len(doc.parts) > 1 else None
412412

413413
output_cct, source_cct = self._get_ccts(doc)
@@ -482,7 +482,7 @@ def default_agg_tsv_name(self) -> str:
482482

483483
def _process_document(self, doc: Path) -> Optional[list]:
484484
filename = doc.stem
485-
doctype = doc.suffixes[0]
485+
doctype = doc.suffixes[-2]
486486
connector = doc.parts[0] if len(doc.parts) > 1 else None
487487

488488
output = get_element_type_frequency(_read_text_file(self.documents_dir / doc))

0 commit comments

Comments
 (0)