Skip to content

Commit 609a08a

Browse files
authored
remove unused _with_spans metric (#3342)
The table metrics that consider spans are not used and they interfere with the output, so I have removed them from the code. However, I have left `table_as_cells` in the source code — it may still be useful for users.
1 parent caea73c commit 609a08a

File tree

4 files changed

+12
-28
lines changed

4 files changed

+12
-28
lines changed

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
1-
## 0.14.10-dev11
1+
## 0.14.10-dev12
22

33
### Enhancements
44

55
* **Update unstructured-client dependency** Change unstructured-client dependency pin back to
66
greater than min version and updated tests that were failing given the update.
77
* **`.doc` files are now supported in the `arm64` image.** `libreoffice24` is added to the `arm64` image, meaning `.doc` files are now supported. We have follow-on work planned to investigate adding `.ppt` support for `arm64` as well.
88
* Add table detection metrics: recall, precision and f1
9+
* Remove unused _with_spans metrics
910

1011
### Features
1112

test_unstructured/metrics/test_evaluate.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ def test_text_extraction_evaluation():
115115
UNSTRUCTURED_TABLE_STRUCTURE_DIRNAME,
116116
GOLD_TABLE_STRUCTURE_DIRNAME,
117117
Path("IRS-2023-Form-1095-A.pdf.json"),
118-
23,
118+
13,
119119
{},
120120
),
121121
(
@@ -191,7 +191,7 @@ def test_table_structure_evaluation():
191191
assert os.path.isfile(os.path.join(export_dir, "aggregate-table-structure-accuracy.tsv"))
192192
df = pd.read_csv(os.path.join(export_dir, "all-docs-table-structure-accuracy.tsv"), sep="\t")
193193
assert len(df) == 1
194-
assert len(df.columns) == 23
194+
assert len(df.columns) == 13
195195
assert df.iloc[0].filename == "IRS-2023-Form-1095-A.pdf"
196196

197197

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.14.10-dev11" # pragma: no cover
1+
__version__ = "0.14.10-dev12" # pragma: no cover

unstructured/metrics/evaluate.py

Lines changed: 7 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -229,35 +229,18 @@ def _process_document(self, doc: Path) -> list:
229229
source_type="html",
230230
)
231231
report_from_html = processor_from_text_as_html.process_file()
232-
233-
processor_from_table_as_cells = TableEvalProcessor.from_json_files(
234-
prediction_file=prediction_file,
235-
ground_truth_file=ground_truth_file,
236-
cutoff=self.cutoff,
237-
source_type="cells",
238-
)
239-
report_from_cells = processor_from_table_as_cells.process_file()
240-
return (
241-
[
242-
out_filename,
243-
doctype,
244-
connector,
245-
]
246-
+ [getattr(report_from_html, metric) for metric in self.supported_metric_names]
247-
+ [getattr(report_from_cells, metric) for metric in self.supported_metric_names]
248-
)
232+
return [
233+
out_filename,
234+
doctype,
235+
connector,
236+
] + [getattr(report_from_html, metric) for metric in self.supported_metric_names]
249237

250238
def _generate_dataframes(self, rows):
251-
# NOTE(mike): this logic should be simplified
252-
suffixed_table_eval_metrics = [
253-
f"{metric}_with_spans" for metric in self.supported_metric_names
254-
]
255-
combined_table_metrics = self.supported_metric_names + suffixed_table_eval_metrics
256239
headers = [
257240
"filename",
258241
"doctype",
259242
"connector",
260-
] + combined_table_metrics
243+
] + self.supported_metric_names
261244

262245
df = pd.DataFrame(rows, columns=headers)
263246
has_tables_df = df[df["total_tables"] > 0]
@@ -268,7 +251,7 @@ def _generate_dataframes(self, rows):
268251
).reset_index()
269252
else:
270253
element_metrics_results = {}
271-
for metric in combined_table_metrics:
254+
for metric in self.supported_metric_names:
272255
metric_df = has_tables_df[has_tables_df[metric].notnull()]
273256
agg_metric = metric_df[metric].agg([_mean, _stdev, _pstdev, _count]).transpose()
274257
if agg_metric.empty:

0 commit comments

Comments
 (0)