Skip to content

Commit 609a08a

Browse files
authored
remove unused _with_spans metric (#3342)
The table metrics that consider spans are not used and they interfere with the output, so I have removed them from the code. However, I have left `table_as_cells` in the source code — it may still be useful for users.
1 parent caea73c commit 609a08a

File tree

4 files changed

+12
-28
lines changed

4 files changed

+12
-28
lines changed

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
1-
## 0.14.10-dev11
1+
## 0.14.10-dev12
22

33
### Enhancements
44

55
* **Update unstructured-client dependency** Change unstructured-client dependency pin back to
66
greater than min version and updated tests that were failing given the update.
77
* **`.doc` files are now supported in the `arm64` image.** `libreoffice24` is added to the `arm64` image, meaning `.doc` files are now supported. We have follow-on work planned to investigate adding `.ppt` support for `arm64` as well.
88
* Add table detection metrics: recall, precision and f1
9+
* Remove unused _with_spans metrics
910

1011
### Features
1112

test_unstructured/metrics/test_evaluate.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ def test_text_extraction_evaluation():
115115
UNSTRUCTURED_TABLE_STRUCTURE_DIRNAME,
116116
GOLD_TABLE_STRUCTURE_DIRNAME,
117117
Path("IRS-2023-Form-1095-A.pdf.json"),
118-
23,
118+
13,
119119
{},
120120
),
121121
(
@@ -191,7 +191,7 @@ def test_table_structure_evaluation():
191191
assert os.path.isfile(os.path.join(export_dir, "aggregate-table-structure-accuracy.tsv"))
192192
df = pd.read_csv(os.path.join(export_dir, "all-docs-table-structure-accuracy.tsv"), sep="\t")
193193
assert len(df) == 1
194-
assert len(df.columns) == 23
194+
assert len(df.columns) == 13
195195
assert df.iloc[0].filename == "IRS-2023-Form-1095-A.pdf"
196196

197197

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.14.10-dev11" # pragma: no cover
1+
__version__ = "0.14.10-dev12" # pragma: no cover

unstructured/metrics/evaluate.py

Lines changed: 7 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -229,35 +229,18 @@ def _process_document(self, doc: Path) -> list:
229229
source_type="html",
230230
)
231231
report_from_html = processor_from_text_as_html.process_file()
232-
233-
processor_from_table_as_cells = TableEvalProcessor.from_json_files(
234-
prediction_file=prediction_file,
235-
ground_truth_file=ground_truth_file,
236-
cutoff=self.cutoff,
237-
source_type="cells",
238-
)
239-
report_from_cells = processor_from_table_as_cells.process_file()
240-
return (
241-
[
242-
out_filename,
243-
doctype,
244-
connector,
245-
]
246-
+ [getattr(report_from_html, metric) for metric in self.supported_metric_names]
247-
+ [getattr(report_from_cells, metric) for metric in self.supported_metric_names]
248-
)
232+
return [
233+
out_filename,
234+
doctype,
235+
connector,
236+
] + [getattr(report_from_html, metric) for metric in self.supported_metric_names]
249237

250238
def _generate_dataframes(self, rows):
251-
# NOTE(mike): this logic should be simplified
252-
suffixed_table_eval_metrics = [
253-
f"{metric}_with_spans" for metric in self.supported_metric_names
254-
]
255-
combined_table_metrics = self.supported_metric_names + suffixed_table_eval_metrics
256239
headers = [
257240
"filename",
258241
"doctype",
259242
"connector",
260-
] + combined_table_metrics
243+
] + self.supported_metric_names
261244

262245
df = pd.DataFrame(rows, columns=headers)
263246
has_tables_df = df[df["total_tables"] > 0]
@@ -268,7 +251,7 @@ def _generate_dataframes(self, rows):
268251
).reset_index()
269252
else:
270253
element_metrics_results = {}
271-
for metric in combined_table_metrics:
254+
for metric in self.supported_metric_names:
272255
metric_df = has_tables_df[has_tables_df[metric].notnull()]
273256
agg_metric = metric_df[metric].agg([_mean, _stdev, _pstdev, _count]).transpose()
274257
if agg_metric.empty:

0 commit comments

Comments
 (0)