fix: add `agg_name` param to mean grouping command and rename `export_name` to `export_filename` (#2684)

Klaijan · web-flow · commit fd8b68219429 · 2024-03-22T15:16:23.000Z
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,6 +1,6 @@
 ## 0.13.0-dev10
 
-### Enhancements
+### Enhancements 
 
 * **Add `.metadata.is_continuation` to text-split chunks.** `.metadata.is_continuation=True` is added to second-and-later chunks formed by text-splitting an oversized `Table` element but not to their counterpart `Text` element splits. Add this indicator for `CompositeElement` to allow text-split continuation chunks to be identified for downstream processes that may wish to skip intentionally redundant metadata values in continuation chunks.
 * **Add `composite_structure_acc` metric to table eval.** Add a new property to `unstructured.metrics.table_eval.TableEvaluation`: `composite_structure_acc`, which is computed from the element-level row and column index and content accuracy scores.
diff --git a/examples/chroma-news-of-the-day/news-of-the-day.ipynb b/examples/chroma-news-of-the-day/news-of-the-day.ipynb
@@ -187,7 +187,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "query_docs = vectorstore.similarity_search(\"What is behind the rapid increase in car insurance rates?\", k=1)"
+    "query_docs = vectorstore.similarity_search(\n",
+    "    \"What is behind the rapid increase in car insurance rates?\", k=1\n",
+    ")"
    ]
   },
   {
diff --git a/examples/winter-sports/winter_sports.ipynb b/examples/winter-sports/winter_sports.ipynb
@@ -9,7 +9,8 @@
    "source": [
     "# Warning control\n",
     "import warnings\n",
-    "warnings.filterwarnings('ignore')"
+    "\n",
+    "warnings.filterwarnings(\"ignore\")"
    ]
   },
   {
@@ -67,7 +68,8 @@
    ],
    "source": [
     "from IPython.display import Image\n",
-    "Image(filename='images/winter-sports-cover.png', height=400, width=400) "
+    "\n",
+    "Image(filename=\"images/winter-sports-cover.png\", height=400, width=400)"
    ]
   },
   {
@@ -80,7 +82,7 @@
     "filename = \"example_files/winter-sports.epub\"\n",
     "\n",
     "with open(filename, \"rb\") as f:\n",
-    "    files=shared.Files(\n",
+    "    files = shared.Files(\n",
     "        content=f.read(),\n",
     "        file_name=filename,\n",
     "    )\n",
@@ -174,6 +176,7 @@
    "outputs": [],
    "source": [
     "from unstructured.staging.base import dict_to_elements\n",
+    "\n",
     "elements = dict_to_elements(resp.elements)"
    ]
   },
@@ -303,10 +306,9 @@
     }
    ],
    "source": [
-    "qa_chain.invoke({\n",
-    "    \"question\": \"What is the most popular winter sport in Switzerland?\",\n",
-    "    \"chat_history\": []\n",
-    "})[\"answer\"]"
+    "qa_chain.invoke(\n",
+    "    {\"question\": \"What is the most popular winter sport in Switzerland?\", \"chat_history\": []}\n",
+    ")[\"answer\"]"
    ]
   },
   {
diff --git a/test_unstructured/metrics/test_evaluate.py b/test_unstructured/metrics/test_evaluate.py
@@ -324,7 +324,7 @@ def test_get_mean_grouping_all_file():
         data_input=filtered_df,
         export_dir=export_dir,
         eval_name="text_extraction",
-        export_name="two-filename-agg-cct.tsv",
+        export_filename="two-filename-agg-cct.tsv",
     )
     grouped_df = pd.read_csv(os.path.join(export_dir, "two-filename-agg-cct.tsv"), sep="\t")
 
@@ -356,7 +356,7 @@ def test_get_mean_grouping_all_file_txt():
         data_input=filtered_df,
         export_dir=export_dir,
         eval_name="text_extraction",
-        export_name="two-filename-agg-cct.tsv",
+        export_filename="two-filename-agg-cct.tsv",
     )
     grouped_df = pd.read_csv(os.path.join(export_dir, "two-filename-agg-cct.tsv"), sep="\t")
 
diff --git a/unstructured/ingest/evaluate.py b/unstructured/ingest/evaluate.py
@@ -159,12 +159,30 @@ def measure_element_type_accuracy_command(
     help="Evaluated metric. Expecting one of 'text_extraction' or 'element_type'",
 )
 @click.option(
-    "--export_name", type=str, help="Optional. Define your file name for the output here."
+    "--agg_name",
+    type=str,
+    help="String to use with export filename. Default is `cct` for `text_extraction` \
+        and `element-type` for `element_type`",
+)
+@click.option(
+    "--export_filename", type=str, help="Optional. Define your file name for the output here."
 )
 def get_mean_grouping_command(
-    group_by: str, data_input: str, export_dir: str, eval_name: str, export_name: str
+    group_by: str,
+    data_input: str,
+    export_dir: str,
+    eval_name: str,
+    agg_name: Optional[str] = None,
+    export_filename: Optional[str] = None,
 ):
-    return get_mean_grouping(group_by, data_input, export_dir, eval_name, export_name)
+    return get_mean_grouping(
+        group_by=group_by,
+        data_input=data_input,
+        export_dir=export_dir,
+        eval_name=eval_name,
+        agg_name=agg_name,
+        export_filename=export_filename,
+    )
 
 
 @main.command()
diff --git a/unstructured/metrics/evaluate.py b/unstructured/metrics/evaluate.py
@@ -202,7 +202,7 @@ def get_mean_grouping(
     export_dir: str,
     eval_name: str,
     agg_name: Optional[str] = None,
-    export_name: Optional[str] = None,
+    export_filename: Optional[str] = None,
 ) -> None:
     """Aggregates accuracy and missing metrics by column name 'doctype' or 'connector',
     or 'all' for all rows. Export to TSV.
@@ -274,10 +274,10 @@ def get_mean_grouping(
     if "grouping_key" in grouped_df.columns.get_level_values(0):
         grouped_df = grouped_df.drop("grouping_key", axis=1, level=0)
 
-    if export_name:
-        if not export_name.endswith(".tsv"):
-            export_name = export_name + ".tsv"
-        _write_to_file(export_dir, export_name, grouped_df)
+    if export_filename:
+        if not export_filename.endswith(".tsv"):
+            export_filename = export_filename + ".tsv"
+        _write_to_file(export_dir, export_filename, grouped_df)
     else:
         _write_to_file(export_dir, f"all-{group_by}-agg-{agg_name}.tsv", grouped_df)
 
@@ -299,6 +299,7 @@ def measure_table_structure_accuracy(
 
     Calculates:
         - table found accuracy
+        - table level accuracy
         - element in column index accuracy
         - element in row index accuracy
         - element's column content accuracy
@@ -315,7 +316,7 @@ def measure_table_structure_accuracy(
     for doc in tqdm(output_list, leave=False, disable=not visualize):  # type: ignore
         doc_path = Path(doc)
         out_filename = doc_path.stem
-        doctype = Path(out_filename).suffix
+        doctype = Path(out_filename).suffix[1:]
         src_gt_filename = out_filename + ".json"
         connector = doc_path.parts[-2] if len(doc_path.parts) > 1 else None
 

Original file line number	Diff line number	Diff line change
`@@ -187,7 +187,9 @@`
`187`	`187`	`"metadata": {},`
`188`	`188`	`"outputs": [],`
`189`	`189`	`"source": [`
`190`		`- "query_docs = vectorstore.similarity_search(\"What is behind the rapid increase in car insurance rates?\", k=1)"`
	`190`	`+ "query_docs = vectorstore.similarity_search(\n",`
	`191`	`+ " \"What is behind the rapid increase in car insurance rates?\", k=1\n",`
	`192`	`+ ")"`
`191`	`193`	`]`
`192`	`194`	`},`
`193`	`195`	`{`