Skip to content

Commit fd8b682

Browse files
authored
fix: mean group add param (#2684)
1 parent bdfd975 commit fd8b682

File tree

6 files changed

+43
-20
lines changed

6 files changed

+43
-20
lines changed

CHANGELOG.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
## 0.13.0-dev10
22

3-
### Enhancements
3+
### Enhancements
44

55
* **Add `.metadata.is_continuation` to text-split chunks.** `.metadata.is_continuation=True` is added to second-and-later chunks formed by text-splitting an oversized `Table` element but not to their counterpart `Text` element splits. Add this indicator for `CompositeElement` to allow text-split continuation chunks to be identified for downstream processes that may wish to skip intentionally redundant metadata values in continuation chunks.
66
* **Add `composite_structure_acc` metric to table eval.** Add a new property to `unstructured.metrics.table_eval.TableEvaluation`: `composite_structure_acc`, which is computed from the element-level row and column index and content accuracy scores.

examples/chroma-news-of-the-day/news-of-the-day.ipynb

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -187,7 +187,9 @@
187187
"metadata": {},
188188
"outputs": [],
189189
"source": [
190-
"query_docs = vectorstore.similarity_search(\"What is behind the rapid increase in car insurance rates?\", k=1)"
190+
"query_docs = vectorstore.similarity_search(\n",
191+
" \"What is behind the rapid increase in car insurance rates?\", k=1\n",
192+
")"
191193
]
192194
},
193195
{

examples/winter-sports/winter_sports.ipynb

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,8 @@
99
"source": [
1010
"# Warning control\n",
1111
"import warnings\n",
12-
"warnings.filterwarnings('ignore')"
12+
"\n",
13+
"warnings.filterwarnings(\"ignore\")"
1314
]
1415
},
1516
{
@@ -67,7 +68,8 @@
6768
],
6869
"source": [
6970
"from IPython.display import Image\n",
70-
"Image(filename='images/winter-sports-cover.png', height=400, width=400) "
71+
"\n",
72+
"Image(filename=\"images/winter-sports-cover.png\", height=400, width=400)"
7173
]
7274
},
7375
{
@@ -80,7 +82,7 @@
8082
"filename = \"example_files/winter-sports.epub\"\n",
8183
"\n",
8284
"with open(filename, \"rb\") as f:\n",
83-
" files=shared.Files(\n",
85+
" files = shared.Files(\n",
8486
" content=f.read(),\n",
8587
" file_name=filename,\n",
8688
" )\n",
@@ -174,6 +176,7 @@
174176
"outputs": [],
175177
"source": [
176178
"from unstructured.staging.base import dict_to_elements\n",
179+
"\n",
177180
"elements = dict_to_elements(resp.elements)"
178181
]
179182
},
@@ -303,10 +306,9 @@
303306
}
304307
],
305308
"source": [
306-
"qa_chain.invoke({\n",
307-
" \"question\": \"What is the most popular winter sport in Switzerland?\",\n",
308-
" \"chat_history\": []\n",
309-
"})[\"answer\"]"
309+
"qa_chain.invoke(\n",
310+
" {\"question\": \"What is the most popular winter sport in Switzerland?\", \"chat_history\": []}\n",
311+
")[\"answer\"]"
310312
]
311313
},
312314
{

test_unstructured/metrics/test_evaluate.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -324,7 +324,7 @@ def test_get_mean_grouping_all_file():
324324
data_input=filtered_df,
325325
export_dir=export_dir,
326326
eval_name="text_extraction",
327-
export_name="two-filename-agg-cct.tsv",
327+
export_filename="two-filename-agg-cct.tsv",
328328
)
329329
grouped_df = pd.read_csv(os.path.join(export_dir, "two-filename-agg-cct.tsv"), sep="\t")
330330

@@ -356,7 +356,7 @@ def test_get_mean_grouping_all_file_txt():
356356
data_input=filtered_df,
357357
export_dir=export_dir,
358358
eval_name="text_extraction",
359-
export_name="two-filename-agg-cct.tsv",
359+
export_filename="two-filename-agg-cct.tsv",
360360
)
361361
grouped_df = pd.read_csv(os.path.join(export_dir, "two-filename-agg-cct.tsv"), sep="\t")
362362

unstructured/ingest/evaluate.py

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -159,12 +159,30 @@ def measure_element_type_accuracy_command(
159159
help="Evaluated metric. Expecting one of 'text_extraction' or 'element_type'",
160160
)
161161
@click.option(
162-
"--export_name", type=str, help="Optional. Define your file name for the output here."
162+
"--agg_name",
163+
type=str,
164+
help="String to use with export filename. Default is `cct` for `text_extraction` \
165+
and `element-type` for `element_type`",
166+
)
167+
@click.option(
168+
"--export_filename", type=str, help="Optional. Define your file name for the output here."
163169
)
164170
def get_mean_grouping_command(
165-
group_by: str, data_input: str, export_dir: str, eval_name: str, export_name: str
171+
group_by: str,
172+
data_input: str,
173+
export_dir: str,
174+
eval_name: str,
175+
agg_name: Optional[str] = None,
176+
export_filename: Optional[str] = None,
166177
):
167-
return get_mean_grouping(group_by, data_input, export_dir, eval_name, export_name)
178+
return get_mean_grouping(
179+
group_by=group_by,
180+
data_input=data_input,
181+
export_dir=export_dir,
182+
eval_name=eval_name,
183+
agg_name=agg_name,
184+
export_filename=export_filename,
185+
)
168186

169187

170188
@main.command()

unstructured/metrics/evaluate.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -202,7 +202,7 @@ def get_mean_grouping(
202202
export_dir: str,
203203
eval_name: str,
204204
agg_name: Optional[str] = None,
205-
export_name: Optional[str] = None,
205+
export_filename: Optional[str] = None,
206206
) -> None:
207207
"""Aggregates accuracy and missing metrics by column name 'doctype' or 'connector',
208208
or 'all' for all rows. Export to TSV.
@@ -274,10 +274,10 @@ def get_mean_grouping(
274274
if "grouping_key" in grouped_df.columns.get_level_values(0):
275275
grouped_df = grouped_df.drop("grouping_key", axis=1, level=0)
276276

277-
if export_name:
278-
if not export_name.endswith(".tsv"):
279-
export_name = export_name + ".tsv"
280-
_write_to_file(export_dir, export_name, grouped_df)
277+
if export_filename:
278+
if not export_filename.endswith(".tsv"):
279+
export_filename = export_filename + ".tsv"
280+
_write_to_file(export_dir, export_filename, grouped_df)
281281
else:
282282
_write_to_file(export_dir, f"all-{group_by}-agg-{agg_name}.tsv", grouped_df)
283283

@@ -299,6 +299,7 @@ def measure_table_structure_accuracy(
299299
300300
Calculates:
301301
- table found accuracy
302+
- table level accuracy
302303
- element in column index accuracy
303304
- element in row index accuracy
304305
- element's column content accuracy
@@ -315,7 +316,7 @@ def measure_table_structure_accuracy(
315316
for doc in tqdm(output_list, leave=False, disable=not visualize): # type: ignore
316317
doc_path = Path(doc)
317318
out_filename = doc_path.stem
318-
doctype = Path(out_filename).suffix
319+
doctype = Path(out_filename).suffix[1:]
319320
src_gt_filename = out_filename + ".json"
320321
connector = doc_path.parts[-2] if len(doc_path.parts) > 1 else None
321322

0 commit comments

Comments
 (0)