Commit 29e64eb

feat: table evaluations for fixed html table generation (#3196)
Update to the evaluation script to handle correct HTML syntax for tables. See Unstructured-IO/unstructured-inference#355 for details. This change:

- modifies the transformation of HTML tables into the evaluation-internal `cells` format
- fixes the indexing of the output (internal-format cells) when HTML cells use spans
1 parent dadc9c6 commit 29e64eb
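To illustrate the span handling the commit message describes, here is a minimal sketch (not the repository's actual implementation) of flattening an HTML table into cells with row/column indices, where `rowspan`/`colspan` attributes shift the indices of later cells. The helper name `html_to_cells` and the output shape are hypothetical, and BeautifulSoup is used purely for illustration.

```python
# Minimal sketch (not the repository's implementation): flatten an HTML table
# into cells with row/column indices, accounting for rowspan/colspan so that
# cells following a span are indexed past the positions the span occupies.
# The helper name and output shape are illustrative assumptions.
from bs4 import BeautifulSoup


def html_to_cells(html: str) -> list[dict]:
    soup = BeautifulSoup(html, "html.parser")
    cells = []
    occupied = set()  # (row, col) positions already claimed by an earlier span
    for row_idx, tr in enumerate(soup.find_all("tr")):
        col_idx = 0
        for cell in tr.find_all(["td", "th"]):
            # Skip columns still covered by a rowspan/colspan from a previous cell.
            while (row_idx, col_idx) in occupied:
                col_idx += 1
            rowspan = int(cell.get("rowspan", 1))
            colspan = int(cell.get("colspan", 1))
            cells.append({"row": row_idx, "col": col_idx, "content": cell.get_text(strip=True)})
            # Reserve every position the span covers so later cells shift past it.
            for r in range(row_idx, row_idx + rowspan):
                for c in range(col_idx, col_idx + colspan):
                    occupied.add((r, c))
            col_idx += colspan
    return cells


if __name__ == "__main__":
    html = '<table><tr><td rowspan="2">a</td><td>b</td></tr><tr><td>c</td></tr></table>'
    # "c" lands at column 1, not 0, because column 0 of row 1 is taken by the rowspan.
    print(html_to_cells(html))
```

The actual evaluation code works with its own internal formats (the changelog below mentions `deckerd` and `cells`); the occupancy bookkeeping above only illustrates why span-aware indexing matters.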

5 files changed, +487 -61 lines changed


CHANGELOG.md

Lines changed: 2 additions & 1 deletion
@@ -1,4 +1,4 @@
-## 0.14.6-dev5
+## 0.14.6-dev6
 
 ### Enhancements
 
@@ -13,6 +13,7 @@
 * **table metric bug fix** get_element_level_alignment()now will find all the matched indices in predicted table data instead of only returning the first match in the case of multiple matches for the same gt string.
 * **fsspec connector path/permissions bug** V2 fsspec connectors were failing when defined relative filepaths had leading slash. This strips that slash to guarantee the relative path never has it.
 * **Dropbox connector internal file path bugs** Dropbox source connector currently raises exceptions when indexing files due to two issues: a path formatting idiosyncrasy of the Dropbox library and a divergence in the definition of the Dropbox libraries fs.info method, expecting a 'url' parameter rather than 'path'.
+* **update table metric evaluation to handle corrected HTML syntax for tables** This change is connected to the update in [unstructured-inference change](https://github.com/Unstructured-IO/unstructured-inference/pull/355) - fixes transforming HTML table to deckerd and internal cells format.
 
 ## 0.14.5
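The changelog entry above refers to the corrected table syntax: header cells inside `<thead>` are now wrapped in a `<tr>` (see the first hunk of the test file below). A minimal, hedged illustration of why that matters to any parser that walks tables row by row follows; BeautifulSoup with the stdlib `html.parser` is an assumption for illustration, not necessarily the parser the evaluation code uses.

```python
# Minimal sketch of why the <tr> wrapper inside <thead> matters: a row-by-row
# walk over <tr> elements only sees header cells when they sit inside a row.
# BeautifulSoup with the stdlib html.parser is an assumption for illustration.
from bs4 import BeautifulSoup

OLD = "<table><thead><th>r1c1</th><th>r1c2</th></thead><tbody><tr><td>r2c1</td><td>r2c2</td></tr></tbody></table>"
NEW = "<table><thead><tr><th>r1c1</th><th>r1c2</th></tr></thead><tbody><tr><td>r2c1</td><td>r2c2</td></tr></tbody></table>"

for label, html in (("old", OLD), ("new", NEW)):
    rows = BeautifulSoup(html, "html.parser").find_all("tr")
    print(label, [[c.get_text() for c in tr.find_all(["td", "th"])] for tr in rows])
    # old: only the body row is found; the header <th> cells sit outside any <tr>
    # new: both the header row and the body row are found
```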

test_unstructured/metrics/test_table_structure.py

Lines changed: 98 additions & 18 deletions
@@ -33,7 +33,7 @@ def test_table_eval_processor_simple():
         {
             "type": "Table",
             "metadata": {
-                "text_as_html": """<table><thead><th>r1c1</th><th>r1c2</th></thead>
+                "text_as_html": """<table><thead><tr><th>r1c1</th><th>r1c2</th></tr></thead>
 <tbody><tr><td>r2c1</td><td>r2c2</td></tr></tbody></table>"""
             },
         }
@@ -201,14 +201,62 @@ def test_table_eval_processor_when_wrong_source_type():
 @pytest.mark.parametrize(
     "text_as_html",
     [
-        """<table><thead><th>r1c1</th><th>r1c2</th></thead>
-<tbody><tr><td>r2c1</td><td>r2c2</td></tr><tr><td>r3c1</td>
-<td>r3c2</td></tr></tbody></table>""",
-        """<table><tr><th>r1c1</th><th>r1c2</th></tr>
-<tbody><tr><td>r2c1</td><td>r2c2</td></tr><tr><td>r3c1</td>
-<td>r3c2</td></tr></tbody></table>""",
-        """<table><tr><td>r1c1</td><td>r1c2</td></tr><tr><td>r2c1</td>
-<td>r2c2</td></tr><tr><td>r3c1</td><td>r3c2</td></tr></tbody></table>""",
+        """
+<table>
+  <thead>
+    <tr>
+      <th>r1c1</th>
+      <th>r1c2</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td>r2c1</td>
+      <td>r2c2</td>
+    </tr>
+    <tr>
+      <td>r3c1</td>
+      <td>r3c2</td>
+    </tr>
+  </tbody>
+</table>
+        """,
+        """
+<table>
+  <tr>
+    <th>r1c1</th>
+    <th>r1c2</th>
+  </tr>
+  <tbody>
+    <tr>
+      <td>r2c1</td>
+      <td>r2c2</td>
+    </tr>
+    <tr>
+      <td>r3c1</td>
+      <td>r3c2</td>
+    </tr>
+  </tbody>
+</table>
+        """,
+        """
+<table>
+</tbody>
+  <tr>
+    <td>r1c1</td>
+    <td>r1c2</td>
+  </tr>
+  <tr>
+    <td>r2c1</td>
+    <td>r2c2</td>
+  </tr>
+  <tr>
+    <td>r3c1</td>
+    <td>r3c2</td>
+  </tr>
+</tbody>
+</table>
+        """,
     ],
 )
 def test_table_eval_processor_various_table_html_structures(text_as_html):
@@ -285,8 +333,21 @@ def test_table_eval_processor_non_str_values_in_table():
         {
             "type": "Table",
             "metadata": {
-                "text_as_html": """<table><thead><th>11</th><th>12</th></thead>
-<tbody><tr><td>21</td><td>22</td></tr></tbody></table>"""
+                "text_as_html": """
+<table>
+  <thead>
+    <tr>
+      <th>11</th>
+      <th>12</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td>21</td>
+      <td>22</td>
+    </tr>
+  </tbody>
+</table>"""
             },
         }
     ]
@@ -341,19 +402,38 @@ def test_table_eval_processor_non_str_values_in_table():
     assert result.element_col_level_content_acc == 1.0
 
 
-@pytest.mark.xfail(
-    reason="This is expected to fail as table eval metrics does not cover merged cells"
-)
 def test_table_eval_processor_merged_cells():
     prediction = [
         {
             "type": "Table",
             "metadata": {
                 "text_as_html": """
-<table><thead><th rowspan="2">r1c1</th><th>r1c2</th><th colspan="2">r1c3</th></tr>
-<tr><th>r2c2</th><th>r2c3</th><th>r2c4</th><</thead>
-<tbody><tr><td>r3c1</td><td>r3c2</td><td colspan="2" rowspan="2">r3c3</td></tr>
-<tr><td>r4c1</td><td>r4c2</td></tr></tbody></table>"""
+<table>
+  <thead>
+    <tr>
+      <th rowspan="2">r1c1</th>
+      <th>r1c2</th>
+      <th colspan="2">r1c3</th>
+    </tr>
+    <tr>
+      <th>r2c2</th>
+      <th>r2c3</th>
+      <th>r2c4</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td>r3c1</td>
+      <td>r3c2</td>
+      <td colspan="2" rowspan="2">r3c3</td>
+    </tr>
+    <tr>
+      <td>r4c1</td>
+      <td>r4c2</td>
+    </tr>
+  </tbody>
+</table>
+"""
             },
         }
     ]
