Skip to content

Commit 404f3c3

Browse files
add test
1 parent 9145497 commit 404f3c3

File tree

4 files changed

+67
-17
lines changed

4 files changed

+67
-17
lines changed

kensho_kenverters/CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
# Changelog
22

3+
## v1.2.3
4+
5+
Remove table validation for rows and columns so that table conversion does not fail downstream of Extract model failures
6+
37
## v1.2.2
48

59
Use underscores instead of hyphens for setuptools 78

kensho_kenverters/tables_utils.py

Lines changed: 10 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ def _create_empty_annotation(row: int, col: int) -> AnnotationModel:
2323

2424

2525
def _validate_annotations(
26-
duplicated_annotations: list[AnnotationModel],
26+
duplicated_annotations: list[AnnotationModel], max_row: int, max_col: int
2727
) -> list[AnnotationModel]:
2828
"""Validate duplicated annotations."""
2929
# Check all spans are 1 (annotations are duplicated)
@@ -36,19 +36,10 @@ def _validate_annotations(
3636
if len(set(all_indices)) != len(all_indices):
3737
raise ValueError("Overlapping indices in table.")
3838

39-
# If any empty rows or columns, add empty annotations
40-
existing_rows = set(index[0] for index in all_indices)
41-
existing_columns = set(index[1] for index in all_indices)
42-
full_rows = set(range(max(existing_rows, default=0) + 1))
43-
full_columns = set(range(max(existing_columns, default=0) + 1))
44-
45-
for row in full_rows - existing_rows:
46-
for col in full_columns:
47-
duplicated_annotations.append(_create_empty_annotation(row, col))
48-
49-
for col in full_columns - existing_columns:
50-
for row in full_rows:
51-
duplicated_annotations.append(_create_empty_annotation(row, col))
39+
for row in range(max_row + 1):
40+
for col in range(max_col + 1):
41+
if (row, col) not in all_indices:
42+
duplicated_annotations.append(_create_empty_annotation(row, col))
5243

5344
return duplicated_annotations
5445

@@ -70,6 +61,8 @@ def duplicate_spanning_annotations(
7061
duplicated annotations. Duplicated annotations must all have span (1, 1).
7162
"""
7263
duplicated_annotations = []
64+
max_row = 0
65+
max_col = 0
7366
for annotation in annotations:
7467
data = annotation.data
7568
row_span, col_span = data.span
@@ -94,8 +87,10 @@ def duplicate_spanning_annotations(
9487
locations=annotation.locations,
9588
)
9689
duplicated_annotations.append(new_annotation)
90+
max_row = max(max_row, row_index + row_span_index)
91+
max_col = max(max_col, col_index + col_span_index)
9792

98-
return _validate_annotations(duplicated_annotations)
93+
return _validate_annotations(duplicated_annotations, max_row, max_col)
9994

10095

10196
def get_table_shape(

kensho_kenverters/tests/test_tables_utils.py

Lines changed: 52 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ def test_convert_tpdle_to_pd_df(self) -> None:
5757
)
5858
self.assertEqual(expected_df.to_csv(), converted_df.to_csv())
5959

60-
def test_duplicate_spanning_annotations(self) -> None:
60+
def test_duplicate_spanning_annotations_1_spans(self) -> None:
6161
# Test no duplication when all spans are 1
6262
duplicated = duplicate_spanning_annotations(
6363
self.parsed_serialized_document.annotations
@@ -417,6 +417,7 @@ def test_duplicate_spanning_annotations(self) -> None:
417417

418418
self.assertEqual(duplicated, expected_duplicated)
419419

420+
def test_duplicate_spanning_annotations_greater_1_spans(self) -> None:
420421
# Test annotations are properly duplicated when cells have a span > 1
421422
annotations = self.parsed_serialized_document.annotations
422423
annotations.pop(-1)
@@ -775,3 +776,53 @@ def test_duplicate_spanning_annotations(self) -> None:
775776
]
776777
duplicated = duplicate_spanning_annotations(annotations)
777778
self.assertEqual(expected_duplicated, duplicated)
779+
780+
def test_duplicate_spanning_annotations_missing_cells(self) -> None:
781+
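# Test missing cells are filled with empty annotations: indices span rows 0-3 and columns 0-1, so a full 4x2 grid (8 annotations) is expected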
782+
annotations = [
783+
AnnotationModel(
784+
content_uids=["7"],
785+
data=AnnotationDataModel(index=(0, 0), span=(1, 1)),
786+
type="table_structure",
787+
locations=[
788+
LocationModel(
789+
height=0.01188,
790+
width=0.22128,
791+
x=0.16008,
792+
y=0.40464,
793+
page_number=0,
794+
)
795+
],
796+
),
797+
AnnotationModel(
798+
content_uids=["8"],
799+
data=AnnotationDataModel(index=(0, 1), span=(1, 1)),
800+
type="table_structure",
801+
locations=[
802+
LocationModel(
803+
height=0.01188,
804+
width=0.22128,
805+
x=0.16008,
806+
y=0.80464,
807+
page_number=0,
808+
)
809+
],
810+
),
811+
AnnotationModel(
812+
content_uids=["9"],
813+
data=AnnotationDataModel(index=(3, 0), span=(1, 1)),
814+
type="table_structure",
815+
locations=[
816+
LocationModel(
817+
height=0.01188,
818+
width=0.22128,
819+
x=0.7008,
820+
y=0.40464,
821+
page_number=0,
822+
)
823+
],
824+
),
825+
]
826+
827+
duplicated = duplicate_spanning_annotations(annotations)
828+
self.assertEqual(len(duplicated), 8)

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "kensho_kenverters"
3-
version = "1.2.2"
3+
version = "1.2.3"
44
description = "Extract Output Translator Tools"
55
readme = "README.md"
66
authors = ["Valerie Faucon-Morin <valerie.fauconmorin@kensho.com>"]

0 commit comments

Comments
 (0)