Skip to content

Commit 3834f7f

Browse files
Merge pull request #18 from kensho-technologies/val/remove-table-validations
Remove validations for rows/cols for tables
2 parents 98460cc + 78f1079 commit 3834f7f

File tree

4 files changed

+88
-16
lines changed

4 files changed

+88
-16
lines changed

kensho_kenverters/CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
# Changelog
22

3+
## v1.2.3
4+
5+
Remove the table validation that rejected empty rows and columns; missing cells are now filled with empty annotations so that conversion no longer fails downstream of Extract model failures
6+
37
## v1.2.2
48

59
Use underscores instead of hyphens for setuptools 78

kensho_kenverters/tables_utils.py

Lines changed: 31 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,27 @@
99
from kensho_kenverters.extract_output_models import AnnotationDataModel, AnnotationModel
1010

1111

12-
def _check_complete_set(integer_set: set[int]) -> bool:
13-
"""Check that the set of integers contains all integers between 0 and its max."""
14-
return integer_set == set(range(max(integer_set, default=-1) + 1))
12+
def _create_empty_annotation(row: int, col: int) -> AnnotationModel:
13+
"""Create an empty annotation."""
14+
return AnnotationModel(
15+
type=AnnotationType.TABLE_STRUCTURE.value,
16+
content_uids=[],
17+
data=AnnotationDataModel(
18+
span=(1, 1),
19+
index=(row, col),
20+
),
21+
locations=None,
22+
)
23+
24+
25+
def _validate_annotations(
26+
duplicated_annotations: list[AnnotationModel], max_row: int, max_col: int
27+
) -> list[AnnotationModel]:
28+
"""Validate duplicated annotations.
1529
30+
Fill with empty annotations if rows or columns are missing.
31+
"""
1632

17-
def _validate_annotations(duplicated_annotations: Sequence[AnnotationModel]) -> None:
18-
"""Validate duplicated annotations."""
1933
# Check all spans are 1 (annotations are duplicated)
2034
all_spans = [annotation.data.span for annotation in duplicated_annotations]
2135
if any(span != (1, 1) for span in all_spans):
@@ -26,13 +40,13 @@ def _validate_annotations(duplicated_annotations: Sequence[AnnotationModel]) ->
2640
if len(set(all_indices)) != len(all_indices):
2741
raise ValueError("Overlapping indices in table.")
2842

29-
# Check no empty rows or columns
30-
all_rows = set(index[0] for index in all_indices)
31-
all_columns = set(index[1] for index in all_indices)
32-
if not _check_complete_set(all_rows):
33-
raise ValueError("Empty row in table.")
34-
if not _check_complete_set(all_columns):
35-
raise ValueError("Empty column in table.")
43+
# Add any missing cells
44+
for row in range(max_row + 1):
45+
for col in range(max_col + 1):
46+
if (row, col) not in all_indices:
47+
duplicated_annotations.append(_create_empty_annotation(row, col))
48+
49+
return duplicated_annotations
3650

3751

3852
def duplicate_spanning_annotations(
@@ -52,6 +66,8 @@ def duplicate_spanning_annotations(
5266
duplicated annotations. Duplicated annotations must all have span (1, 1).
5367
"""
5468
duplicated_annotations = []
69+
max_row = 0
70+
max_col = 0
5571
for annotation in annotations:
5672
data = annotation.data
5773
row_span, col_span = data.span
@@ -76,9 +92,10 @@ def duplicate_spanning_annotations(
7692
locations=annotation.locations,
7793
)
7894
duplicated_annotations.append(new_annotation)
95+
max_row = max(max_row, row_index + row_span_index)
96+
max_col = max(max_col, col_index + col_span_index)
7997

80-
_validate_annotations(duplicated_annotations)
81-
return duplicated_annotations
98+
return _validate_annotations(duplicated_annotations, max_row, max_col)
8299

83100

84101
def get_table_shape(

kensho_kenverters/tests/test_tables_utils.py

Lines changed: 52 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ def test_convert_tpdle_to_pd_df(self) -> None:
5757
)
5858
self.assertEqual(expected_df.to_csv(), converted_df.to_csv())
5959

60-
def test_duplicate_spanning_annotations(self) -> None:
60+
def test_duplicate_spanning_annotations_1_spans(self) -> None:
6161
# Test no duplication when all spans are 1
6262
duplicated = duplicate_spanning_annotations(
6363
self.parsed_serialized_document.annotations
@@ -417,6 +417,7 @@ def test_duplicate_spanning_annotations(self) -> None:
417417

418418
self.assertEqual(duplicated, expected_duplicated)
419419

420+
def test_duplicate_spanning_annotations_greater_1_spans(self) -> None:
420421
# Test properly duplicated when cells have a span > 1
421422
annotations = self.parsed_serialized_document.annotations
422423
annotations.pop(-1)
@@ -775,3 +776,53 @@ def test_duplicate_spanning_annotations(self) -> None:
775776
]
776777
duplicated = duplicate_spanning_annotations(annotations)
777778
self.assertEqual(expected_duplicated, duplicated)
779+
780+
def test_duplicate_spanning_annotations_missing_cells(self) -> None:
781+
# Test missing cells
782+
annotations = [
783+
AnnotationModel(
784+
content_uids=["7"],
785+
data=AnnotationDataModel(index=(0, 0), span=(1, 1)),
786+
type="table_structure",
787+
locations=[
788+
LocationModel(
789+
height=0.01188,
790+
width=0.22128,
791+
x=0.16008,
792+
y=0.40464,
793+
page_number=0,
794+
)
795+
],
796+
),
797+
AnnotationModel(
798+
content_uids=["8"],
799+
data=AnnotationDataModel(index=(0, 1), span=(1, 1)),
800+
type="table_structure",
801+
locations=[
802+
LocationModel(
803+
height=0.01188,
804+
width=0.22128,
805+
x=0.16008,
806+
y=0.80464,
807+
page_number=0,
808+
)
809+
],
810+
),
811+
AnnotationModel(
812+
content_uids=["9"],
813+
data=AnnotationDataModel(index=(3, 0), span=(1, 1)),
814+
type="table_structure",
815+
locations=[
816+
LocationModel(
817+
height=0.01188,
818+
width=0.22128,
819+
x=0.7008,
820+
y=0.40464,
821+
page_number=0,
822+
)
823+
],
824+
),
825+
]
826+
827+
duplicated = duplicate_spanning_annotations(annotations)
828+
self.assertEqual(len(duplicated), 8)

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "kensho_kenverters"
3-
version = "1.2.2"
3+
version = "1.2.3"
44
description = "Extract Output Translator Tools"
55
readme = "README.md"
66
authors = ["Valerie Faucon-Morin <valerie.fauconmorin@kensho.com>"]

0 commit comments

Comments
 (0)