Skip to content

Commit efa11b1

Browse files
Merge pull request #24 from kensho-technologies/val/add-figure-types
Add Figure ContentCategory
2 parents 18748a7 + 9cd8eb7 commit efa11b1

File tree

10 files changed

+89
-11
lines changed

10 files changed

+89
-11
lines changed

kensho_kenverters/CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
# Changelog
22

3+
## v1.2.5
4+
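* Add handling for the FIGURE content category in Extract output: figures have no text content, so their text is converted to an empty string and they are omitted from the plain-string and markdown conversions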
5+
36
## v1.2.4
47

58
* Add conversion from a given table annotation to grid, finding the first and last associated text object

kensho_kenverters/constants.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,8 +52,10 @@ class ContentCategory(Enum):
5252
PAGE_FOOTNOTE = "PAGE_FOOTNOTE"
5353
TABLE_OF_CONTENTS = "TABLE_OF_CONTENTS"
5454
TABLE_OF_CONTENTS_TITLE = "TABLE_OF_CONTENTS_TITLE"
55+
# Figure types
5556
FIGURE_EXTRACTED_TABLE = "FIGURE_EXTRACTED_TABLE"
5657
FIGURE_EXTRACTED_TABLE_CELL = "FIGURE_EXTRACTED_TABLE_CELL"
58+
FIGURE = "FIGURE"
5759

5860

5961
ELEMENT_TITLE_CONTENT_CATEGORIES = {
@@ -78,3 +80,9 @@ class ContentCategory(Enum):
7880
ContentCategory.TABLE_OF_CONTENTS.value,
7981
ContentCategory.FIGURE_EXTRACTED_TABLE.value,
8082
}
83+
84+
FIGURE_CONTENT_CATEGORIES = {
85+
ContentCategory.FIGURE.value,
86+
}
87+
88+
EMPTY_STRING = ""

kensho_kenverters/convert_output.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
CATEGORY_KEY,
1010
DOCUMENT_CATEGORY_KEY,
1111
ELEMENT_TITLE_CONTENT_CATEGORIES,
12+
EMPTY_STRING,
1213
LOCATIONS_KEY,
1314
TABLE_CONTENT_CATEGORIES,
1415
TABLE_KEY,
@@ -138,7 +139,7 @@ def _create_segment(
138139
elif content.type in [e.value for e in ContentCategory]:
139140
segment = {
140141
CATEGORY_KEY: content.type.lower(),
141-
TEXT_KEY: content.content,
142+
TEXT_KEY: content.content or EMPTY_STRING,
142143
}
143144
else:
144145
raise TypeError(
@@ -235,7 +236,7 @@ def convert_output_to_str(serialized_document: dict[str, Any]) -> str:
235236
full text string of the document with markdown-style tables using | as a delimiter
236237
"""
237238
document_items = convert_output_to_items_list(serialized_document)
238-
return "\n".join(item[TEXT_KEY] for item in document_items)
239+
return "\n".join(item[TEXT_KEY] for item in document_items if item[TEXT_KEY])
239240

240241

241242
def convert_output_to_str_by_page(serialized_document: dict[str, Any]) -> list[str]:
@@ -285,6 +286,9 @@ def convert_output_to_markdown(serialized_document: dict[str, Any]) -> str:
285286
document_items = convert_output_to_items_list(serialized_document)
286287
item_texts = []
287288
for item in document_items:
289+
# Some types like figures don't have content
290+
if not item[TEXT_KEY]:
291+
continue
288292
item_text = _get_markdown_text(item)
289293
item_texts.append(item_text)
290294
return "\n".join(item_texts)

kensho_kenverters/convert_output_visual_formatted.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from typing import Any, TypeAlias
1010

1111
from kensho_kenverters.constants import (
12+
EMPTY_STRING,
1213
LOCATIONS_KEY,
1314
TABLE_CONTENT_CATEGORIES,
1415
TEXT_KEY,
@@ -87,7 +88,7 @@ def _convert_output_to_texts_with_locs(
8788
segments += table_cell_segments
8889
elif content.type in [e.value for e in ContentCategory]:
8990
segment: dict[str, Any] = {
90-
TEXT_KEY: content.content,
91+
TEXT_KEY: content.content or EMPTY_STRING,
9192
LOCATIONS_KEY: content.locations,
9293
}
9394
segments.append(segment)

kensho_kenverters/output_to_tables.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import pandas as pd
99

1010
from kensho_kenverters.constants import (
11+
EMPTY_STRING,
1112
TABLE_CONTENT_CATEGORIES,
1213
AnnotationType,
1314
ContentCategory,
@@ -181,7 +182,7 @@ def convert_uid_grid_to_content_grid(
181182
uid_grid: list[list[list[str]]], cell_contents: Sequence[ContentModel]
182183
) -> list[list[str]]:
183184
"""Convert a UID grid to content grid."""
184-
uids_to_content = {cell.uid: cell.content for cell in cell_contents}
185+
uids_to_content = {cell.uid: cell.content or EMPTY_STRING for cell in cell_contents}
185186

186187
content_grid = []
187188
for uid_row in uid_grid:

kensho_kenverters/tests/data/extract_output.json

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

kensho_kenverters/tests/test_convert_output.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,7 @@ def test_convert_output_to_items(self) -> None:
116116
" In its application across business problems, machine learning is also referred "
117117
"to as predictive analytics.",
118118
},
119+
{"category": "figure", "text": ""},
119120
{"category": "title", "text": "Recommendation: BUY"},
120121
{"category": "text", "text": "42"},
121122
{"category": "text", "text": "test noise string at bottom"},
@@ -252,6 +253,19 @@ def test_convert_output_to_items(self) -> None:
252253
)
253254
],
254255
},
256+
{
257+
"category": "figure",
258+
"text": "",
259+
"locations": [
260+
LocationModel(
261+
height=0.01425,
262+
width=0.21622,
263+
x=0.60002,
264+
y=0.8388,
265+
page_number=0,
266+
)
267+
],
268+
},
255269
{
256270
"category": "title",
257271
"text": "Recommendation: BUY",

kensho_kenverters/tests/test_output_to_sections.py

Lines changed: 37 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,7 @@ def test_extract_organized_sections(self) -> None:
9999
"hine learning is also referred to as predictive analytics."
100100
),
101101
},
102+
{"category": "figure", "text": ""},
102103
],
103104
[
104105
{"category": "title", "text": "Recommendation: BUY"},
@@ -959,6 +960,21 @@ def test_extract_organized_sections(self) -> None:
959960
"type": "TEXT",
960961
"uid": "32",
961962
},
963+
{
964+
"children": [],
965+
"content": None,
966+
"locations": [
967+
{
968+
"height": 0.01425,
969+
"page_number": 0,
970+
"width": 0.21622,
971+
"x": 0.60002,
972+
"y": 0.8388,
973+
}
974+
],
975+
"type": "FIGURE",
976+
"uid": "33",
977+
},
962978
{
963979
"children": [],
964980
"content": "Recommendation: BUY",
@@ -972,7 +988,7 @@ def test_extract_organized_sections(self) -> None:
972988
}
973989
],
974990
"type": "TITLE",
975-
"uid": "33",
991+
"uid": "34",
976992
},
977993
{
978994
"children": [],
@@ -987,7 +1003,7 @@ def test_extract_organized_sections(self) -> None:
9871003
}
9881004
],
9891005
"type": "TEXT",
990-
"uid": "34",
1006+
"uid": "35",
9911007
},
9921008
{
9931009
"children": [],
@@ -1002,7 +1018,7 @@ def test_extract_organized_sections(self) -> None:
10021018
}
10031019
],
10041020
"type": "TEXT",
1005-
"uid": "35",
1021+
"uid": "36",
10061022
},
10071023
{
10081024
"children": [],
@@ -1017,7 +1033,7 @@ def test_extract_organized_sections(self) -> None:
10171033
}
10181034
],
10191035
"type": "TEXT",
1020-
"uid": "36",
1036+
"uid": "37",
10211037
},
10221038
],
10231039
"content": None,
@@ -1878,6 +1894,21 @@ def test_extract_organized_sections(self) -> None:
18781894
"type": "TEXT",
18791895
"uid": "32",
18801896
},
1897+
{
1898+
"children": [],
1899+
"content": None,
1900+
"locations": [
1901+
{
1902+
"height": 0.01425,
1903+
"page_number": 0,
1904+
"width": 0.21622,
1905+
"x": 0.60002,
1906+
"y": 0.8388,
1907+
}
1908+
],
1909+
"type": "FIGURE",
1910+
"uid": "33",
1911+
},
18811912
{
18821913
"children": [],
18831914
"content": "Recommendation: BUY",
@@ -1891,7 +1922,7 @@ def test_extract_organized_sections(self) -> None:
18911922
}
18921923
],
18931924
"type": "TITLE",
1894-
"uid": "33",
1925+
"uid": "34",
18951926
},
18961927
],
18971928
"content": None,
@@ -1975,6 +2006,7 @@ def test_extract_organized_sections(self) -> None:
19752006
"hine learning is also referred to as predictive analytics."
19762007
),
19772008
},
2009+
{"category": "figure", "text": ""},
19782010
],
19792011
[{"category": "title", "text": "Recommendation: BUY"}],
19802012
]

kensho_kenverters/tests/test_output_to_tables.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1878,6 +1878,21 @@ def test_build_table_grids_figure_extracted_table_structure(self) -> None:
18781878
"type": "FIGURE_EXTRACTED_TABLE",
18791879
"uid": "35",
18801880
},
1881+
{
1882+
"children": [],
1883+
"content": None,
1884+
"locations": [
1885+
{
1886+
"height": 0.12461,
1887+
"page_number": 0,
1888+
"width": 0.34248,
1889+
"x": 0.50986,
1890+
"y": 0.34171,
1891+
}
1892+
],
1893+
"type": "FIGURE",
1894+
"uid": "599",
1895+
},
18811896
{
18821897
"children": [],
18831898
"content": "789",

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "kensho_kenverters"
3-
version = "1.2.4"
3+
version = "1.2.5"
44
description = "Extract Output Translator Tools"
55
readme = "README.md"
66
authors = ["Valerie Faucon-Morin <valerie.fauconmorin@kensho.com>"]

0 commit comments

Comments
 (0)