Merge pull request #24 from kensho-technologies/val/add-figure-types

valerie-fauconmorin-kensho · web-flow · commit efa11b1a98c1 · 2025-04-24T13:32:40.000-04:00
Add Figure ContentCategory
diff --git a/kensho_kenverters/CHANGELOG.md b/kensho_kenverters/CHANGELOG.md
@@ -1,5 +1,8 @@
 # Changelog
 
+## v1.2.5
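+* Add handling for the FIGURE content category in Extract output: figures have no text content, so they are converted with empty text and omitted from the string and markdown outputs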
+
 ## v1.2.4
 
 * Add conversion from a given table annotation to grid, finding the first and last associated text object
diff --git a/kensho_kenverters/constants.py b/kensho_kenverters/constants.py
@@ -52,8 +52,10 @@ class ContentCategory(Enum):
     PAGE_FOOTNOTE = "PAGE_FOOTNOTE"
     TABLE_OF_CONTENTS = "TABLE_OF_CONTENTS"
     TABLE_OF_CONTENTS_TITLE = "TABLE_OF_CONTENTS_TITLE"
+    # Figure types
     FIGURE_EXTRACTED_TABLE = "FIGURE_EXTRACTED_TABLE"
     FIGURE_EXTRACTED_TABLE_CELL = "FIGURE_EXTRACTED_TABLE_CELL"
+    FIGURE = "FIGURE"
 
 
 ELEMENT_TITLE_CONTENT_CATEGORIES = {
@@ -78,3 +80,9 @@ class ContentCategory(Enum):
     ContentCategory.TABLE_OF_CONTENTS.value,
     ContentCategory.FIGURE_EXTRACTED_TABLE.value,
 }
+
+FIGURE_CONTENT_CATEGORIES = {
+    ContentCategory.FIGURE.value,
+}
+
+EMPTY_STRING = ""
diff --git a/kensho_kenverters/convert_output.py b/kensho_kenverters/convert_output.py
@@ -9,6 +9,7 @@
     CATEGORY_KEY,
     DOCUMENT_CATEGORY_KEY,
     ELEMENT_TITLE_CONTENT_CATEGORIES,
+    EMPTY_STRING,
     LOCATIONS_KEY,
     TABLE_CONTENT_CATEGORIES,
     TABLE_KEY,
@@ -138,7 +139,7 @@ def _create_segment(
     elif content.type in [e.value for e in ContentCategory]:
         segment = {
             CATEGORY_KEY: content.type.lower(),
-            TEXT_KEY: content.content,
+            TEXT_KEY: content.content or EMPTY_STRING,
         }
     else:
         raise TypeError(
@@ -235,7 +236,7 @@ def convert_output_to_str(serialized_document: dict[str, Any]) -> str:
         full text string of the document with markdown-style tables using | as a delimiter
     """
     document_items = convert_output_to_items_list(serialized_document)
-    return "\n".join(item[TEXT_KEY] for item in document_items)
+    return "\n".join(item[TEXT_KEY] for item in document_items if item[TEXT_KEY])
 
 
 def convert_output_to_str_by_page(serialized_document: dict[str, Any]) -> list[str]:
@@ -285,6 +286,9 @@ def convert_output_to_markdown(serialized_document: dict[str, Any]) -> str:
     document_items = convert_output_to_items_list(serialized_document)
     item_texts = []
     for item in document_items:
+        # Some content categories (e.g. figures) have no text; skip those items
+        if not item[TEXT_KEY]:
+            continue
         item_text = _get_markdown_text(item)
         item_texts.append(item_text)
     return "\n".join(item_texts)
diff --git a/kensho_kenverters/convert_output_visual_formatted.py b/kensho_kenverters/convert_output_visual_formatted.py
@@ -9,6 +9,7 @@
 from typing import Any, TypeAlias
 
 from kensho_kenverters.constants import (
+    EMPTY_STRING,
     LOCATIONS_KEY,
     TABLE_CONTENT_CATEGORIES,
     TEXT_KEY,
@@ -87,7 +88,7 @@ def _convert_output_to_texts_with_locs(
             segments += table_cell_segments
         elif content.type in [e.value for e in ContentCategory]:
             segment: dict[str, Any] = {
-                TEXT_KEY: content.content,
+                TEXT_KEY: content.content or EMPTY_STRING,
                 LOCATIONS_KEY: content.locations,
             }
             segments.append(segment)
diff --git a/kensho_kenverters/output_to_tables.py b/kensho_kenverters/output_to_tables.py
@@ -8,6 +8,7 @@
 import pandas as pd
 
 from kensho_kenverters.constants import (
+    EMPTY_STRING,
     TABLE_CONTENT_CATEGORIES,
     AnnotationType,
     ContentCategory,
@@ -181,7 +182,7 @@ def convert_uid_grid_to_content_grid(
     uid_grid: list[list[list[str]]], cell_contents: Sequence[ContentModel]
 ) -> list[list[str]]:
     """Convert a UID grid to content grid."""
-    uids_to_content = {cell.uid: cell.content for cell in cell_contents}
+    uids_to_content = {cell.uid: cell.content or EMPTY_STRING for cell in cell_contents}
 
     content_grid = []
     for uid_row in uid_grid:
diff --git a/kensho_kenverters/tests/data/extract_output.json b/kensho_kenverters/tests/data/extract_output.json
diff --git a/kensho_kenverters/tests/test_convert_output.py b/kensho_kenverters/tests/test_convert_output.py
@@ -116,6 +116,7 @@ def test_convert_output_to_items(self) -> None:
                 " In its application across business problems, machine learning is also referred "
                 "to as predictive analytics.",
             },
+            {"category": "figure", "text": ""},
             {"category": "title", "text": "Recommendation: BUY"},
             {"category": "text", "text": "42"},
             {"category": "text", "text": "test noise string at bottom"},
@@ -252,6 +253,19 @@ def test_convert_output_to_items(self) -> None:
                     )
                 ],
             },
+            {
+                "category": "figure",
+                "text": "",
+                "locations": [
+                    LocationModel(
+                        height=0.01425,
+                        width=0.21622,
+                        x=0.60002,
+                        y=0.8388,
+                        page_number=0,
+                    )
+                ],
+            },
             {
                 "category": "title",
                 "text": "Recommendation: BUY",
diff --git a/kensho_kenverters/tests/test_output_to_sections.py b/kensho_kenverters/tests/test_output_to_sections.py
@@ -99,6 +99,7 @@ def test_extract_organized_sections(self) -> None:
                         "hine learning is also referred to as predictive analytics."
                     ),
                 },
+                {"category": "figure", "text": ""},
             ],
             [
                 {"category": "title", "text": "Recommendation: BUY"},
@@ -959,6 +960,21 @@ def test_extract_organized_sections(self) -> None:
                         "type": "TEXT",
                         "uid": "32",
                     },
+                    {
+                        "children": [],
+                        "content": None,
+                        "locations": [
+                            {
+                                "height": 0.01425,
+                                "page_number": 0,
+                                "width": 0.21622,
+                                "x": 0.60002,
+                                "y": 0.8388,
+                            }
+                        ],
+                        "type": "FIGURE",
+                        "uid": "33",
+                    },
                     {
                         "children": [],
                         "content": "Recommendation: BUY",
@@ -972,7 +988,7 @@ def test_extract_organized_sections(self) -> None:
                             }
                         ],
                         "type": "TITLE",
-                        "uid": "33",
+                        "uid": "34",
                     },
                     {
                         "children": [],
@@ -987,7 +1003,7 @@ def test_extract_organized_sections(self) -> None:
                             }
                         ],
                         "type": "TEXT",
-                        "uid": "34",
+                        "uid": "35",
                     },
                     {
                         "children": [],
@@ -1002,7 +1018,7 @@ def test_extract_organized_sections(self) -> None:
                             }
                         ],
                         "type": "TEXT",
-                        "uid": "35",
+                        "uid": "36",
                     },
                     {
                         "children": [],
@@ -1017,7 +1033,7 @@ def test_extract_organized_sections(self) -> None:
                             }
                         ],
                         "type": "TEXT",
-                        "uid": "36",
+                        "uid": "37",
                     },
                 ],
                 "content": None,
@@ -1878,6 +1894,21 @@ def test_extract_organized_sections(self) -> None:
                         "type": "TEXT",
                         "uid": "32",
                     },
+                    {
+                        "children": [],
+                        "content": None,
+                        "locations": [
+                            {
+                                "height": 0.01425,
+                                "page_number": 0,
+                                "width": 0.21622,
+                                "x": 0.60002,
+                                "y": 0.8388,
+                            }
+                        ],
+                        "type": "FIGURE",
+                        "uid": "33",
+                    },
                     {
                         "children": [],
                         "content": "Recommendation: BUY",
@@ -1891,7 +1922,7 @@ def test_extract_organized_sections(self) -> None:
                             }
                         ],
                         "type": "TITLE",
-                        "uid": "33",
+                        "uid": "34",
                     },
                 ],
                 "content": None,
@@ -1975,6 +2006,7 @@ def test_extract_organized_sections(self) -> None:
                         "hine learning is also referred to as predictive analytics."
                     ),
                 },
+                {"category": "figure", "text": ""},
             ],
             [{"category": "title", "text": "Recommendation: BUY"}],
         ]
diff --git a/kensho_kenverters/tests/test_output_to_tables.py b/kensho_kenverters/tests/test_output_to_tables.py
@@ -1878,6 +1878,21 @@ def test_build_table_grids_figure_extracted_table_structure(self) -> None:
                             "type": "FIGURE_EXTRACTED_TABLE",
                             "uid": "35",
                         },
+                        {
+                            "children": [],
+                            "content": None,
+                            "locations": [
+                                {
+                                    "height": 0.12461,
+                                    "page_number": 0,
+                                    "width": 0.34248,
+                                    "x": 0.50986,
+                                    "y": 0.34171,
+                                }
+                            ],
+                            "type": "FIGURE",
+                            "uid": "599",
+                        },
                         {
                             "children": [],
                             "content": "789",
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "kensho_kenverters"
-version = "1.2.4"
+version = "1.2.5"
 description = "Extract Output Translator Tools"
 readme = "README.md"
 authors = ["Valerie Faucon-Morin <valerie.fauconmorin@kensho.com>"]