kensho-technologies
diff --git a/‎kensho_kenverters/CHANGELOG.md‎
Lines changed: 4 additions & 0 deletions b/‎kensho_kenverters/CHANGELOG.md‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎kensho_kenverters/constants.py‎
Lines changed: 1 addition & 0 deletions b/‎kensho_kenverters/constants.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎kensho_kenverters/convert_output.py‎
Lines changed: 70 additions & 9 deletions b/‎kensho_kenverters/convert_output.py‎
Lines changed: 70 additions & 9 deletions
diff --git a/‎kensho_kenverters/convert_output_visual_formatted.py‎
Lines changed: 13 additions & 5 deletions b/‎kensho_kenverters/convert_output_visual_formatted.py‎
Lines changed: 13 additions & 5 deletions
diff --git a/‎kensho_kenverters/output_to_sections.py‎
Lines changed: 2 additions & 2 deletions b/‎kensho_kenverters/output_to_sections.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎kensho_kenverters/output_to_tables.py‎
Lines changed: 6 additions & 5 deletions b/‎kensho_kenverters/output_to_tables.py‎
Lines changed: 6 additions & 5 deletions
diff --git a/‎kensho_kenverters/tables_utils.py‎
Lines changed: 2 additions & 2 deletions b/‎kensho_kenverters/tables_utils.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎kensho_kenverters/tests/data/extract_output_figure_extraction.json‎
Lines changed: 1 addition & 0 deletions b/‎kensho_kenverters/tests/data/extract_output_figure_extraction.json‎
Lines changed: 1 addition & 0 deletions
@@ -1,5 +1,9 @@
 # Changelog
 
+## v1.2.9
+
+* Fixed a bug in how `convert_output_to_items_list` creates segments for figure extracted tables
+
 ## v1.2.8
 
 * Fixing edge case where we want to build a table with only figure extracted table annotations or only table annotations
 
@@ -7,6 +7,7 @@
 CATEGORY_KEY = "category"
 TEXT_KEY = "text"
 TABLE_KEY = "table"
+FIGURE_EXTRACTED_TABLE_KEY = "figure_extracted_table"
 LOCATIONS_KEY = "locations"
 DOCUMENT_CATEGORY_KEY = "DOCUMENT"
 
 
@@ -5,21 +5,26 @@
 from logging import getLogger
 from typing import Any
 
-from kensho_kenverters.constants import (
+from .constants import (
     CATEGORY_KEY,
     DOCUMENT_CATEGORY_KEY,
     ELEMENT_TITLE_CONTENT_CATEGORIES,
     EMPTY_STRING,
+    FIGURE_EXTRACTED_TABLE_KEY,
     LOCATIONS_KEY,
-    TABLE_CONTENT_CATEGORIES,
     TABLE_KEY,
     TEXT_KEY,
     AnnotationType,
     ContentCategory,
     TableType,
 )
-from kensho_kenverters.extract_output_models import ContentModel, LocationModel
-from kensho_kenverters.utils import load_output_to_pydantic
+from .extract_output_models import AnnotationModel, ContentModel, LocationModel
+from .output_to_tables import (
+    build_content_grid_from_figure_extracted_table_cell_annotations,
+    get_table_uid_to_annotations_mapping,
+    get_table_uid_to_cells_mapping,
+)
+from .utils import load_output_to_pydantic
 
 logger = getLogger(__name__)
 
@@ -110,14 +115,18 @@ def _create_segment(
     content: ContentModel,
     uid_to_index: dict[str, tuple[int, int]],
     uid_to_span: dict[str, tuple[int, int]],
+    figure_extracted_table_uid_to_cell_annotations: dict[str, list[AnnotationModel]],
 ) -> dict[str, Any]:
     """Create segment dictionary from the content, and if applicable its matching table cells."""
     segment: dict[str, Any] = {}
     # DOCUMENT is just a head node
     if content.type == DOCUMENT_CATEGORY_KEY:
         return {}
     # For tables, use table cell structures read above
-    elif content.type in TABLE_CONTENT_CATEGORIES:
+    elif content.type in (
+        ContentCategory.TABLE.value,
+        ContentCategory.TABLE_OF_CONTENTS.value,
+    ):
         # Construct the table from cells
         table_cells = content.children
         # Drop tables with no cells
@@ -132,7 +141,24 @@ def _create_segment(
             TABLE_KEY: table,
             TEXT_KEY: table_to_markdown(table),
         }
-    elif content.type == ContentCategory.TABLE_CELL.value:
+    elif content.type == ContentCategory.FIGURE_EXTRACTED_TABLE.value:
+        figure_extracted_table = (
+            build_content_grid_from_figure_extracted_table_cell_annotations(
+                figure_extracted_table_uid_to_cell_annotations[content.uid]
+            )
+        )
+        # Drop tables with length 0
+        if len(figure_extracted_table) == 0:
+            return {}
+        segment = {
+            CATEGORY_KEY: content.type.lower(),
+            FIGURE_EXTRACTED_TABLE_KEY: figure_extracted_table,
+            TEXT_KEY: table_to_markdown(figure_extracted_table),
+        }
+    elif content.type in (
+        ContentCategory.TABLE_CELL.value,
+        ContentCategory.FIGURE_EXTRACTED_TABLE_CELL.value,
+    ):
         # Skip - already accounted for in tables
         return {}
     # For texts and titles, add the text content and the category
@@ -153,6 +179,7 @@ def _get_segments_from_all_children(
     content: ContentModel,
     uid_to_index: dict[str, tuple[int, int]],
     uid_to_span: dict[str, tuple[int, int]],
+    figure_extracted_table_uid_to_cell_annotations: dict[str, list[AnnotationModel]],
     return_locations: bool,
     segments: list[dict[str, Any]],
     visited: list[str],
@@ -162,7 +189,12 @@ def _get_segments_from_all_children(
         return
 
     # Get current segment from content and add to list
-    segment = _create_segment(content, uid_to_index, uid_to_span)
+    segment = _create_segment(
+        content,
+        uid_to_index,
+        uid_to_span,
+        figure_extracted_table_uid_to_cell_annotations,
+    )
     visited.append(content.uid)
     if segment:
         if return_locations:
@@ -172,7 +204,13 @@ def _get_segments_from_all_children(
     # Get all children segments
     for child in content.children:
         _get_segments_from_all_children(
-            child, uid_to_index, uid_to_span, return_locations, segments, visited
+            child,
+            uid_to_index,
+            uid_to_span,
+            figure_extracted_table_uid_to_cell_annotations,
+            return_locations,
+            segments,
+            visited,
         )
 
 
@@ -213,15 +251,38 @@ def convert_output_to_items_list(
             for uid in content_uids:
                 uid_to_index[uid] = (row, col)
                 uid_to_span[uid] = annotation.data.span
+        elif annotation.type == AnnotationType.FIGURE_EXTRACTED_TABLE_STRUCTURE.value:
+            continue
         else:
             raise TypeError(f"{annotation.type} is not a supported annotation type")
 
+    figure_extracted_table_cell_annotations = [
+        annotation
+        for annotation in annotations
+        if annotation.type == AnnotationType.FIGURE_EXTRACTED_TABLE_STRUCTURE.value
+    ]
+    table_uid_to_cells_mapping = get_table_uid_to_cells_mapping(
+        parsed_serialized_document.content_tree
+    )
+    figure_extracted_table_uid_to_cell_annotations = (
+        get_table_uid_to_annotations_mapping(
+            table_uid_to_cells_mapping,
+            figure_extracted_table_cell_annotations,
+        )
+    )
+
     # Parse content into segments
     content_tree = parsed_serialized_document.content_tree
     segments: list[dict[str, Any]] = []
     visited: list[str] = []
     _get_segments_from_all_children(
-        content_tree, uid_to_index, uid_to_span, return_locations, segments, visited
+        content_tree,
+        uid_to_index,
+        uid_to_span,
+        figure_extracted_table_uid_to_cell_annotations,
+        return_locations,
+        segments,
+        visited,
     )
     return segments
 
 
@@ -8,16 +8,15 @@
 from logging import getLogger
 from typing import Any, TypeAlias
 
-from kensho_kenverters.constants import (
+from .constants import (
     EMPTY_STRING,
     LOCATIONS_KEY,
-    TABLE_CONTENT_CATEGORIES,
     TEXT_KEY,
     AnnotationType,
     ContentCategory,
 )
-from kensho_kenverters.extract_output_models import ContentModel, LocationModel
-from kensho_kenverters.utils import load_output_to_pydantic
+from .extract_output_models import ContentModel, LocationModel
+from .utils import load_output_to_pydantic
 
 logger = getLogger(__name__)
 
@@ -68,6 +67,9 @@ def _convert_output_to_texts_with_locs(
             content_uids = annotation.content_uids  # a list
             for uid in content_uids:
                 uid_to_location[uid] = annotation.locations
+        # For visual formatting, we don't want to include figure extracted tables
+        elif annotation.type == AnnotationType.FIGURE_EXTRACTED_TABLE_STRUCTURE.value:
+            continue
         else:
             raise TypeError(f"{annotation.type} is not a supported annotation type")
 
@@ -76,7 +78,10 @@ def _convert_output_to_texts_with_locs(
     segments: list[dict[str, Any]] = []
     for content in content_tree.children:
         # For tables, use table cell structures read above
-        if content.type in TABLE_CONTENT_CATEGORIES:
+        if content.type in (
+            ContentCategory.TABLE.value,
+            ContentCategory.TABLE_OF_CONTENTS.value,
+        ):
             # Construct the table from cells
             table_cells = content.children
             # Drop tables with no cells
@@ -86,6 +91,9 @@ def _convert_output_to_texts_with_locs(
                 table_cells, uid_to_location
             )
             segments += table_cell_segments
+        # For visual formatting, we don't want to include figure extracted tables
+        elif content.type == ContentCategory.FIGURE_EXTRACTED_TABLE.value:
+            continue
         elif content.type in [e.value for e in ContentCategory]:
             segment: dict[str, Any] = {
                 TEXT_KEY: content.content or EMPTY_STRING,
 
@@ -3,8 +3,8 @@
 
 from typing import Any
 
-from kensho_kenverters.constants import CATEGORY_KEY, ContentCategory
-from kensho_kenverters.convert_output import convert_output_to_items_list
+from .constants import CATEGORY_KEY, ContentCategory
+from .convert_output import convert_output_to_items_list
 
 
 def extract_organized_sections(
 
@@ -7,26 +7,27 @@
 
 import pandas as pd
 
-from kensho_kenverters.constants import (
+from .constants import (
     EMPTY_STRING,
     TABLE_CONTENT_CATEGORIES,
     AnnotationType,
     ContentCategory,
+    TableType,
 )
-from kensho_kenverters.extract_output_models import (
+from .extract_output_models import (
     AnnotationModel,
     ContentModel,
     LocationModel,
     LocationType,
     Table,
     TableCategoryType,
 )
-from kensho_kenverters.tables_utils import (
+from .tables_utils import (
     convert_table_to_pd_df,
     duplicate_spanning_annotations,
     get_table_shape,
 )
-from kensho_kenverters.utils import load_output_to_pydantic
+from .utils import load_output_to_pydantic
 
 
 def get_table_uid_to_cells_mapping(
@@ -146,7 +147,7 @@ def build_uids_grid_from_table_cell_annotations(
 
 def build_content_grid_from_figure_extracted_table_cell_annotations(
     annotations: Sequence[AnnotationModel],
-) -> list[list[str]]:
+) -> TableType:
     """Build content grid where each location has a string of content."""
     if any(
         annotation.type != AnnotationType.FIGURE_EXTRACTED_TABLE_STRUCTURE.value
 
@@ -5,8 +5,8 @@
 
 import pandas as pd
 
-from kensho_kenverters.constants import AnnotationType
-from kensho_kenverters.extract_output_models import AnnotationDataModel, AnnotationModel
+from .constants import AnnotationType
+from .extract_output_models import AnnotationDataModel, AnnotationModel
 
 
 def _create_empty_annotation(row: int, col: int) -> AnnotationModel: