Unstructured-IO · badGarnet · Jan 22, 2025 · Jan 22, 2025 · Jan 22, 2025
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,6 +1,10 @@
+## 0.8.6
+
+* feat: add back `source` to `TextRegions` and `LayoutElements` for backward compatibility
+
 ## 0.8.5
 
-* fix: remove `pdfplumber` but include `pdfminer-six==20240706` to update `pdfminer` 
+* fix: remove `pdfplumber` but include `pdfminer-six==20240706` to update `pdfminer`
 
 ## 0.8.4
 

diff --git a/test_unstructured_inference/test_elements.py b/test_unstructured_inference/test_elements.py
@@ -61,7 +61,7 @@ def test_layoutelements():
         element_coords=coords,
         element_class_ids=element_class_ids,
         element_class_id_map=class_map,
-        sources=np.array(["yolox"] * len(element_class_ids)),
+        source="yolox",
     )
 
 
@@ -441,20 +441,22 @@ def test_layoutelements_to_list_and_back(test_layoutelements):
 def test_layoutelements_from_list_no_elements():
     back = LayoutElements.from_list(elements=[])
     assert back.sources.size == 0
+    assert back.source is None
     assert back.element_coords.size == 0
 
 
 def test_textregions_from_list_no_elements():
     back = TextRegions.from_list(regions=[])
     assert back.sources.size == 0
+    assert back.source is None
     assert back.element_coords.size == 0
 
 
 def test_layoutelements_concatenate():
     layout1 = LayoutElements(
         element_coords=np.array([[0, 0, 1, 1], [1, 1, 2, 2]]),
         texts=np.array(["a", "two"]),
-        sources=np.array(["yolox", "yolox"]),
+        source="yolox",
         element_class_ids=np.array([0, 1]),
         element_class_id_map={0: "type0", 1: "type1"},
     )

diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py
@@ -1 +1 @@
-__version__ = "0.8.5"  # pragma: no cover
+__version__ = "0.8.6"  # pragma: no cover
diff --git a/unstructured_inference/inference/elements.py b/unstructured_inference/inference/elements.py
@@ -211,11 +211,21 @@ class TextRegions:
     element_coords: np.ndarray
     texts: np.ndarray = field(default_factory=lambda: np.array([]))
     sources: np.ndarray = field(default_factory=lambda: np.array([]))
+    source: Source | None = None
 
     def __post_init__(self):
         if self.texts.size == 0 and self.element_coords.size > 0:
             self.texts = np.array([None] * self.element_coords.shape[0])
 
+        # for backward compatibility; also lets one `source` value fill `sources` for all regions
+        if self.sources.size == 0 and self.element_coords.size > 0:
+            self.sources = np.array([self.source] * self.element_coords.shape[0])
+        elif self.source is None and self.sources.size:
+            self.source = self.sources[0]
+
+        # we convert to float so data type is more consistent (e.g., None will be np.nan)
+        self.element_coords = self.element_coords.astype(float)
+
     def slice(self, indices) -> TextRegions:
         """slice text regions based on indices"""
         return TextRegions(

diff --git a/unstructured_inference/inference/layoutelement.py b/unstructured_inference/inference/layoutelement.py
@@ -41,13 +41,18 @@ def __post_init__(self):
             "element_probs",
             "element_class_ids",
             "texts",
-            "sources",
             "text_as_html",
             "table_as_cells",
         ):
             if getattr(self, attr).size == 0 and element_size:
                 setattr(self, attr, np.array([None] * element_size))
 
+        # for backward compatibility; also lets one `source` value fill `sources` for all regions
+        if self.sources.size == 0 and self.element_coords.size > 0:
+            self.sources = np.array([self.source] * self.element_coords.shape[0])
+        elif self.source is None and self.sources.size:
+            self.source = self.sources[0]
+
         self.element_probs = self.element_probs.astype(float)
 
     def __eq__(self, other: object) -> bool:
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		-__version__ = "0.8.5" # pragma: no cover
		+__version__ = "0.8.6" # pragma: no cover