add protection to division by 0 (#193)

badGarnet · web-flow · commit e931bef4d952 · 2023-09-03T10:48:57.000-05:00
This PR resolves #192 by adding a helper function that performs the division `a/b`, replacing `b` with the machine epsilon for `float` when `b == 0` to avoid division by 0.

## Further discussion

Do we also want the division to be type-conscious, i.e., not auto-convert integers to float, so that int/int yields an int?
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,8 @@
+## 0.5.21
+
+* adds `safe_division` to replace 0 with machine epsilon for `float` to avoid division by 0
+* applies `safe_division` to area overlap calculations in `unstructured_inference/inference/elements.py`
+
 ## 0.5.20
 
 * Adds YoloX quantized model
diff --git a/test_unstructured_inference/test_math.py b/test_unstructured_inference/test_math.py
@@ -0,0 +1,12 @@
+import numpy as np
+import pytest
+
+from unstructured_inference.math import FLOAT_EPSILON, safe_division
+
+
+@pytest.mark.parametrize(
+    ("a", "b", "expected"),
+    [(0, 0, 0), (0, 1, 0), (1, 0, np.round(1 / FLOAT_EPSILON, 1)), (2, 3, 0.7)],
+)
+def test_safe_division(a, b, expected):
+    assert np.round(safe_division(a, b), 1) == expected  # compared after rounding to 1 decimal
diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py
@@ -1 +1 @@
-__version__ = "0.5.20"  # pragma: no cover
+__version__ = "0.5.21"  # pragma: no cover
diff --git a/unstructured_inference/inference/elements.py b/unstructured_inference/inference/elements.py
@@ -11,6 +11,7 @@
 from scipy.sparse.csgraph import connected_components
 
 from unstructured_inference.logger import logger
+from unstructured_inference.math import safe_division
 from unstructured_inference.models import tesseract
 
 # When extending the boundaries of a PDF object for the purpose of determining which other elements
@@ -118,7 +119,7 @@ def intersection_over_union(self, other: Rectangle) -> float:
         intersection = self.intersection(other)
         intersection_area = 0.0 if intersection is None else intersection.area()
         union_area = self.area() + other.area() - intersection_area
-        return intersection_area / union_area
+        return safe_division(intersection_area, union_area)
 
     def intersection_over_minimum(self, other: Rectangle) -> float:
         """Gives the area-of-intersection over the minimum of the areas of the rectangles. Useful
@@ -127,15 +128,15 @@ def intersection_over_minimum(self, other: Rectangle) -> float:
         intersection = self.intersection(other)
         intersection_area = 0.0 if intersection is None else intersection.area()
         min_area = min(self.area(), other.area())
-        return intersection_area / min_area
+        return safe_division(intersection_area, min_area)
 
     def is_almost_subregion_of(self, other: Rectangle, subregion_threshold: float = 0.75) -> bool:
         """Returns whether this region is almost a subregion of other. This is determined by
         comparing the intersection area over self area to some threshold, and checking whether self
         is the smaller rectangle."""
         intersection = self.intersection(other)
         intersection_area = 0.0 if intersection is None else intersection.area()
-        return (subregion_threshold < intersection_area / self.area()) and (
+        return (subregion_threshold < safe_division(intersection_area, self.area())) and (
             self.area() <= other.area()
         )
 
diff --git a/unstructured_inference/math.py b/unstructured_inference/math.py
@@ -0,0 +1,20 @@
+"""a lightweight module that provides helpers to common math operations"""
+
+import numpy as np
+
+FLOAT_EPSILON = np.finfo(float).eps
+
+
+def safe_division(a, b) -> float:
+    """Divide a by b, using FLOAT_EPSILON (about 2.2E-16) as divisor when b == 0.
+
+    Any nonzero b, including negative or very small values, is used unchanged.
+
+    Parameters:
+    - a (int/float): a in a/b
+    - b (int/float): b in a/b
+
+    Returns:
+    float: a/b, or a/FLOAT_EPSILON when b == 0
+    """
+    return a / (b if b != 0 else FLOAT_EPSILON)

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.5.20" # pragma: no cover`
	`1`	`+__version__ = "0.5.21" # pragma: no cover`