Skip to content

Commit e931bef

Browse files
authored
add protection to division by 0 (#193)
This PR resolves #192 by adding a helper function that performs the division `a/b` and replaces `b` with the machine epsilon for float when `b == 0`, to avoid division by 0. ## further discussion Do we also want the division to be type conscious (i.e., not auto-convert integers into floats, so that int/int yields an int as output)?
1 parent 351dbcf commit e931bef

File tree

5 files changed

+42
-4
lines changed

5 files changed

+42
-4
lines changed

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,8 @@
1+
## 0.5.21
2+
3+
* adds `safe_division` to replace 0 with machine epsilon for `float` to avoid division by 0
4+
* apply `safe_division` to area overlap calculations in `unstructured_inference/inference/elements.py`
5+
16
## 0.5.20
27

38
* Adds YoloX quantized model
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
import numpy as np
2+
import pytest
3+
4+
from unstructured_inference.math import FLOAT_EPSILON, safe_division
5+
6+
7+
@pytest.mark.parametrize(
    ("a", "b", "expected"),
    [
        (0, 0, 0),
        (0, 1, 0),
        (1, 0, np.round(1 / FLOAT_EPSILON, 1)),
        (2, 3, 0.7),
    ],
)
def test_safe_division(a, b, expected):
    """Check safe_division against expected quotients, compared after rounding to one decimal.

    The cases cover a zero numerator, a zero divisor (expected to match division by
    FLOAT_EPSILON), and an ordinary non-zero division.
    """
    quotient = safe_division(a, b)
    assert np.round(quotient, 1) == expected
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.5.20" # pragma: no cover
1+
__version__ = "0.5.21" # pragma: no cover

unstructured_inference/inference/elements.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from scipy.sparse.csgraph import connected_components
1212

1313
from unstructured_inference.logger import logger
14+
from unstructured_inference.math import safe_division
1415
from unstructured_inference.models import tesseract
1516

1617
# When extending the boundaries of a PDF object for the purpose of determining which other elements
@@ -118,7 +119,7 @@ def intersection_over_union(self, other: Rectangle) -> float:
118119
intersection = self.intersection(other)
119120
intersection_area = 0.0 if intersection is None else intersection.area()
120121
union_area = self.area() + other.area() - intersection_area
121-
return intersection_area / union_area
122+
return safe_division(intersection_area, union_area)
122123

123124
def intersection_over_minimum(self, other: Rectangle) -> float:
124125
"""Gives the area-of-intersection over the minimum of the areas of the rectangles. Useful
@@ -127,15 +128,15 @@ def intersection_over_minimum(self, other: Rectangle) -> float:
127128
intersection = self.intersection(other)
128129
intersection_area = 0.0 if intersection is None else intersection.area()
129130
min_area = min(self.area(), other.area())
130-
return intersection_area / min_area
131+
return safe_division(intersection_area, min_area)
131132

132133
def is_almost_subregion_of(self, other: Rectangle, subregion_threshold: float = 0.75) -> bool:
133134
"""Returns whether this region is almost a subregion of other. This is determined by
134135
comparing the intersection area over self area to some threshold, and checking whether self
135136
is the smaller rectangle."""
136137
intersection = self.intersection(other)
137138
intersection_area = 0.0 if intersection is None else intersection.area()
138-
return (subregion_threshold < intersection_area / self.area()) and (
139+
return (subregion_threshold < safe_division(intersection_area, self.area())) and (
139140
self.area() <= other.area()
140141
)
141142

unstructured_inference/math.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
"""a lightweight module that provides helpers to common math operations"""
2+
3+
import numpy as np
4+
5+
# Machine epsilon for Python floats (about 2.2E-16); used as the smallest allowed divisor magnitude.
FLOAT_EPSILON = np.finfo(float).eps


def safe_division(a, b) -> float:
    """a safer division that avoids dividing by zero (or by a vanishingly small number)

    returns a/b, except when the magnitude of b is below FLOAT_EPSILON (about 2.2E-16); in that
    case b is replaced by FLOAT_EPSILON carrying the sign of b (positive when b == 0). Divisors
    whose magnitude is at least FLOAT_EPSILON, including negative ones, are used unchanged.

    Parameters:
    - a (int/float): a in a/b
    - b (int/float): b in a/b

    Returns:
    float: a/b, or a divided by +/-FLOAT_EPSILON when abs(b) < FLOAT_EPSILON
    """
    if abs(b) < FLOAT_EPSILON:
        # Clamp by magnitude rather than with max(b, FLOAT_EPSILON): max() would also swallow
        # every negative divisor and silently turn a / -2 into a / FLOAT_EPSILON.
        b = -FLOAT_EPSILON if b < 0 else FLOAT_EPSILON
    return a / b

0 commit comments

Comments
 (0)