Skip to content

Commit f896f42

Browse files
authored
Merge branch 'main' into pprados/fix_password
2 parents 73b191c + 9835fe4 commit f896f42

File tree

6 files changed

+228
-45
lines changed

6 files changed

+228
-45
lines changed

CHANGELOG.md

Lines changed: 40 additions & 33 deletions
Large diffs are not rendered by default.

test_unstructured/metrics/test_text_extraction.py

Lines changed: 158 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -106,20 +106,30 @@ def test_calculate_edit_distance():
106106

107107

108108
@pytest.mark.parametrize(
109-
("filename", "expected_score", "expected_distance"),
109+
("filename", "standardize_whitespaces", "expected_score", "expected_distance"),
110110
[
111-
("fake-text.txt", 0.78, 38),
111+
("fake-text.txt", False, 0.78, 38),
112+
("fake-text.txt", True, 0.92, 12),
112113
],
113114
)
114-
def test_calculate_edit_distance_with_filename(filename, expected_score, expected_distance):
115+
def test_calculate_edit_distance_with_filename(
116+
filename, standardize_whitespaces, expected_score, expected_distance
117+
):
115118
with open("example-docs/fake-text.txt") as f:
116119
source_cct = f.read()
117120

118121
elements = partition(filename=f"example-docs/{filename}")
119122
output_cct = "\n".join([str(el) for el in elements])
120123

121-
score = text_extraction.calculate_edit_distance(output_cct, source_cct, return_as="score")
122-
distance = text_extraction.calculate_edit_distance(output_cct, source_cct, return_as="distance")
124+
score = text_extraction.calculate_edit_distance(
125+
output_cct, source_cct, return_as="score", standardize_whitespaces=standardize_whitespaces
126+
)
127+
distance = text_extraction.calculate_edit_distance(
128+
output_cct,
129+
source_cct,
130+
return_as="distance",
131+
standardize_whitespaces=standardize_whitespaces,
132+
)
123133

124134
assert score >= 0
125135
assert score <= 1.0
@@ -128,6 +138,109 @@ def test_calculate_edit_distance_with_filename(filename, expected_score, expecte
128138
assert distance == expected_distance
129139

130140

141+
@pytest.mark.parametrize(
142+
("text1", "text2"),
143+
[
144+
(
145+
"The dog\rloved the cat, but\t\n the cat\tloved the\n cow",
146+
"The dog loved the cat, but the cat loved the cow",
147+
),
148+
(
149+
"Hello my\tname\tis H a r p e r, \nwhat's your\vname?",
150+
"Hello my name is H a r p e r, what's your name?",
151+
),
152+
(
153+
"I have a\t\n\tdog and a\tcat,\fI love my\n\n\n\ndog.",
154+
"I have a dog and a cat, I love my dog.",
155+
),
156+
(
157+
"""
158+
Name Age City Occupation
159+
Alice 30 New York Engineer
160+
Bob 25 Los Angeles Designer
161+
Charlie 35 Chicago Teacher
162+
David 40 San Francisco Developer
163+
""",
164+
"""
165+
Name\tAge\tCity\tOccupation
166+
Alice\t30\tNew York\tEngineer
167+
Bob\t25\tLos Angeles\tDesigner
168+
Charlie\t35\tChicago\tTeacher
169+
David\t40\tSan Francisco\tDeveloper
170+
""",
171+
),
172+
(
173+
"""
174+
Name\tAge\tCity\tOccupation
175+
Alice\t30\tNew York\tEngineer
176+
Bob\t25\tLos Angeles\tDesigner
177+
Charlie\t35\tChicago\tTeacher
178+
David\t40\tSan Francisco\tDeveloper
179+
""",
180+
"Name\tAge\tCity\tOccupation\n\n \nAlice\t30\tNew York\tEngineer\nBob\t25\tLos Angeles\tDesigner\nCharlie\t35\tChicago\tTeacher\nDavid\t40\tSan Francisco\tDeveloper", # noqa: E501
181+
),
182+
],
183+
)
184+
def test_calculate_edit_distance_with_various_whitespace_1(text1, text2):
185+
assert (
186+
text_extraction.calculate_edit_distance(
187+
text1, text2, return_as="score", standardize_whitespaces=True
188+
)
189+
== 1.0
190+
)
191+
assert (
192+
text_extraction.calculate_edit_distance(
193+
text1, text2, return_as="distance", standardize_whitespaces=True
194+
)
195+
== 0
196+
)
197+
assert (
198+
text_extraction.calculate_edit_distance(
199+
text1, text2, return_as="score", standardize_whitespaces=False
200+
)
201+
< 1.0
202+
)
203+
assert (
204+
text_extraction.calculate_edit_distance(
205+
text1, text2, return_as="distance", standardize_whitespaces=False
206+
)
207+
> 0
208+
)
209+
210+
211+
def test_calculate_edit_distance_with_various_whitespace_2():
212+
source_cct_tabs = """
213+
Name\tAge\tCity\tOccupation
214+
Alice\t30\tNew York\tEngineer
215+
Bob\t25\tLos Angeles\tDesigner
216+
Charlie\t35\tChicago\tTeacher
217+
David\t40\tSan Francisco\tDeveloper
218+
"""
219+
source_cct_with_borders = """
220+
221+
| Name | Age | City | Occupation |
222+
|---------|-----|--------------|----------------|
223+
| Alice | 30 | New York | Engineer |
224+
| Bob | 25 | Los Angeles | Designer |
225+
| Charlie | 35 | Chicago | Teacher |
226+
| David | 40 | San Francisco| Developer |
227+
228+
"""
229+
assert text_extraction.calculate_edit_distance(
230+
source_cct_tabs, source_cct_with_borders, return_as="score", standardize_whitespaces=True
231+
) > text_extraction.calculate_edit_distance(
232+
source_cct_tabs, source_cct_with_borders, return_as="score", standardize_whitespaces=False
233+
)
234+
assert text_extraction.calculate_edit_distance(
235+
source_cct_tabs, source_cct_with_borders, return_as="distance", standardize_whitespaces=True
236+
) < text_extraction.calculate_edit_distance(
237+
source_cct_tabs,
238+
source_cct_with_borders,
239+
return_as="distance",
240+
standardize_whitespaces=False,
241+
)
242+
243+
131244
@pytest.mark.parametrize(
132245
("text", "expected"),
133246
[
@@ -187,6 +300,46 @@ def test_bag_of_words(text, expected):
187300
assert text_extraction.bag_of_words(text) == expected
188301

189302

303+
@pytest.mark.parametrize(
304+
("text", "expected"),
305+
[
306+
(
307+
"The dog\rloved the cat, but\t\n the cat\tloved the\n cow\n\n",
308+
"The dog loved the cat, but the cat loved the cow",
309+
),
310+
(
311+
"\n\nHello my\tname\tis H a r p e r, \nwhat's your\vname?",
312+
"Hello my name is H a r p e r, what's your name?",
313+
),
314+
(
315+
"I have a\t\n\tdog and a\tcat,\fI love my\n\n\n\ndog.",
316+
"I have a dog and a cat, I love my dog.",
317+
),
318+
(
319+
"""L is for the way you look at me
320+
O is for the only one I see
321+
V is very, very extraordinary
322+
E is even more than anyone that you adore can""",
323+
"L is for the way you look at me O is for the only one I see V is very, very extraordinary E is even more than anyone that you adore can", # noqa: E501
324+
),
325+
(
326+
"""
327+
| Name | Age | City | Occupation |
328+
|---------|-----|--------------|----------------|
329+
| Alice | 30 | New York | Engineer |
330+
| Bob | 25 | Los Angeles | Designer |
331+
| Charlie | 35 | Chicago | Teacher |
332+
| David | 40 | San Francisco| Developer |
333+
""",
334+
"| Name | Age | City | Occupation | |---------|-----|--------------|----------------| | Alice | 30 | New York | Engineer | | Bob | 25 | Los Angeles | Designer | | Charlie | 35 | Chicago | Teacher | | David | 40 | San Francisco| Developer |", # noqa: E501
335+
),
336+
],
337+
)
338+
def test_prepare_string(text, expected):
339+
assert text_extraction.prepare_str(text, standardize_whitespaces=True) == expected
340+
assert text_extraction.prepare_str(text) == text
341+
342+
190343
@pytest.mark.parametrize(
191344
("output_text", "source_text", "expected_percentage"),
192345
[

test_unstructured/partition/test_api.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -246,6 +246,25 @@ def test_retries_config_none_parameters_return_empty_config():
246246
assert retries_config is None
247247

248248

249+
def test_retry_config_with_empty_sdk_retry_config_returns_default():
250+
sdk = Mock()
251+
sdk.sdk_configuration.retry_config = None
252+
retries_config = get_retries_config(
253+
retries_connection_errors=True,
254+
retries_exponent=1.88,
255+
retries_initial_interval=3000,
256+
retries_max_elapsed_time=None,
257+
retries_max_interval=None,
258+
sdk=sdk,
259+
)
260+
261+
assert retries_config.retry_connection_errors
262+
assert retries_config.backoff.exponent == 1.88
263+
assert retries_config.backoff.initial_interval == 3000
264+
assert retries_config.backoff.max_elapsed_time == DEFAULT_RETRIES_MAX_ELAPSED_TIME_SEC
265+
assert retries_config.backoff.max_interval == DEFAULT_RETRIES_MAX_INTERVAL_SEC
266+
267+
249268
def test_retries_config_with_no_parameters_set():
250269
retry_config = retries.RetryConfig(
251270
"backoff", retries.BackoffStrategy(3000, 720000, 1.88, 1800000), True

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.16.2-dev" # pragma: no cover
1+
__version__ = "0.16.3-dev" # pragma: no cover

unstructured/metrics/text_extraction.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ def calculate_edit_distance(
2222
source: Optional[str],
2323
weights: Tuple[int, int, int] = (2, 1, 1),
2424
return_as: str = "distance",
25+
standardize_whitespaces: bool = True,
2526
) -> float:
2627
"""
2728
Calculates edit distance using Levenshtein distance between two strings.
@@ -56,8 +57,8 @@ def calculate_edit_distance(
5657
return_types = ["score", "distance"]
5758
if return_as not in return_types:
5859
raise ValueError("Invalid return value type. Expected one of: %s" % return_types)
59-
output = _prepare_str(output)
60-
source = _prepare_str(source)
60+
output = prepare_str(output, standardize_whitespaces)
61+
source = prepare_str(source, standardize_whitespaces)
6162
distance = Levenshtein.distance(output, source, weights=weights) # type: ignore
6263
# lower-bound the char length of the source string at 1.0 to avoid division by zero;
6364
# in the case where source string is empty, the distance should be at 100%
@@ -127,8 +128,8 @@ def calculate_percent_missing_text(
127128
128129
Returns the percentage of missing text represented as a decimal between 0 and 1.
129130
"""
130-
output = _prepare_str(output)
131-
source = _prepare_str(source)
131+
output = prepare_str(output)
132+
source = prepare_str(source)
132133
output_bow = bag_of_words(output)
133134
source_bow = bag_of_words(source)
134135

@@ -153,7 +154,9 @@ def calculate_percent_missing_text(
153154
return min(fraction_missing, 1) # limit to 100%
154155

155156

156-
def _prepare_str(string: Optional[str]) -> str:
157+
def prepare_str(string: Optional[str], standardize_whitespaces: bool = False) -> str:
157158
if not string:
158159
return ""
160+
if standardize_whitespaces:
161+
return " ".join(string.split())
159162
return str(string) # type: ignore

unstructured/partition/api.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -195,7 +195,8 @@ def get_backoff_default(setting_name: str, default_value: Any) -> Any:
195195

196196
default_retries_connneciton_errors = (
197197
sdk_default_retries_config.retry_connection_errors
198-
if sdk_default_retries_config.retry_connection_errors is not None
198+
if sdk_default_retries_config
199+
and sdk_default_retries_config.retry_connection_errors is not None
199200
else DEFAULT_RETRIES_CONNECTION_ERRORS
200201
)
201202

0 commit comments

Comments
 (0)