
Commit 31bef43

rfctr: prepare to add orig_elements serde (#2668)
**Summary** The serialization and deserialization (serde) of `metadata.orig_elements` will be located in `unstructured.staging.base` alongside `elements_to_json()` and other existing serde functions. Improve the typing, readability, and structure of that module before adding the new serde functions for `metadata.orig_elements`.

**Reviewers:** The commits are well-groomed and are probably quicker to review commit-by-commit than as all files-changed at once.
1 parent 6abfb8b commit 31bef43
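For context, the existing serde helpers in `unstructured.staging.base` that the new `orig_elements` functions will sit alongside are used roughly as follows. This is a minimal sketch based on the calls exercised in the tests in this diff; the sample elements are made up purely for illustration.

    from unstructured.documents.elements import NarrativeText, Title
    from unstructured.staging import base

    # Illustrative elements only; any partitioned elements would round-trip the same way.
    elements = [Title(text="Title 1"), NarrativeText(text="Narrative 1")]

    # Serialize to a JSON string and read it back; the round trip preserves the elements.
    json_text = base.elements_to_json(elements)
    assert json_text is not None
    assert base.elements_from_json(text=json_text) == elements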

18 files changed: +268 −306 lines changed


CHANGELOG.md

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-## 0.12.7-dev7
+## 0.12.7-dev8

 ### Enhancements

test_unstructured/cleaners/__init__.py

Whitespace-only changes.

test_unstructured/documents/__init__.py

Whitespace-only changes.

test_unstructured/documents/test_elements.py

Lines changed: 10 additions & 14 deletions
@@ -10,8 +10,7 @@

 import pytest

-from unstructured.cleaners.core import clean_prefix
-from unstructured.cleaners.translate import translate_text
+from unstructured.cleaners.core import clean_bullets, clean_prefix
 from unstructured.documents.coordinates import (
     CoordinateSystem,
     Orientation,
@@ -66,13 +65,10 @@ def test_text_element_apply_cleaners():


 def test_text_element_apply_multiple_cleaners():
-    cleaners = [
-        partial(clean_prefix, pattern=r"\[\d{1,2}\]"),
-        partial(translate_text, target_lang="ru"),
-    ]
-    text_element = Text(text="[1] A Textbook on Crocodile Habitats")
+    cleaners = [partial(clean_prefix, pattern=r"\[\d{1,2}\]"), partial(clean_bullets)]
+    text_element = Text(text="[1] \u2022 A Textbook on Crocodile Habitats")
     text_element.apply(*cleaners)
-    assert str(text_element) == "Учебник по крокодильным средам обитания"
+    assert str(text_element) == "A Textbook on Crocodile Habitats"


 def test_apply_raises_if_func_does_not_produce_string():
@@ -82,7 +78,7 @@ def bad_cleaner(s: str):
     text_element = Text(text="[1] A Textbook on Crocodile Habitats")

     with pytest.raises(ValueError, match="Cleaner produced a non-string output."):
-        text_element.apply(bad_cleaner)  # pyright: ignore[reportGeneralTypeIssues]
+        text_element.apply(bad_cleaner)  # pyright: ignore[reportArgumentType]


 @pytest.mark.parametrize(
@@ -241,7 +237,7 @@ class DescribeElementMetadata:

     def it_detects_unknown_constructor_args_at_both_development_time_and_runtime(self):
         with pytest.raises(TypeError, match="got an unexpected keyword argument 'file_name'"):
-            ElementMetadata(file_name="memo.docx")  # pyright: ignore[reportGeneralTypeIssues]
+            ElementMetadata(file_name="memo.docx")  # pyright: ignore[reportCallIssue]

     @pytest.mark.parametrize(
         "file_path",
@@ -289,9 +285,9 @@ def but_it_prefers_a_specified_file_directory_when_filename_also_contains_a_path

     def it_knows_the_types_of_its_known_members_so_type_checking_support_is_available(self):
         ElementMetadata(
-            category_depth="2",  # pyright: ignore[reportGeneralTypeIssues]
-            file_directory=True,  # pyright: ignore[reportGeneralTypeIssues]
-            text_as_html=42,  # pyright: ignore[reportGeneralTypeIssues]
+            category_depth="2",  # pyright: ignore[reportArgumentType]
+            file_directory=True,  # pyright: ignore[reportArgumentType]
+            text_as_html=42,  # pyright: ignore[reportArgumentType]
         )
         # -- it does not check types at runtime however (choosing to avoid validation overhead) --

@@ -526,7 +522,7 @@ def it_can_update_itself_from_another_instance(self):
     def but_it_raises_on_attempt_to_update_from_a_non_ElementMetadata_object(self):
         meta = ElementMetadata()
         with pytest.raises(ValueError, match=r"ate\(\)' must be an instance of 'ElementMetadata'"):
-            meta.update({"coefficient": "0.56"})  # pyright: ignore[reportGeneralTypeIssues]
+            meta.update({"coefficient": "0.56"})  # pyright: ignore[reportArgumentType]

     # -- It knows when it is equal to another instance -------------------------------------------

test_unstructured/embed/__init__.py

Whitespace-only changes.

test_unstructured/file_utils/__init__.py

Whitespace-only changes.

test_unstructured/metrics/__init__.py

Whitespace-only changes.

test_unstructured/partition/utils/__init__.py

Whitespace-only changes.

test_unstructured/staging/__init__.py

Whitespace-only changes.

test_unstructured/staging/test_base_staging.py renamed to test_unstructured/staging/test_base.py

Lines changed: 34 additions & 73 deletions
@@ -31,14 +31,9 @@
 from unstructured.staging import base


-@pytest.fixture()
-def output_csv_file(tmp_path):
-    return os.path.join(tmp_path, "isd_data.csv")
-
-
-def test_convert_to_isd():
+def test_elements_to_dicts():
     elements = [Title(text="Title 1"), NarrativeText(text="Narrative 1")]
-    isd = base.convert_to_isd(elements)
+    isd = base.elements_to_dicts(elements)

     assert isd[0]["text"] == "Title 1"
     assert isd[0]["type"] == ElementType.TITLE
@@ -47,16 +42,16 @@ def test_convert_to_isd():
     assert isd[1]["type"] == "NarrativeText"


-def test_isd_to_elements():
-    isd = [
+def test_elements_from_dicts():
+    element_dicts = [
         {"text": "Blurb1", "type": "NarrativeText"},
         {"text": "Blurb2", "type": "Title"},
         {"text": "Blurb3", "type": "ListItem"},
         {"text": "Blurb4", "type": "BulletedText"},
         {"text": "No Type"},
     ]

-    elements = base.isd_to_elements(isd)
+    elements = base.elements_from_dicts(element_dicts)
     assert elements == [
         NarrativeText(text="Blurb1"),
         Title(text="Blurb2"),
@@ -65,13 +60,14 @@ def test_isd_to_elements():
     ]


-def test_convert_to_csv(output_csv_file):
+def test_convert_to_csv(tmp_path: str):
+    output_csv_path = os.path.join(tmp_path, "isd_data.csv")
     elements = [Title(text="Title 1"), NarrativeText(text="Narrative 1")]
-    with open(output_csv_file, "w+") as csv_file:
+    with open(output_csv_path, "w+") as csv_file:
         isd_csv_string = base.convert_to_csv(elements)
         csv_file.write(isd_csv_string)

-    with open(output_csv_file) as csv_file:
+    with open(output_csv_path) as csv_file:
         csv_rows = csv.DictReader(csv_file)
         assert all(set(row.keys()) == set(base.TABLE_FIELDNAMES) for row in csv_rows)

@@ -85,15 +81,13 @@ def test_convert_to_dataframe():
             "text": ["Title 1", "Narrative 1"],
         },
     )
-    assert df.type.equals(expected_df.type) is True
-    assert df.text.equals(expected_df.text) is True
+    assert df.type.equals(expected_df.type) is True  # type: ignore
+    assert df.text.equals(expected_df.text) is True  # type: ignore


-def test_convert_to_dataframe_maintains_fields(
-    filename="example-docs/eml/fake-email-attachment.eml",
-):
+def test_convert_to_dataframe_maintains_fields():
     elements = partition_email(
-        filename=filename,
+        "example-docs/eml/fake-email-attachment.eml",
         process_attachements=True,
         regex_metadata={"hello": r"Hello", "punc": r"[!]"},
     )
@@ -109,10 +103,7 @@ def test_convert_to_dataframe_maintains_fields(


 def test_default_pandas_dtypes():
-    """
-    Make sure that all the values that can exist on an element have a corresponding dtype
-    mapped in the dict returned by get_default_pandas_dtypes()
-    """
+    """Ensure all element fields have a dtype in dict returned by get_default_pandas_dtypes()."""
     full_element = Text(
         text="some text",
         element_id="123",
@@ -165,8 +156,7 @@ def test_default_pandas_dtypes():
     element_as_dict = full_element.to_dict()
     element_as_dict.update(
         base.flatten_dict(
-            element_as_dict.pop("metadata"),
-            keys_to_omit=["data_source_record_locator"],
+            element_as_dict.pop("metadata"), keys_to_omit=["data_source_record_locator"]
         ),
     )
     flattened_element_keys = element_as_dict.keys()
@@ -180,13 +170,13 @@ def test_default_pandas_dtypes():
     platform.system() == "Windows",
     reason="Posix Paths are not available on Windows",
 )
-def test_convert_to_isd_serializes_with_posix_paths():
+def test_elements_to_dicts_serializes_with_posix_paths():
     metadata = ElementMetadata(filename=pathlib.PosixPath("../../fake-file.txt"))
     elements = [
         Title(text="Title 1", metadata=metadata),
         NarrativeText(text="Narrative 1", metadata=metadata),
     ]
-    output = base.convert_to_isd(elements)
+    output = base.elements_to_dicts(elements)
     # NOTE(robinson) - json.dumps should run without raising an exception
     json.dumps(output)

@@ -205,11 +195,11 @@ def test_all_elements_preserved_when_serialized():
         PageBreak(text=""),
     ]

-    isd = base.convert_to_isd(elements)
-    assert base.convert_to_isd(base.isd_to_elements(isd)) == isd
+    element_dicts = base.elements_to_dicts(elements)
+    assert base.elements_to_dicts(base.elements_from_dicts(element_dicts)) == element_dicts


-def test_serialized_deserialize_elements_to_json(tmpdir):
+def test_serialized_deserialize_elements_to_json(tmpdir: str):
     filename = os.path.join(tmpdir, "fake-elements.json")
     metadata = ElementMetadata(filename="fake-file.txt")
     elements = [
@@ -229,63 +219,38 @@ def test_serialized_deserialize_elements_to_json(tmpdir):
     assert elements == new_elements_filename

     elements_str = base.elements_to_json(elements)
+    assert elements_str is not None
     new_elements_text = base.elements_from_json(text=elements_str)
     assert elements == new_elements_text


-def test_read_and_write_json_with_encoding(
-    filename="example-docs/fake-text-utf-16-be.txt",
-):
-    elements = partition_text(filename=filename)
+def test_read_and_write_json_with_encoding():
+    elements = partition_text("example-docs/fake-text-utf-16-be.txt")
     with NamedTemporaryFile() as tempfile:
         base.elements_to_json(elements, filename=tempfile.name, encoding="utf-16")
-        new_elements_filename = base.elements_from_json(
-            filename=tempfile.name,
-            encoding="utf-16",
-        )
+        new_elements_filename = base.elements_from_json(filename=tempfile.name, encoding="utf-16")
     assert elements == new_elements_filename


-def test_filter_element_types_with_include_element_type(
-    filename="example-docs/fake-text.txt",
-):
+def test_filter_element_types_with_include_element_type():
     element_types = [Title]
-    elements = partition_text(
-        filename=filename,
-        include_metadata=False,
-    )
-    elements = base.filter_element_types(
-        elements=elements,
-        include_element_types=element_types,
-    )
+    elements = partition_text("example-docs/fake-text.txt", include_metadata=False)
+    elements = base.filter_element_types(elements=elements, include_element_types=element_types)
     for element in elements:
         assert type(element) in element_types


-def test_filter_element_types_with_exclude_element_type(
-    filename="example-docs/fake-text.txt",
-):
+def test_filter_element_types_with_exclude_element_type():
     element_types = [Title]
-    elements = partition_text(
-        filename=filename,
-        include_metadata=False,
-    )
-    elements = base.filter_element_types(
-        elements=elements,
-        exclude_element_types=element_types,
-    )
+    elements = partition_text("example-docs/fake-text.txt", include_metadata=False)
+    elements = base.filter_element_types(elements=elements, exclude_element_types=element_types)
     for element in elements:
         assert type(element) not in element_types


-def test_filter_element_types_with_exclude_and_include_element_type(
-    filename="example-docs/fake-text.txt",
-):
+def test_filter_element_types_with_exclude_and_include_element_type():
     element_types = [Title]
-    elements = partition_text(
-        filename=filename,
-        include_metadata=False,
-    )
+    elements = partition_text("example-docs/fake-text.txt", include_metadata=False)
     with pytest.raises(ValueError):
         elements = base.filter_element_types(
             elements=elements,
@@ -527,13 +492,9 @@ def test_flatten_dict_flatten_list_omit_keys4():

 def test_flatten_empty_dict():
     """Flattening an empty dictionary"""
-    dictionary = {}
-    expected_result = {}
-    assert base.flatten_dict(dictionary) == expected_result
+    assert base.flatten_dict({}) == {}


 def test_flatten_dict_empty_lists():
     """Flattening a dictionary with empty lists"""
-    dictionary = {"a": [], "b": {"c": []}}
-    expected_result = {"a": [], "b_c": []}
-    assert base.flatten_dict(dictionary) == expected_result
+    assert base.flatten_dict({"a": [], "b": {"c": []}}) == {"a": [], "b_c": []}
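As a usage note on the renames above (`convert_to_isd()` → `elements_to_dicts()` and `isd_to_elements()` → `elements_from_dicts()`), the dict-based round trip now reads like the following sketch, which mirrors `test_all_elements_preserved_when_serialized` with illustrative elements:

    from unstructured.documents.elements import NarrativeText, Title
    from unstructured.staging import base

    # Illustrative elements only, standing in for any partitioned document.
    elements = [Title(text="Title 1"), NarrativeText(text="Narrative 1")]

    # Convert elements to plain dicts and back; the round trip preserves the dict form.
    element_dicts = base.elements_to_dicts(elements)
    assert base.elements_to_dicts(base.elements_from_dicts(element_dicts)) == element_dicts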
