3131from unstructured .staging import base
3232
3333
34- @pytest .fixture ()
35- def output_csv_file (tmp_path ):
36- return os .path .join (tmp_path , "isd_data.csv" )
37-
38-
39- def test_convert_to_isd ():
34+ def test_elements_to_dicts ():
4035 elements = [Title (text = "Title 1" ), NarrativeText (text = "Narrative 1" )]
41- isd = base .convert_to_isd (elements )
36+ isd = base .elements_to_dicts (elements )
4237
4338 assert isd [0 ]["text" ] == "Title 1"
4439 assert isd [0 ]["type" ] == ElementType .TITLE
@@ -47,16 +42,16 @@ def test_convert_to_isd():
4742 assert isd [1 ]["type" ] == "NarrativeText"
4843
4944
50- def test_isd_to_elements ():
51- isd = [
45+ def test_elements_from_dicts ():
46+ element_dicts = [
5247 {"text" : "Blurb1" , "type" : "NarrativeText" },
5348 {"text" : "Blurb2" , "type" : "Title" },
5449 {"text" : "Blurb3" , "type" : "ListItem" },
5550 {"text" : "Blurb4" , "type" : "BulletedText" },
5651 {"text" : "No Type" },
5752 ]
5853
59- elements = base .isd_to_elements ( isd )
54+ elements = base .elements_from_dicts ( element_dicts )
6055 assert elements == [
6156 NarrativeText (text = "Blurb1" ),
6257 Title (text = "Blurb2" ),
@@ -65,13 +60,14 @@ def test_isd_to_elements():
6560 ]
6661
6762
def test_convert_to_csv(tmp_path: pathlib.Path):
    """Every row of the CSV from `convert_to_csv()` has exactly the `TABLE_FIELDNAMES` columns.

    `tmp_path` is pytest's built-in per-test temporary directory fixture (a `pathlib.Path`).
    """
    output_csv_path = tmp_path / "isd_data.csv"
    elements = [Title(text="Title 1"), NarrativeText(text="Narrative 1")]

    # -- round-trip through a file on disk so the CSV text is parsed back from storage --
    with open(output_csv_path, "w") as csv_file:
        csv_file.write(base.convert_to_csv(elements))

    with open(output_csv_path) as csv_file:
        # -- materialize while the file is open; DictReader reads lazily --
        rows = list(csv.DictReader(csv_file))

    # -- guard against a vacuous pass: `all()` over zero rows is always True --
    assert len(rows) == len(elements)
    expected_fieldnames = set(base.TABLE_FIELDNAMES)
    assert all(set(row.keys()) == expected_fieldnames for row in rows)
7773
@@ -85,15 +81,13 @@ def test_convert_to_dataframe():
8581 "text" : ["Title 1" , "Narrative 1" ],
8682 },
8783 )
88- assert df .type .equals (expected_df .type ) is True
89- assert df .text .equals (expected_df .text ) is True
84+ assert df .type .equals (expected_df .type ) is True # type: ignore
85+ assert df .text .equals (expected_df .text ) is True # type: ignore
9086
9187
92- def test_convert_to_dataframe_maintains_fields (
93- filename = "example-docs/eml/fake-email-attachment.eml" ,
94- ):
88+ def test_convert_to_dataframe_maintains_fields ():
9589 elements = partition_email (
96- filename = filename ,
90+ "example-docs/eml/fake-email-attachment.eml" ,
9791 process_attachements = True ,
9892 regex_metadata = {"hello" : r"Hello" , "punc" : r"[!]" },
9993 )
@@ -109,10 +103,7 @@ def test_convert_to_dataframe_maintains_fields(
109103
110104
111105def test_default_pandas_dtypes ():
112- """
113- Make sure that all the values that can exist on an element have a corresponding dtype
114- mapped in the dict returned by get_default_pandas_dtypes()
115- """
106+ """Ensure all element fields have a dtype in dict returned by get_default_pandas_dtypes()."""
116107 full_element = Text (
117108 text = "some text" ,
118109 element_id = "123" ,
@@ -165,8 +156,7 @@ def test_default_pandas_dtypes():
165156 element_as_dict = full_element .to_dict ()
166157 element_as_dict .update (
167158 base .flatten_dict (
168- element_as_dict .pop ("metadata" ),
169- keys_to_omit = ["data_source_record_locator" ],
159+ element_as_dict .pop ("metadata" ), keys_to_omit = ["data_source_record_locator" ]
170160 ),
171161 )
172162 flattened_element_keys = element_as_dict .keys ()
@@ -180,13 +170,13 @@ def test_default_pandas_dtypes():
180170 platform .system () == "Windows" ,
181171 reason = "Posix Paths are not available on Windows" ,
182172)
183- def test_convert_to_isd_serializes_with_posix_paths ():
173+ def test_elements_to_dicts_serializes_with_posix_paths ():
184174 metadata = ElementMetadata (filename = pathlib .PosixPath ("../../fake-file.txt" ))
185175 elements = [
186176 Title (text = "Title 1" , metadata = metadata ),
187177 NarrativeText (text = "Narrative 1" , metadata = metadata ),
188178 ]
189- output = base .convert_to_isd (elements )
179+ output = base .elements_to_dicts (elements )
190180 # NOTE(robinson) - json.dumps should run without raising an exception
191181 json .dumps (output )
192182
@@ -205,11 +195,11 @@ def test_all_elements_preserved_when_serialized():
205195 PageBreak (text = "" ),
206196 ]
207197
208- isd = base .convert_to_isd (elements )
209- assert base .convert_to_isd (base .isd_to_elements ( isd )) == isd
198+ element_dicts = base .elements_to_dicts (elements )
199+ assert base .elements_to_dicts (base .elements_from_dicts ( element_dicts )) == element_dicts
210200
211201
212- def test_serialized_deserialize_elements_to_json (tmpdir ):
202+ def test_serialized_deserialize_elements_to_json (tmpdir : str ):
213203 filename = os .path .join (tmpdir , "fake-elements.json" )
214204 metadata = ElementMetadata (filename = "fake-file.txt" )
215205 elements = [
@@ -229,63 +219,38 @@ def test_serialized_deserialize_elements_to_json(tmpdir):
229219 assert elements == new_elements_filename
230220
231221 elements_str = base .elements_to_json (elements )
222+ assert elements_str is not None
232223 new_elements_text = base .elements_from_json (text = elements_str )
233224 assert elements == new_elements_text
234225
235226
def test_read_and_write_json_with_encoding():
    """Elements written to JSON as UTF-16 compare equal after being read back as UTF-16."""
    elements = partition_text("example-docs/fake-text-utf-16-be.txt")

    # -- named `tmp_file` (not `tempfile`) so the stdlib module name is not shadowed --
    with NamedTemporaryFile() as tmp_file:
        base.elements_to_json(elements, filename=tmp_file.name, encoding="utf-16")
        round_tripped = base.elements_from_json(filename=tmp_file.name, encoding="utf-16")

    assert elements == round_tripped
247233
248234
def test_filter_element_types_with_include_element_type():
    """Only elements whose type is listed in `include_element_types` remain after filtering."""
    element_types = [Title]
    elements = partition_text("example-docs/fake-text.txt", include_metadata=False)

    filtered = base.filter_element_types(elements=elements, include_element_types=element_types)

    # -- exact-type membership, as in the original assertion; a subclass of a listed type
    # -- would not satisfy this check
    assert all(type(element) in element_types for element in filtered)
263241
264242
def test_filter_element_types_with_exclude_element_type():
    """No element whose type is listed in `exclude_element_types` remains after filtering."""
    element_types = [Title]
    elements = partition_text("example-docs/fake-text.txt", include_metadata=False)

    filtered = base.filter_element_types(elements=elements, exclude_element_types=element_types)

    # -- exact-type membership, as in the original assertion; a subclass of a listed type
    # -- would not be caught by this check
    assert all(type(element) not in element_types for element in filtered)
279249
280250
281- def test_filter_element_types_with_exclude_and_include_element_type (
282- filename = "example-docs/fake-text.txt" ,
283- ):
251+ def test_filter_element_types_with_exclude_and_include_element_type ():
284252 element_types = [Title ]
285- elements = partition_text (
286- filename = filename ,
287- include_metadata = False ,
288- )
253+ elements = partition_text ("example-docs/fake-text.txt" , include_metadata = False )
289254 with pytest .raises (ValueError ):
290255 elements = base .filter_element_types (
291256 elements = elements ,
@@ -527,13 +492,9 @@ def test_flatten_dict_flatten_list_omit_keys4():
527492
def test_flatten_empty_dict():
    """Flattening an empty dictionary produces an empty dictionary."""
    assert base.flatten_dict({}) == {}
533496
534497
def test_flatten_dict_empty_lists():
    """Empty lists are kept as values and a nested key is flattened to `parent_child` form."""
    assert base.flatten_dict({"a": [], "b": {"c": []}}) == {"a": [], "b_c": []}
0 commit comments