
Commit 31bef43

rfctr: prepare to add orig_elements serde (#2668)
**Summary** The serialization and deserialization (serde) of `metadata.orig_elements` will be located in `unstructured.staging.base` alongside `elements_to_json()` and other existing serde functions. Improve the typing, readability, and structure of that module before adding the new serde functions for `metadata.orig_elements`.

**Reviewers:** The commits are well-groomed and are probably quicker to review commit-by-commit than as all files-changed at once.
1 parent 6abfb8b commit 31bef43
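For context, the existing serde helpers in `unstructured.staging.base` that the new `orig_elements` functions will sit alongside are used roughly as follows. This is a minimal sketch based on the calls exercised in the tests in this diff; the sample elements are made up purely for illustration.

    from unstructured.documents.elements import NarrativeText, Title
    from unstructured.staging import base

    # Illustrative elements only; any partitioned elements would round-trip the same way.
    elements = [Title(text="Title 1"), NarrativeText(text="Narrative 1")]

    # Serialize to a JSON string and read it back; the round trip preserves the elements.
    json_text = base.elements_to_json(elements)
    assert json_text is not None
    assert base.elements_from_json(text=json_text) == elements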

18 files changed: +268 −306 lines changed


CHANGELOG.md

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-## 0.12.7-dev7
+## 0.12.7-dev8

 ### Enhancements

test_unstructured/cleaners/__init__.py

Whitespace-only changes.

test_unstructured/documents/__init__.py

Whitespace-only changes.

test_unstructured/documents/test_elements.py

Lines changed: 10 additions & 14 deletions
@@ -10,8 +10,7 @@

 import pytest

-from unstructured.cleaners.core import clean_prefix
-from unstructured.cleaners.translate import translate_text
+from unstructured.cleaners.core import clean_bullets, clean_prefix
 from unstructured.documents.coordinates import (
     CoordinateSystem,
     Orientation,
@@ -66,13 +65,10 @@ def test_text_element_apply_cleaners():


 def test_text_element_apply_multiple_cleaners():
-    cleaners = [
-        partial(clean_prefix, pattern=r"\[\d{1,2}\]"),
-        partial(translate_text, target_lang="ru"),
-    ]
-    text_element = Text(text="[1] A Textbook on Crocodile Habitats")
+    cleaners = [partial(clean_prefix, pattern=r"\[\d{1,2}\]"), partial(clean_bullets)]
+    text_element = Text(text="[1] \u2022 A Textbook on Crocodile Habitats")
     text_element.apply(*cleaners)
-    assert str(text_element) == "Учебник по крокодильным средам обитания"
+    assert str(text_element) == "A Textbook on Crocodile Habitats"


 def test_apply_raises_if_func_does_not_produce_string():
@@ -82,7 +78,7 @@ def bad_cleaner(s: str):
     text_element = Text(text="[1] A Textbook on Crocodile Habitats")

     with pytest.raises(ValueError, match="Cleaner produced a non-string output."):
-        text_element.apply(bad_cleaner)  # pyright: ignore[reportGeneralTypeIssues]
+        text_element.apply(bad_cleaner)  # pyright: ignore[reportArgumentType]


 @pytest.mark.parametrize(
@@ -241,7 +237,7 @@ class DescribeElementMetadata:

     def it_detects_unknown_constructor_args_at_both_development_time_and_runtime(self):
         with pytest.raises(TypeError, match="got an unexpected keyword argument 'file_name'"):
-            ElementMetadata(file_name="memo.docx")  # pyright: ignore[reportGeneralTypeIssues]
+            ElementMetadata(file_name="memo.docx")  # pyright: ignore[reportCallIssue]

     @pytest.mark.parametrize(
         "file_path",
@@ -289,9 +285,9 @@ def but_it_prefers_a_specified_file_directory_when_filename_also_contains_a_path

     def it_knows_the_types_of_its_known_members_so_type_checking_support_is_available(self):
         ElementMetadata(
-            category_depth="2",  # pyright: ignore[reportGeneralTypeIssues]
-            file_directory=True,  # pyright: ignore[reportGeneralTypeIssues]
-            text_as_html=42,  # pyright: ignore[reportGeneralTypeIssues]
+            category_depth="2",  # pyright: ignore[reportArgumentType]
+            file_directory=True,  # pyright: ignore[reportArgumentType]
+            text_as_html=42,  # pyright: ignore[reportArgumentType]
         )
         # -- it does not check types at runtime however (choosing to avoid validation overhead) --

@@ -526,7 +522,7 @@ def it_can_update_itself_from_another_instance(self):
     def but_it_raises_on_attempt_to_update_from_a_non_ElementMetadata_object(self):
         meta = ElementMetadata()
         with pytest.raises(ValueError, match=r"ate\(\)' must be an instance of 'ElementMetadata'"):
-            meta.update({"coefficient": "0.56"})  # pyright: ignore[reportGeneralTypeIssues]
+            meta.update({"coefficient": "0.56"})  # pyright: ignore[reportArgumentType]

     # -- It knows when it is equal to another instance -------------------------------------------

test_unstructured/embed/__init__.py

Whitespace-only changes.

test_unstructured/file_utils/__init__.py

Whitespace-only changes.

test_unstructured/metrics/__init__.py

Whitespace-only changes.

test_unstructured/partition/utils/__init__.py

Whitespace-only changes.

test_unstructured/staging/__init__.py

Whitespace-only changes.

test_unstructured/staging/test_base_staging.py renamed to test_unstructured/staging/test_base.py

Lines changed: 34 additions & 73 deletions
@@ -31,14 +31,9 @@
 from unstructured.staging import base


-@pytest.fixture()
-def output_csv_file(tmp_path):
-    return os.path.join(tmp_path, "isd_data.csv")
-
-
-def test_convert_to_isd():
+def test_elements_to_dicts():
     elements = [Title(text="Title 1"), NarrativeText(text="Narrative 1")]
-    isd = base.convert_to_isd(elements)
+    isd = base.elements_to_dicts(elements)

     assert isd[0]["text"] == "Title 1"
     assert isd[0]["type"] == ElementType.TITLE
@@ -47,16 +42,16 @@ def test_convert_to_isd():
     assert isd[1]["type"] == "NarrativeText"


-def test_isd_to_elements():
-    isd = [
+def test_elements_from_dicts():
+    element_dicts = [
         {"text": "Blurb1", "type": "NarrativeText"},
         {"text": "Blurb2", "type": "Title"},
         {"text": "Blurb3", "type": "ListItem"},
         {"text": "Blurb4", "type": "BulletedText"},
         {"text": "No Type"},
     ]

-    elements = base.isd_to_elements(isd)
+    elements = base.elements_from_dicts(element_dicts)
     assert elements == [
         NarrativeText(text="Blurb1"),
         Title(text="Blurb2"),
@@ -65,13 +60,14 @@ def test_isd_to_elements():
     ]


-def test_convert_to_csv(output_csv_file):
+def test_convert_to_csv(tmp_path: str):
+    output_csv_path = os.path.join(tmp_path, "isd_data.csv")
     elements = [Title(text="Title 1"), NarrativeText(text="Narrative 1")]
-    with open(output_csv_file, "w+") as csv_file:
+    with open(output_csv_path, "w+") as csv_file:
         isd_csv_string = base.convert_to_csv(elements)
         csv_file.write(isd_csv_string)

-    with open(output_csv_file) as csv_file:
+    with open(output_csv_path) as csv_file:
         csv_rows = csv.DictReader(csv_file)
         assert all(set(row.keys()) == set(base.TABLE_FIELDNAMES) for row in csv_rows)

@@ -85,15 +81,13 @@ def test_convert_to_dataframe():
             "text": ["Title 1", "Narrative 1"],
         },
     )
-    assert df.type.equals(expected_df.type) is True
-    assert df.text.equals(expected_df.text) is True
+    assert df.type.equals(expected_df.type) is True  # type: ignore
+    assert df.text.equals(expected_df.text) is True  # type: ignore


-def test_convert_to_dataframe_maintains_fields(
-    filename="example-docs/eml/fake-email-attachment.eml",
-):
+def test_convert_to_dataframe_maintains_fields():
     elements = partition_email(
-        filename=filename,
+        "example-docs/eml/fake-email-attachment.eml",
         process_attachements=True,
         regex_metadata={"hello": r"Hello", "punc": r"[!]"},
     )
@@ -109,10 +103,7 @@ def test_convert_to_dataframe_maintains_fields(


 def test_default_pandas_dtypes():
-    """
-    Make sure that all the values that can exist on an element have a corresponding dtype
-    mapped in the dict returned by get_default_pandas_dtypes()
-    """
+    """Ensure all element fields have a dtype in dict returned by get_default_pandas_dtypes()."""
     full_element = Text(
         text="some text",
         element_id="123",
@@ -165,8 +156,7 @@ def test_default_pandas_dtypes():
     element_as_dict = full_element.to_dict()
     element_as_dict.update(
         base.flatten_dict(
-            element_as_dict.pop("metadata"),
-            keys_to_omit=["data_source_record_locator"],
+            element_as_dict.pop("metadata"), keys_to_omit=["data_source_record_locator"]
         ),
     )
     flattened_element_keys = element_as_dict.keys()
@@ -180,13 +170,13 @@ def test_default_pandas_dtypes():
     platform.system() == "Windows",
     reason="Posix Paths are not available on Windows",
 )
-def test_convert_to_isd_serializes_with_posix_paths():
+def test_elements_to_dicts_serializes_with_posix_paths():
     metadata = ElementMetadata(filename=pathlib.PosixPath("../../fake-file.txt"))
     elements = [
         Title(text="Title 1", metadata=metadata),
         NarrativeText(text="Narrative 1", metadata=metadata),
     ]
-    output = base.convert_to_isd(elements)
+    output = base.elements_to_dicts(elements)
     # NOTE(robinson) - json.dumps should run without raising an exception
     json.dumps(output)

@@ -205,11 +195,11 @@ def test_all_elements_preserved_when_serialized():
         PageBreak(text=""),
     ]

-    isd = base.convert_to_isd(elements)
-    assert base.convert_to_isd(base.isd_to_elements(isd)) == isd
+    element_dicts = base.elements_to_dicts(elements)
+    assert base.elements_to_dicts(base.elements_from_dicts(element_dicts)) == element_dicts


-def test_serialized_deserialize_elements_to_json(tmpdir):
+def test_serialized_deserialize_elements_to_json(tmpdir: str):
     filename = os.path.join(tmpdir, "fake-elements.json")
     metadata = ElementMetadata(filename="fake-file.txt")
     elements = [
@@ -229,63 +219,38 @@ def test_serialized_deserialize_elements_to_json(tmpdir):
     assert elements == new_elements_filename

     elements_str = base.elements_to_json(elements)
+    assert elements_str is not None
     new_elements_text = base.elements_from_json(text=elements_str)
     assert elements == new_elements_text


-def test_read_and_write_json_with_encoding(
-    filename="example-docs/fake-text-utf-16-be.txt",
-):
-    elements = partition_text(filename=filename)
+def test_read_and_write_json_with_encoding():
+    elements = partition_text("example-docs/fake-text-utf-16-be.txt")
     with NamedTemporaryFile() as tempfile:
         base.elements_to_json(elements, filename=tempfile.name, encoding="utf-16")
-        new_elements_filename = base.elements_from_json(
-            filename=tempfile.name,
-            encoding="utf-16",
-        )
+        new_elements_filename = base.elements_from_json(filename=tempfile.name, encoding="utf-16")
     assert elements == new_elements_filename


-def test_filter_element_types_with_include_element_type(
-    filename="example-docs/fake-text.txt",
-):
+def test_filter_element_types_with_include_element_type():
     element_types = [Title]
-    elements = partition_text(
-        filename=filename,
-        include_metadata=False,
-    )
-    elements = base.filter_element_types(
-        elements=elements,
-        include_element_types=element_types,
-    )
+    elements = partition_text("example-docs/fake-text.txt", include_metadata=False)
+    elements = base.filter_element_types(elements=elements, include_element_types=element_types)
     for element in elements:
         assert type(element) in element_types


-def test_filter_element_types_with_exclude_element_type(
-    filename="example-docs/fake-text.txt",
-):
+def test_filter_element_types_with_exclude_element_type():
     element_types = [Title]
-    elements = partition_text(
-        filename=filename,
-        include_metadata=False,
-    )
-    elements = base.filter_element_types(
-        elements=elements,
-        exclude_element_types=element_types,
-    )
+    elements = partition_text("example-docs/fake-text.txt", include_metadata=False)
+    elements = base.filter_element_types(elements=elements, exclude_element_types=element_types)
     for element in elements:
         assert type(element) not in element_types


-def test_filter_element_types_with_exclude_and_include_element_type(
-    filename="example-docs/fake-text.txt",
-):
+def test_filter_element_types_with_exclude_and_include_element_type():
     element_types = [Title]
-    elements = partition_text(
-        filename=filename,
-        include_metadata=False,
-    )
+    elements = partition_text("example-docs/fake-text.txt", include_metadata=False)
     with pytest.raises(ValueError):
         elements = base.filter_element_types(
             elements=elements,
@@ -527,13 +492,9 @@ def test_flatten_dict_flatten_list_omit_keys4():

 def test_flatten_empty_dict():
     """Flattening an empty dictionary"""
-    dictionary = {}
-    expected_result = {}
-    assert base.flatten_dict(dictionary) == expected_result
+    assert base.flatten_dict({}) == {}


 def test_flatten_dict_empty_lists():
     """Flattening a dictionary with empty lists"""
-    dictionary = {"a": [], "b": {"c": []}}
-    expected_result = {"a": [], "b_c": []}
-    assert base.flatten_dict(dictionary) == expected_result
+    assert base.flatten_dict({"a": [], "b": {"c": []}}) == {"a": [], "b_c": []}
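As a usage note on the renames above (`convert_to_isd()` → `elements_to_dicts()` and `isd_to_elements()` → `elements_from_dicts()`), the dict-based round trip now reads like the following sketch, which mirrors `test_all_elements_preserved_when_serialized` with illustrative elements:

    from unstructured.documents.elements import NarrativeText, Title
    from unstructured.staging import base

    # Illustrative elements only, standing in for any partitioned document.
    elements = [Title(text="Title 1"), NarrativeText(text="Narrative 1")]

    # Convert elements to plain dicts and back; the round trip preserves the dict form.
    element_dicts = base.elements_to_dicts(elements)
    assert base.elements_to_dicts(base.elements_from_dicts(element_dicts)) == element_dicts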
