feat: Support encoding parameter in partition_csv (#3564)

awalker4 · web-flow · commit f440eb476cf7 · 2024-08-28T14:19:58.000Z
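Adds an `encoding` parameter to `partition_csv`. The value is passed directly to
`pd.read_csv` and is also used to decode the sample read for delimiter sniffing;
when it is omitted, UTF-8 is assumed. See the added test and the new UTF-16 example file.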
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,9 +1,11 @@
-## 0.15.9-dev0
+## 0.15.9-dev1
 
 ### Enhancements
 
 ### Features
 
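+* **Add support for the `encoding` parameter in `partition_csv`** `partition_csv` now accepts an `encoding` argument that is passed through to `pd.read_csv`; when it is omitted, UTF-8 is assumed.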
+
 ### Fixes
 
 * **Fix disk space leaks and Windows errors when accessing file.name on a NamedTemporaryFile** Uses of `NamedTemporaryFile(..., delete=False)` and/or uses of `file.name` of NamedTemporaryFiles have been replaced with TemporaryFileDirectory to avoid a known issue: https://docs.python.org/3/library/tempfile.html#tempfile.NamedTemporaryFile
diff --git a/example-docs/stanley-cups-utf-16.csv b/example-docs/stanley-cups-utf-16.csv
diff --git a/test_unstructured/partition/test_csv.py b/test_unstructured/partition/test_csv.py
@@ -74,6 +74,12 @@ def test_partition_csv_from_filename_with_metadata_filename():
     assert elements[0].metadata.filename == "test"
 
 
+def test_partition_csv_with_encoding():
+    elements = partition_csv(example_doc_path("stanley-cups-utf-16.csv"), encoding="utf-16")
+
+    assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
+
+
 @pytest.mark.parametrize(
     ("filename", "expected_text", "expected_table"),
     [
@@ -279,6 +285,7 @@ def it_provides_a_validating_alternate_constructor(self):
         ctx = _CsvPartitioningContext.load(
             file_path=example_doc_path("stanley-cups.csv"),
             file=None,
+            encoding=None,
             metadata_file_path=None,
             metadata_last_modified=None,
             include_header=True,
@@ -292,6 +299,7 @@ def and_the_validating_constructor_raises_on_an_invalid_context(self):
             _CsvPartitioningContext.load(
                 file_path=None,
                 file=None,
+                encoding=None,
                 metadata_file_path=None,
                 metadata_last_modified=None,
                 include_header=True,
diff --git a/typings/pandas/io/parsers/readers.pyi b/typings/pandas/io/parsers/readers.pyi
@@ -7,6 +7,7 @@ from pandas.core.frame import DataFrame
 def read_csv(
     filepath_or_buffer: str | IO[bytes],
     *,
+    encoding: str | None = ...,
     sep: str | None = ...,
     header: int | None | Literal["infer"] = ...,
 ) -> DataFrame: ...
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.15.9-dev0"  # pragma: no cover
+__version__ = "0.15.9-dev1"  # pragma: no cover
diff --git a/unstructured/partition/auto.py b/unstructured/partition/auto.py
@@ -207,6 +207,7 @@ def partition(
         elements = partition_csv(
             filename=filename,
             file=file,
+            encoding=encoding,
             infer_table_structure=infer_table_structure,
             languages=languages,
             detect_language_per_element=detect_language_per_element,
diff --git a/unstructured/partition/csv.py b/unstructured/partition/csv.py
@@ -29,6 +29,7 @@
 def partition_csv(
     filename: str | None = None,
     file: IO[bytes] | None = None,
+    encoding: str | None = None,
     metadata_filename: str | None = None,
     metadata_last_modified: str | None = None,
     include_header: bool = False,
@@ -47,6 +48,8 @@ def partition_csv(
         A string defining the target filename path.
     file
         A file-like object using "rb" mode --> open(filename, "rb").
+    encoding
+        The encoding method used to decode the text input. If None, utf-8 will be used.
     metadata_filename
         The filename to use for the metadata.
     metadata_last_modified
@@ -73,6 +76,7 @@ def partition_csv(
     ctx = _CsvPartitioningContext(
         file_path=filename,
         file=file,
+        encoding=encoding,
         metadata_file_path=metadata_filename,
         metadata_last_modified=metadata_last_modified,
         include_header=include_header,
@@ -81,7 +85,7 @@ def partition_csv(
     )
 
     with ctx.open() as file:
-        dataframe = pd.read_csv(file, header=ctx.header, sep=ctx.delimiter)
+        dataframe = pd.read_csv(file, header=ctx.header, sep=ctx.delimiter, encoding=encoding)
 
     html_text = dataframe.to_html(index=False, header=include_header, na_rep="")
     text = soupparser_fromstring(html_text).text_content()
@@ -110,6 +114,7 @@ def __init__(
         self,
         file_path: str | None = None,
         file: IO[bytes] | None = None,
+        encoding: str | None = None,
         metadata_file_path: str | None = None,
         metadata_last_modified: str | None = None,
         include_header: bool = False,
@@ -118,6 +123,7 @@ def __init__(
     ):
         self._file_path = file_path
         self._file = file
+        self._encoding = encoding
         self._metadata_file_path = metadata_file_path
         self._metadata_last_modified = metadata_last_modified
         self._include_header = include_header
@@ -129,6 +135,7 @@ def load(
         cls,
         file_path: str | None,
         file: IO[bytes] | None,
+        encoding: str | None,
         metadata_file_path: str | None,
         metadata_last_modified: str | None,
         include_header: bool,
@@ -138,6 +145,7 @@ def load(
         return cls(
             file_path=file_path,
             file=file,
+            encoding=encoding,
             metadata_file_path=metadata_file_path,
             metadata_last_modified=metadata_last_modified,
             include_header=include_header,
@@ -156,7 +164,9 @@ def delimiter(self) -> str | None:
 
         with self.open() as file:
             # -- read whole lines, sniffer can be confused by a trailing partial line --
-            data = "\n".join(ln.decode("utf-8") for ln in file.readlines(num_bytes))
+            data = "\n".join(
+                ln.decode(self._encoding or "utf-8") for ln in file.readlines(num_bytes)
+            )
 
         try:
             return sniffer.sniff(data, delimiters=",;").delimiter

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -1 +1 @@` |
| `1` | | `-__version__ = "0.15.9-dev0" # pragma: no cover` |
| | `1` | `+__version__ = "0.15.9-dev1" # pragma: no cover` |