Skip to content

Commit f440eb4

Browse files
authored
feat: Support encoding parameter in partition_csv (#3564)
Adds support for an `encoding` parameter in `partition_csv`; the value is passed directly to `pd.read_csv`. See the added test file.
1 parent f21c853 commit f440eb4

File tree

7 files changed

+26
-4
lines changed

7 files changed

+26
-4
lines changed

CHANGELOG.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
1-
## 0.15.9-dev0
1+
## 0.15.9-dev1
22

33
### Enhancements
44

55
### Features
66

7+
* **Add support for the `encoding` parameter in `partition_csv`**
8+
79
### Fixes
810

911
* **Fix disk space leaks and Windows errors when accessing file.name on a NamedTemporaryFile** Uses of `NamedTemporaryFile(..., delete=False)` and/or uses of `file.name` on a `NamedTemporaryFile` have been replaced with `TemporaryDirectory` to avoid a known issue: https://docs.python.org/3/library/tempfile.html#tempfile.NamedTemporaryFile
174 Bytes
Binary file not shown.

test_unstructured/partition/test_csv.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,12 @@ def test_partition_csv_from_filename_with_metadata_filename():
7474
assert elements[0].metadata.filename == "test"
7575

7676

77+
def test_partition_csv_with_encoding():
78+
elements = partition_csv(example_doc_path("stanley-cups-utf-16.csv"), encoding="utf-16")
79+
80+
assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
81+
82+
7783
@pytest.mark.parametrize(
7884
("filename", "expected_text", "expected_table"),
7985
[
@@ -279,6 +285,7 @@ def it_provides_a_validating_alternate_constructor(self):
279285
ctx = _CsvPartitioningContext.load(
280286
file_path=example_doc_path("stanley-cups.csv"),
281287
file=None,
288+
encoding=None,
282289
metadata_file_path=None,
283290
metadata_last_modified=None,
284291
include_header=True,
@@ -292,6 +299,7 @@ def and_the_validating_constructor_raises_on_an_invalid_context(self):
292299
_CsvPartitioningContext.load(
293300
file_path=None,
294301
file=None,
302+
encoding=None,
295303
metadata_file_path=None,
296304
metadata_last_modified=None,
297305
include_header=True,

typings/pandas/io/parsers/readers.pyi

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ from pandas.core.frame import DataFrame
77
def read_csv(
88
filepath_or_buffer: str | IO[bytes],
99
*,
10+
encoding: str | None = ...,
1011
sep: str | None = ...,
1112
header: int | None | Literal["infer"] = ...,
1213
) -> DataFrame: ...

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.15.9-dev0" # pragma: no cover
1+
__version__ = "0.15.9-dev1" # pragma: no cover

unstructured/partition/auto.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,7 @@ def partition(
207207
elements = partition_csv(
208208
filename=filename,
209209
file=file,
210+
encoding=encoding,
210211
infer_table_structure=infer_table_structure,
211212
languages=languages,
212213
detect_language_per_element=detect_language_per_element,

unstructured/partition/csv.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
def partition_csv(
3030
filename: str | None = None,
3131
file: IO[bytes] | None = None,
32+
encoding: str | None = None,
3233
metadata_filename: str | None = None,
3334
metadata_last_modified: str | None = None,
3435
include_header: bool = False,
@@ -47,6 +48,8 @@ def partition_csv(
4748
A string defining the target filename path.
4849
file
4950
A file-like object using "rb" mode --> open(filename, "rb").
51+
encoding
52+
The encoding method used to decode the text input. If None, utf-8 will be used.
5053
metadata_filename
5154
The filename to use for the metadata.
5255
metadata_last_modified
@@ -73,6 +76,7 @@ def partition_csv(
7376
ctx = _CsvPartitioningContext(
7477
file_path=filename,
7578
file=file,
79+
encoding=encoding,
7680
metadata_file_path=metadata_filename,
7781
metadata_last_modified=metadata_last_modified,
7882
include_header=include_header,
@@ -81,7 +85,7 @@ def partition_csv(
8185
)
8286

8387
with ctx.open() as file:
84-
dataframe = pd.read_csv(file, header=ctx.header, sep=ctx.delimiter)
88+
dataframe = pd.read_csv(file, header=ctx.header, sep=ctx.delimiter, encoding=encoding)
8589

8690
html_text = dataframe.to_html(index=False, header=include_header, na_rep="")
8791
text = soupparser_fromstring(html_text).text_content()
@@ -110,6 +114,7 @@ def __init__(
110114
self,
111115
file_path: str | None = None,
112116
file: IO[bytes] | None = None,
117+
encoding: str | None = None,
113118
metadata_file_path: str | None = None,
114119
metadata_last_modified: str | None = None,
115120
include_header: bool = False,
@@ -118,6 +123,7 @@ def __init__(
118123
):
119124
self._file_path = file_path
120125
self._file = file
126+
self._encoding = encoding
121127
self._metadata_file_path = metadata_file_path
122128
self._metadata_last_modified = metadata_last_modified
123129
self._include_header = include_header
@@ -129,6 +135,7 @@ def load(
129135
cls,
130136
file_path: str | None,
131137
file: IO[bytes] | None,
138+
encoding: str | None,
132139
metadata_file_path: str | None,
133140
metadata_last_modified: str | None,
134141
include_header: bool,
@@ -138,6 +145,7 @@ def load(
138145
return cls(
139146
file_path=file_path,
140147
file=file,
148+
encoding=encoding,
141149
metadata_file_path=metadata_file_path,
142150
metadata_last_modified=metadata_last_modified,
143151
include_header=include_header,
@@ -156,7 +164,9 @@ def delimiter(self) -> str | None:
156164

157165
with self.open() as file:
158166
# -- read whole lines, sniffer can be confused by a trailing partial line --
159-
data = "\n".join(ln.decode("utf-8") for ln in file.readlines(num_bytes))
167+
data = "\n".join(
168+
ln.decode(self._encoding or "utf-8") for ln in file.readlines(num_bytes)
169+
)
160170

161171
try:
162172
return sniffer.sniff(data, delimiters=",;").delimiter

0 commit comments

Comments
 (0)