Skip to content

Commit 0289ca3

Browse files
authored
fix: handle encoding for text file checks (#707)
* fixed encoding issue for _is_text_file_a_json
* changelog and version
1 parent b2b92ea commit 0289ca3

File tree

5 files changed

+35
-9
lines changed

5 files changed

+35
-9
lines changed

CHANGELOG.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
## 0.7.3-dev2
1+
## 0.7.3-dev3
22

33
### Enhancements
44

test_unstructured/file_utils/test_filetype.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -395,3 +395,16 @@ def test_is_text_file_a_csv(content, expected):
395395

396396
with BytesIO(content) as f:
397397
assert _is_text_file_a_csv(file=f) == expected
398+
399+
400+
def test_csv_json_check_with_filename_and_utf_32(filename="example-docs/fake-text-utf-32.txt"):
401+
assert _is_text_file_a_csv(filename=filename) is False
402+
assert _is_text_file_a_json(filename=filename) is False
403+
404+
405+
def test_csv_json_check_with_file_and_utf_32(filename="example-docs/fake-text-utf-32.txt"):
406+
with open(filename, "rb") as f:
407+
assert _is_text_file_a_csv(file=f) is False
408+
409+
with open(filename, "rb") as f:
410+
assert _is_text_file_a_json(file=f) is False

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.7.3-dev2" # pragma: no cover
1+
__version__ = "0.7.3-dev3" # pragma: no cover

unstructured/file_utils/filetype.py

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from typing import IO, Callable, List, Optional
88

99
from unstructured.documents.elements import Element, PageBreak
10+
from unstructured.file_utils.encoding import detect_file_encoding
1011
from unstructured.nlp.patterns import LIST_OF_DICTS_PATTERN
1112
from unstructured.partition.common import (
1213
_add_element_metadata,
@@ -190,6 +191,7 @@ def detect_filetype(
190191
content_type: Optional[str] = None,
191192
file: Optional[IO] = None,
192193
file_filename: Optional[str] = None,
194+
encoding: Optional[str] = "utf-8",
193195
) -> Optional[FileType]:
194196
"""Use libmagic to determine a file's type. Helps determine which partition brick
195197
to use for a given file. A return value of None indicates a non-supported file type.
@@ -257,10 +259,10 @@ def detect_filetype(
257259
elif extension and extension == ".html":
258260
return FileType.HTML
259261

260-
if _is_text_file_a_json(file=file, filename=filename):
262+
if _is_text_file_a_json(file=file, filename=filename, encoding=encoding):
261263
return FileType.JSON
262264

263-
if _is_text_file_a_csv(file=file, filename=filename):
265+
if _is_text_file_a_csv(file=file, filename=filename, encoding=encoding):
264266
return FileType.CSV
265267

266268
if file and not extension and _check_eml_from_buffer(file=file) is True:
@@ -333,6 +335,7 @@ def _detect_filetype_from_octet_stream(file: IO) -> FileType:
333335
def _read_file_start_for_type_check(
334336
filename: Optional[str] = None,
335337
file: Optional[IO] = None,
338+
encoding: Optional[str] = "utf-8",
336339
) -> str:
337340
"""Reads the start of the file and returns the text content."""
338341
exactly_one(filename=filename, file=file)
@@ -345,26 +348,33 @@ def _read_file_start_for_type_check(
345348
file_text = file_content.decode(errors="ignore")
346349
file.seek(0)
347350
if filename is not None:
348-
with open(filename) as f:
349-
file_text = f.read(4096)
351+
try:
352+
with open(filename, encoding=encoding) as f:
353+
file_text = f.read(4096)
354+
except UnicodeDecodeError:
355+
encoding, _ = detect_file_encoding(filename=filename)
356+
with open(filename, encoding=encoding) as f:
357+
file_text = f.read(4096)
350358
return file_text
351359

352360

353361
def _is_text_file_a_json(
354362
filename: Optional[str] = None,
355363
file: Optional[IO] = None,
364+
encoding: Optional[str] = "utf-8",
356365
):
357366
"""Detects if a file that has a text/plain MIME type is a JSON file."""
358-
file_text = _read_file_start_for_type_check(file=file, filename=filename)
367+
file_text = _read_file_start_for_type_check(file=file, filename=filename, encoding=encoding)
359368
return re.match(LIST_OF_DICTS_PATTERN, file_text) is not None
360369

361370

362371
def _is_text_file_a_csv(
363372
filename: Optional[str] = None,
364373
file: Optional[IO] = None,
374+
encoding: Optional[str] = "utf-8",
365375
):
366376
"""Detects if a file that has a text/plain MIME type is a CSV file."""
367-
file_text = _read_file_start_for_type_check(file=file, filename=filename)
377+
file_text = _read_file_start_for_type_check(file=file, filename=filename, encoding=encoding)
368378
lines = file_text.strip().splitlines()
369379
if len(lines) < 2:
370380
return False

unstructured/partition/auto.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,7 @@ def partition(
112112
file=file,
113113
file_filename=file_filename,
114114
content_type=content_type,
115+
encoding=encoding,
115116
)
116117

117118
if file is not None:
@@ -230,5 +231,7 @@ def file_and_type_from_url(
230231
file = io.BytesIO(response.content)
231232

232233
content_type = content_type or response.headers.get("Content-Type")
233-
filetype = detect_filetype(file=file, content_type=content_type)
234+
encoding = response.headers.get("Content-Encoding", "utf-8")
235+
236+
filetype = detect_filetype(file=file, content_type=content_type, encoding=encoding)
234237
return file, filetype

0 commit comments

Comments
 (0)