Skip to content

Commit b2b92ea

Browse files
authored
fix: filetype detection if a CSV has a text/plain MIME type (#691)
* fix: Filetype detection if a CSV has a text/plain MIME type #621
* bug: fix csv detection and create _read_file_start_for_type_check func
* fix: Make call to _is_text_file_a_csv from detect_filetype
1 parent c1ba090 commit b2b92ea

File tree

6 files changed

+56
-12
lines changed

6 files changed

+56
-12
lines changed

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
## 0.7.3-dev1
1+
## 0.7.3-dev2
22

33
### Enhancements
44

@@ -8,6 +8,7 @@
88

99
### Fixes
1010

11+
* Filetype detection if a CSV has a `text/plain` MIME type
1112
* `convert_office_doc` no longer prints file conversion info messages to stdout.
1213
* `partition_via_api` reflects the actual filetype for the file processed in the API.
1314

test_unstructured/file_utils/test_filetype.py

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from unstructured.file_utils.filetype import (
1010
FileType,
1111
_is_code_mime_type,
12+
_is_text_file_a_csv,
1213
_is_text_file_a_json,
1314
detect_filetype,
1415
)
@@ -368,11 +369,29 @@ def test_filetype_order():
368369
@pytest.mark.parametrize(
369370
("content", "expected"),
370371
[
371-
(b"d\xe2\x80", False),
372+
(b"d\xe2\x80", False), # Invalid JSON
373+
(b'[{"key": "value"}]', True), # Valid JSON
374+
(b"", False), # Empty content
372375
],
373376
)
374377
def test_is_text_file_a_json(content, expected):
375378
from io import BytesIO
376379

377380
with BytesIO(content) as f:
378381
assert _is_text_file_a_json(file=f) == expected
382+
383+
384+
@pytest.mark.parametrize(
385+
("content", "expected"),
386+
[
387+
(b"d\xe2\x80", False), # Invalid CSV
388+
(b'[{"key": "value"}]', False), # Invalid CSV
389+
(b"column1,column2,column3\nvalue1,value2,value3\n", True), # Valid CSV
390+
(b"", False), # Empty content
391+
],
392+
)
393+
def test_is_text_file_a_csv(content, expected):
394+
from io import BytesIO
395+
396+
with BytesIO(content) as f:
397+
assert _is_text_file_a_csv(file=f) == expected

test_unstructured/partition/test_auto.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -502,7 +502,6 @@ def test_auto_partition_works_with_unstructured_jsons():
502502

503503
def test_auto_partition_works_with_unstructured_jsons_from_file():
504504
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "spring-weather.html.json")
505-
506505
with open(filename, "rb") as f:
507506
elements = partition(file=f, strategy="hi_res")
508507
assert elements[0].text == "News Around NOAA"

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.7.3-dev1" # pragma: no cover
1+
__version__ = "0.7.3-dev2" # pragma: no cover

unstructured/file_utils/filetype.py

Lines changed: 32 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -260,6 +260,9 @@ def detect_filetype(
260260
if _is_text_file_a_json(file=file, filename=filename):
261261
return FileType.JSON
262262

263+
if _is_text_file_a_csv(file=file, filename=filename):
264+
return FileType.CSV
265+
263266
if file and not extension and _check_eml_from_buffer(file=file) is True:
264267
return FileType.EML
265268

@@ -327,14 +330,12 @@ def _detect_filetype_from_octet_stream(file: IO) -> FileType:
327330
return FileType.UNK
328331

329332

330-
def _is_text_file_a_json(
333+
def _read_file_start_for_type_check(
331334
filename: Optional[str] = None,
332-
content_type: Optional[str] = None,
333335
file: Optional[IO] = None,
334-
):
335-
"""Detects if a file that has a text/plain MIME type is a JSON file."""
336+
) -> str:
337+
"""Reads the start of the file and returns the text content."""
336338
exactly_one(filename=filename, file=file)
337-
338339
if file is not None:
339340
file.seek(0)
340341
file_content = file.read(4096)
@@ -343,13 +344,37 @@ def _is_text_file_a_json(
343344
else:
344345
file_text = file_content.decode(errors="ignore")
345346
file.seek(0)
346-
elif filename is not None:
347+
if filename is not None:
347348
with open(filename) as f:
348-
file_text = f.read()
349+
file_text = f.read(4096)
350+
return file_text
351+
349352

353+
def _is_text_file_a_json(
354+
filename: Optional[str] = None,
355+
file: Optional[IO] = None,
356+
):
357+
"""Detects if a file that has a text/plain MIME type is a JSON file."""
358+
file_text = _read_file_start_for_type_check(file=file, filename=filename)
350359
return re.match(LIST_OF_DICTS_PATTERN, file_text) is not None
351360

352361

362+
def _is_text_file_a_csv(
363+
filename: Optional[str] = None,
364+
file: Optional[IO] = None,
365+
):
366+
"""Detects if a file that has a text/plain MIME type is a CSV file."""
367+
file_text = _read_file_start_for_type_check(file=file, filename=filename)
368+
lines = file_text.strip().splitlines()
369+
if len(lines) < 2:
370+
return False
371+
lines = lines[: len(lines)] if len(lines) < 10 else lines[:10]
372+
header = lines[0].split(",")
373+
if any("," not in line for line in lines):
374+
return False
375+
return all(len(line.split(",")) == len(header) for line in lines[:-1])
376+
377+
353378
def _check_eml_from_buffer(file: IO) -> bool:
354379
"""Checks if a text/plain file is actually a .eml file. Uses a regex pattern to see if the
355380
start of the file matches the typical pattern for a .eml file."""
@@ -359,7 +384,6 @@ def _check_eml_from_buffer(file: IO) -> bool:
359384
file_head = file_content.decode("utf-8", errors="ignore")
360385
else:
361386
file_head = file_content
362-
363387
return EMAIL_HEAD_RE.match(file_head) is not None
364388

365389

unstructured/nlp/patterns.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,3 +105,4 @@
105105
# NOTE(robinson) - Used to detect if text is in the expected "list of dicts"
106106
# format for document elements
107107
LIST_OF_DICTS_PATTERN = r"\A\s*\[\s*{?"
108+
JSON_PATTERN = r"^(?:\{.*\}|\[.*\])$"

0 commit comments

Comments
 (0)