Skip to content

Commit e99e5a8

Browse files
authored
rfctr(file): make FileType enum a file-type descriptor (#3411)
**Summary**

Elaborate the `FileType` enum to be a complete descriptor of file-types. Add methods to allow `STR_TO_FILETYPE`, `EXT_TO_FILETYPE` and `FILETYPE_TO_MIMETYPE` mappings to be replaced, removing those redundant and noisy declarations. In the process, fix some lingering file-type identification and `.metadata.filetype` errors that had been skipped in the tests.

**Additional Context**

Gathering the various attributes of a file-type into the `FileType` enum eliminates the duplication inherent in the separate `STR_TO_FILETYPE` etc. mappings and makes access to those values convenient for callers. These attributes include what MIME-type a file-type should record in metadata and what MIME-types and extensions map to that file-type. These values and others are made available as methods and properties directly on the `FileType` class and members. Because all attributes are defined in the `FileType` enum there is no risk of inconsistency across multiple locations and any changes happen in one and only one place. Further attributes and methods will be added in later commits to support other file-type related operations like mapping to a partitioner and verifying its dependencies are installed.
1 parent 35ee6bf commit e99e5a8

File tree

31 files changed

+326
-336
lines changed

31 files changed

+326
-336
lines changed

CHANGELOG.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
## 0.15.0-dev14
1+
## 0.15.0-dev15
22

33
### Enhancements
44

test_unstructured/file_utils/test_exploration.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import pytest
66

77
from unstructured.file_utils import exploration
8-
from unstructured.file_utils.filetype import FileType
8+
from unstructured.file_utils.model import FileType
99

1010
DIRECTORY = pathlib.Path(__file__).parent.resolve()
1111

test_unstructured/file_utils/test_filetype.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,13 +22,13 @@
2222
)
2323
from unstructured.file_utils import filetype
2424
from unstructured.file_utils.filetype import (
25-
FileType,
2625
_detect_filetype_from_octet_stream,
2726
_is_code_mime_type,
2827
_is_text_file_a_csv,
2928
_is_text_file_a_json,
3029
detect_filetype,
3130
)
31+
from unstructured.file_utils.model import FileType
3232

3333
is_in_docker = os.path.exists("/.dockerenv")
3434

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
"""Test suite for `unstructured.file_utils.model`."""
2+
3+
from __future__ import annotations
4+
5+
import pytest
6+
7+
from unstructured.file_utils.model import FileType
8+
9+
10+
class DescribeFileType:
11+
"""Unit-test suite for `unstructured.file_utils.model.FileType`."""
12+
13+
@pytest.mark.parametrize(
14+
("ext", "file_type"),
15+
[
16+
(".bmp", FileType.BMP),
17+
(".html", FileType.HTML),
18+
(".eml", FileType.EML),
19+
(".p7s", FileType.EML),
20+
(".java", FileType.TXT),
21+
],
22+
)
23+
def it_can_recognize_a_file_type_from_an_extension(self, ext: str, file_type: FileType | None):
24+
assert FileType.from_extension(ext) is file_type
25+
26+
@pytest.mark.parametrize("ext", [".foobar", ".xyz", ".mdx", "", "."])
27+
def but_not_when_that_extension_is_empty_or_not_registered(self, ext: str):
28+
assert FileType.from_extension(ext) is None
29+
30+
@pytest.mark.parametrize(
31+
("mime_type", "file_type"),
32+
[
33+
("image/bmp", FileType.BMP),
34+
("text/x-csv", FileType.CSV),
35+
("application/msword", FileType.DOC),
36+
("message/rfc822", FileType.EML),
37+
("text/plain", FileType.TXT),
38+
("text/yaml", FileType.TXT),
39+
("application/xml", FileType.XML),
40+
("text/xml", FileType.XML),
41+
("inode/x-empty", FileType.EMPTY),
42+
],
43+
)
44+
def it_can_recognize_a_file_type_from_a_mime_type(
45+
self, mime_type: str, file_type: FileType | None
46+
):
47+
assert FileType.from_mime_type(mime_type) is file_type
48+
49+
@pytest.mark.parametrize("mime_type", ["text/css", "image/gif", "audio/mpeg", "foo/bar"])
50+
def but_not_when_that_mime_type_is_not_registered_by_a_file_type(self, mime_type: str):
51+
assert FileType.from_mime_type(mime_type) is None
52+
53+
@pytest.mark.parametrize(
54+
("file_type", "mime_type"),
55+
[
56+
(FileType.BMP, "image/bmp"),
57+
(FileType.CSV, "text/csv"),
58+
(FileType.DOC, "application/msword"),
59+
(FileType.EML, "message/rfc822"),
60+
(FileType.HTML, "text/html"),
61+
(FileType.JPG, "image/jpeg"),
62+
(FileType.PDF, "application/pdf"),
63+
(FileType.TXT, "text/plain"),
64+
(FileType.XML, "application/xml"),
65+
(FileType.EMPTY, "inode/x-empty"),
66+
(FileType.UNK, "application/octet-stream"),
67+
],
68+
)
69+
def it_knows_its_canonical_MIME_type(self, file_type: FileType, mime_type: str):
70+
assert file_type.mime_type == mime_type

test_unstructured/partition/test_auto.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@
4545
Text,
4646
Title,
4747
)
48-
from unstructured.file_utils.filetype import FILETYPE_TO_MIMETYPE, FileType
48+
from unstructured.file_utils.model import FileType
4949
from unstructured.partition import auto
5050
from unstructured.partition.auto import IMAGE_FILETYPES, _get_partition_with_extras, partition
5151
from unstructured.partition.utils.constants import PartitionStrategy
@@ -1245,7 +1245,7 @@ def test_auto_partition_applies_the_correct_filetype_for_all_filetypes(filetype:
12451245

12461246
assert elements
12471247
assert all(
1248-
e.metadata.filetype == FILETYPE_TO_MIMETYPE[filetype]
1248+
e.metadata.filetype == filetype.mime_type
12491249
for e in elements
12501250
if e.metadata.filetype is not None
12511251
)

test_unstructured/partition/test_json.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,8 @@
1010
from pytest_mock import MockFixture
1111

1212
from unstructured.documents.elements import CompositeElement
13-
from unstructured.file_utils.filetype import FileType, detect_filetype
13+
from unstructured.file_utils.filetype import detect_filetype
14+
from unstructured.file_utils.model import FileType
1415
from unstructured.partition.email import partition_email
1516
from unstructured.partition.html import partition_html
1617
from unstructured.partition.json import partition_json

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.15.0-dev14" # pragma: no cover
1+
__version__ = "0.15.0-dev15" # pragma: no cover

unstructured/file_utils/filetype.py

Lines changed: 24 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -12,13 +12,7 @@
1212

1313
from unstructured.documents.elements import Element
1414
from unstructured.file_utils.encoding import detect_file_encoding, format_encoding_str
15-
from unstructured.file_utils.model import (
16-
EXT_TO_FILETYPE,
17-
FILETYPE_TO_MIMETYPE,
18-
PLAIN_TEXT_EXTENSIONS,
19-
STR_TO_FILETYPE,
20-
FileType,
21-
)
15+
from unstructured.file_utils.model import PLAIN_TEXT_EXTENSIONS, FileType
2216
from unstructured.logger import logger
2317
from unstructured.nlp.patterns import EMAIL_HEAD_RE, LIST_OF_DICTS_PATTERN
2418
from unstructured.partition.common import (
@@ -49,9 +43,9 @@ def detect_filetype(
4943

5044
# first check (content_type)
5145
if content_type:
52-
filetype = STR_TO_FILETYPE.get(content_type)
53-
if filetype:
54-
return filetype
46+
file_type = FileType.from_mime_type(content_type)
47+
if file_type:
48+
return file_type
5549

5650
# second check (filename/file_name/file)
5751
# continue if successfully define mime_type
@@ -68,7 +62,7 @@ def detect_filetype(
6862

6963
mime_type = ft.guess_mime(_filename)
7064
if mime_type is None:
71-
return EXT_TO_FILETYPE.get(extension, FileType.UNK)
65+
return FileType.from_extension(extension) or FileType.UNK
7266

7367
elif file is not None:
7468
if hasattr(file, "name"):
@@ -92,7 +86,7 @@ def detect_filetype(
9286
"libmagic is unavailable but assists in filetype detection on file-like objects. "
9387
"Please consider installing libmagic for better results.",
9488
)
95-
return EXT_TO_FILETYPE.get(extension, FileType.UNK)
89+
return FileType.from_extension(extension) or FileType.UNK
9690

9791
else:
9892
raise ValueError("No filename, file, nor file_filename were specified.")
@@ -128,7 +122,7 @@ def detect_filetype(
128122
".tsv",
129123
".json",
130124
]:
131-
return EXT_TO_FILETYPE.get(extension)
125+
return FileType.from_extension(extension)
132126

133127
# NOTE(crag): for older versions of the OS libmagic package, such as is currently
134128
# installed on the Unstructured docker image, .json files resolve to "text/plain"
@@ -151,11 +145,11 @@ def detect_filetype(
151145
return FileType.EML
152146

153147
if extension in PLAIN_TEXT_EXTENSIONS:
154-
return EXT_TO_FILETYPE.get(extension, FileType.UNK)
148+
return FileType.from_extension(extension) or FileType.UNK
155149

156150
# Safety catch
157-
if mime_type in STR_TO_FILETYPE:
158-
return STR_TO_FILETYPE[mime_type]
151+
if file_type := FileType.from_mime_type(mime_type):
152+
return file_type
159153

160154
return FileType.TXT
161155

@@ -165,21 +159,22 @@ def detect_filetype(
165159
elif file:
166160
return _detect_filetype_from_octet_stream(file=file)
167161
else:
168-
return EXT_TO_FILETYPE.get(extension, FileType.UNK)
162+
return FileType.from_extension(extension) or FileType.UNK
169163

170164
elif mime_type == "application/zip":
171-
filetype = FileType.UNK
165+
file_type = FileType.UNK
172166
if file:
173-
filetype = _detect_filetype_from_octet_stream(file=file)
167+
file_type = _detect_filetype_from_octet_stream(file=file)
174168
elif filename is not None:
175169
with open(filename, "rb") as f:
176-
filetype = _detect_filetype_from_octet_stream(file=f)
170+
file_type = _detect_filetype_from_octet_stream(file=f)
177171

178172
extension = extension if extension else ""
179-
if filetype == FileType.UNK:
180-
return FileType.ZIP
181-
else:
182-
return EXT_TO_FILETYPE.get(extension, filetype)
173+
return (
174+
FileType.ZIP
175+
if file_type in (FileType.UNK, FileType.ZIP)
176+
else FileType.from_extension(extension) or file_type
177+
)
183178

184179
elif _is_code_mime_type(mime_type):
185180
# NOTE(robinson) - we'll treat all code files as plain text for now.
@@ -191,14 +186,14 @@ def detect_filetype(
191186
return FileType.EMPTY
192187

193188
# For everything else
194-
elif mime_type in STR_TO_FILETYPE:
195-
return STR_TO_FILETYPE[mime_type]
189+
elif file_type := FileType.from_mime_type(mime_type):
190+
return file_type
196191

197192
logger.warning(
198193
f"The MIME type{f' of {filename!r}' if filename else ''} is {mime_type!r}. "
199194
"This file type is not currently supported in unstructured.",
200195
)
201-
return EXT_TO_FILETYPE.get(extension, FileType.UNK)
196+
return FileType.from_extension(extension) or FileType.UNK
202197

203198

204199
def is_json_processable(
@@ -260,7 +255,7 @@ def _detect_filetype_from_octet_stream(file: IO[bytes]) -> FileType:
260255

261256
# Infer mime type using magic if octet-stream is not zip file
262257
mime_type = magic.from_buffer(file.read(4096), mime=True)
263-
return STR_TO_FILETYPE.get(mime_type, FileType.UNK)
258+
return FileType.from_mime_type(mime_type) or FileType.UNK
264259
logger.warning(
265260
"Could not detect the filetype from application/octet-stream MIME type.",
266261
)
@@ -439,7 +434,7 @@ def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> List[Element]:
439434
# NOTE(robinson) - Attached files have already run through this logic
440435
# in their own partitioning function
441436
if element.metadata.attached_to_filename is None:
442-
add_element_metadata(element, filetype=FILETYPE_TO_MIMETYPE[filetype])
437+
add_element_metadata(element, filetype=filetype.mime_type)
443438

444439
return elements
445440
else:

0 commit comments

Comments
 (0)