Skip to content

Commit 956f04d

Browse files
feat: detect filetype with extension if libmagic is unavailable (#268)
* included the previous PR changes and verified black
* resolved the issues mentioned
* make tidy and add tests

---------

Co-authored-by: Matt Robinson <[email protected]>
Co-authored-by: Matt Robinson <[email protected]>
1 parent e419ba1 commit 956f04d

File tree

4 files changed

+60
-4
lines changed

4 files changed

+60
-4
lines changed

CHANGELOG.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,9 @@
1+
## 0.4.16-dev0
2+
3+
### Enhancements
4+
5+
* Fall back to using file extensions for filetype detection if `libmagic` is not present
6+
17
## 0.4.15
28

39
### Enhancements
@@ -199,3 +205,4 @@ of an email.
199205
## 0.2.0
200206

201207
* Initial release of unstructured
208+

test_unstructured/file_utils/test_filetype.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
import magic
77

8+
import unstructured.file_utils.filetype as filetype
89
from unstructured.file_utils.filetype import (
910
detect_filetype,
1011
FileType,
@@ -36,6 +37,27 @@ def test_detect_filetype_from_filename(file, expected):
3637
assert detect_filetype(filename) == expected
3738

3839

40+
@pytest.mark.parametrize(
41+
"file, expected",
42+
[
43+
("layout-parser-paper-fast.pdf", FileType.PDF),
44+
("fake.docx", FileType.DOCX),
45+
("example.jpg", FileType.JPG),
46+
("fake-text.txt", FileType.TXT),
47+
("fake-email.eml", FileType.EML),
48+
("factbook.xml", FileType.XML),
49+
("example-10k.html", FileType.HTML),
50+
("fake-html.html", FileType.HTML),
51+
("fake-excel.xlsx", FileType.XLSX),
52+
("fake-power-point.pptx", FileType.PPTX),
53+
],
54+
)
55+
def test_detect_filetype_from_filename_with_extension(monkeypatch, file, expected):
56+
monkeypatch.setattr(filetype, "LIBMAGIC_AVAILABLE", False)
57+
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, file)
58+
assert detect_filetype(filename) == expected
59+
60+
3961
@pytest.mark.parametrize(
4062
"file, expected",
4163
[
@@ -60,6 +82,14 @@ def test_detect_filetype_from_file(file, expected):
6082
assert detect_filetype(file=f) in expected
6183

6284

85+
def test_detect_filetype_from_file_raises_without_libmagic(monkeypatch):
86+
monkeypatch.setattr(filetype, "LIBMAGIC_AVAILABLE", False)
87+
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
88+
with open(filename, "rb") as f:
89+
with pytest.raises(ImportError):
90+
detect_filetype(file=f)
91+
92+
6393
def test_detect_xml_application_xml(monkeypatch):
6494
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/xml")
6595
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.xml")

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.4.15" # pragma: no cover
1+
__version__ = "0.4.16-dev0" # pragma: no cover

unstructured/file_utils/filetype.py

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,13 @@
33
from typing import IO, Optional
44
import zipfile
55

6-
import magic
6+
try:
7+
import magic
8+
9+
LIBMAGIC_AVAILABLE = True
10+
except ImportError: # pragma: nocover
11+
LIBMAGIC_AVAILABLE = False # pragma: nocover
12+
713

814
from unstructured.logger import logger
915
from unstructured.nlp.patterns import EMAIL_HEAD_RE
@@ -91,7 +97,9 @@ def __lt__(self, other):
9197
".pdf": FileType.PDF,
9298
".docx": FileType.DOCX,
9399
".jpg": FileType.JPG,
100+
".jpeg": FileType.JPG,
94101
".txt": FileType.TXT,
102+
".text": FileType.TXT,
95103
".eml": FileType.EML,
96104
".xml": FileType.XML,
97105
".html": FileType.HTML,
@@ -117,13 +125,24 @@ def detect_filetype(
117125
if filename:
118126
_, extension = os.path.splitext(filename)
119127
extension = extension.lower()
120-
mime_type = magic.from_file(filename, mime=True)
128+
if LIBMAGIC_AVAILABLE:
129+
mime_type = None
130+
mime_type = magic.from_file(filename, mime=True)
131+
else:
132+
return EXT_TO_FILETYPE.get(extension.lower(), FileType.UNK)
121133
elif file is not None:
122134
extension = None
123135
# NOTE(robinson) - the python-magic docs recommend reading at least the first 2048 bytes
124136
# Increased to 4096 because otherwise .xlsx files get detected as a zip file
125137
# ref: https://github.com/ahupp/python-magic#usage
126-
mime_type = magic.from_buffer(file.read(4096), mime=True)
138+
if LIBMAGIC_AVAILABLE:
139+
mime_type = magic.from_buffer(file.read(4096), mime=True)
140+
else:
141+
raise ImportError(
142+
"libmagic is unavailable. "
143+
"Filetype detection on file-like objects requires libmagic. "
144+
"Please install libmagic and try again."
145+
)
127146
else:
128147
raise ValueError("No filename nor file were specified.")
129148

0 commit comments

Comments
 (0)