Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
## 0.16.22-dev0

### Fixes

* **Handle filenames without extensions in file type detection**

## 0.16.21

### Enhancements
Expand Down
22 changes: 22 additions & 0 deletions test_unstructured/file_utils/test_filetype.py
Original file line number Diff line number Diff line change
Expand Up @@ -609,6 +609,8 @@ def and_it_derives_the_extension_from_metadata_file_path_when_file_object_has_no
None,
# -- case 2: file-like object has `.name` attribute but its value is the empty string
"",
# -- case 3: file-like object has name with no extension --
"q3_invoices",
],
)
def and_it_returns_the_empty_string_as_the_extension_when_there_are_no_file_name_sources(
Expand All @@ -621,6 +623,26 @@ def and_it_returns_the_empty_string_as_the_extension_when_there_are_no_file_name

assert _FileTypeDetectionContext(file=file).extension == ""

@pytest.mark.parametrize(
"file_name",
[
# -- case 1: no metadata file-path is provided
None,
# -- case 2: metadata file-path is the empty string
"",
# -- case 3: metadata file-path names a file with no extension --
"q3_invoices",
],
)
def and_it_returns_the_empty_string_as_the_extension_when_there_are_no_file_name_nor_metadata(
self, file_name: str | None
):
with open(example_doc_path("ideas-page.html"), "rb") as f:
file = io.BytesIO(f.read())
file.name = None

assert _FileTypeDetectionContext(file=file, metadata_file_path=file_name).extension == ""

# -- .file_head ---------------------------------------------

def it_grabs_the_first_8k_bytes_of_the_file_for_use_by_magic(self):
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.16.21" # pragma: no cover
__version__ = "0.16.22-dev0" # pragma: no cover
8 changes: 6 additions & 2 deletions unstructured/file_utils/filetype.py
Original file line number Diff line number Diff line change
Expand Up @@ -346,11 +346,15 @@ def extension(self) -> str:
# -- get from file_path, or file when it has a name (path) --
with self.open() as file:
if hasattr(file, "name") and file.name:
return os.path.splitext(file.name)[1].lower()
splitext = os.path.splitext(file.name)
if len(splitext) > 1:
return splitext[1].lower()

# -- otherwise use metadata file-path when provided --
if file_path := self._metadata_file_path:
return os.path.splitext(file_path)[1].lower()
splitext = os.path.splitext(file_path)
if len(splitext) > 1:
return splitext[1].lower()

# -- otherwise empty str means no extension, same as a path like "a/b/name-no-ext" --
return ""
Expand Down
Loading