Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 7 additions & 23 deletions prepline_general/api/filetypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,34 +17,18 @@ def _remove_optional_info_from_mime_type(content_type: str | None) -> str | None
return content_type.split(";")[0]


def get_validated_mimetype(file: UploadFile, content_type_hint: str | None = None) -> Optional[str]:
def get_validated_mimetype(file: UploadFile) -> Optional[str]:
"""Given the incoming file, identify and return the correct mimetype.

Order of operations:
- If user passed content_type as a form param, take it as truth.
- Otherwise, use file.content_type (as set by the Content-Type header)
- If no content_type was passed and the header wasn't useful, call the library's detect_filetype
Always inspects the actual file bytes to determine the true file type,
ignoring client-provided Content-Type headers which can be misleading.

Once we have a filetype, check is_partitionable and return 400 if we don't support this file.
"""
content_type: str | None = None

if content_type_hint is not None:
content_type = content_type_hint
else:
content_type = _remove_optional_info_from_mime_type(file.content_type)

filetype = FileType.from_mime_type(content_type)

# If content_type was not specified, use the library to identify the file
# We inspect the bytes to do this, so we need to buffer the file
if not filetype or filetype == FileType.UNK:
file_buffer = BytesIO(file.file.read())
file.file.seek(0)

file_buffer.name = file.filename

filetype = detect_filetype(file=file_buffer)
file_buffer = BytesIO(file.file.read())
file.file.seek(0)
file_buffer.name = file.filename
filetype = detect_filetype(file=file_buffer)

if not filetype.is_partitionable:
raise HTTPException(
Expand Down
4 changes: 1 addition & 3 deletions prepline_general/api/general.py
Original file line number Diff line number Diff line change
Expand Up @@ -684,9 +684,7 @@ def general_partition(

def response_generator(is_multipart: bool):
for file in files:
file_content_type = get_validated_mimetype(
file, content_type_hint=form_params.content_type
)
file_content_type = get_validated_mimetype(file)

_file = file.file

Expand Down
36 changes: 36 additions & 0 deletions test_general/api/test_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -1201,3 +1201,39 @@ def test_include_slide_notes(monkeypatch, test_default, include_slide_notes, tes
assert "Here are important notes" == df["text"][0]
else:
assert "Here are important notes" != df["text"][0]


def test_text_file_with_pdf_extension_detected_correctly():
    """Plain text uploaded under a ``.pdf`` name must come back as text/plain.

    The upload deliberately declares ``application/pdf`` as its Content-Type.
    A 200 response whose elements all report ``text/plain`` shows the payload
    was classified from its content, not from the extension or the
    client-supplied header, and was not rejected as a malformed PDF.
    """
    api_client = TestClient(app)

    # delete=False keeps the file on disk once the write handle closes so it
    # can be reopened in binary mode below; the finally clause removes it.
    with tempfile.NamedTemporaryFile(suffix=".pdf", mode="w", delete=False) as fake_pdf:
        fake_pdf.write("This is simple text content, not a PDF file.")
        fake_pdf_path = fake_pdf.name

    try:
        with open(fake_pdf_path, "rb") as payload:
            # The multipart part is explicitly labelled as a PDF on purpose.
            upload_part = ("files", (fake_pdf_path, payload, "application/pdf"))
            response = api_client.post(
                MAIN_API_ROUTE,
                files=[upload_part],
                data={"strategy": "fast"},
            )

        assert response.status_code == 200

        parsed = response.json()
        assert len(parsed) > 0
        assert any("This is simple text content" in item["text"] for item in parsed)

        assert all(item["metadata"]["filetype"] == "text/plain" for item in parsed)
    finally:
        os.unlink(fake_pdf_path)
Loading