diff --git a/prepline_general/api/filetypes.py b/prepline_general/api/filetypes.py
index c010afaa..6441af43 100644
--- a/prepline_general/api/filetypes.py
+++ b/prepline_general/api/filetypes.py
@@ -17,34 +17,18 @@ def _remove_optional_info_from_mime_type(content_type: str | None) -> str | None
     return content_type.split(";")[0]
 
 
-def get_validated_mimetype(file: UploadFile, content_type_hint: str | None = None) -> Optional[str]:
+def get_validated_mimetype(file: UploadFile) -> Optional[str]:
     """Given the incoming file, identify and return the correct mimetype.
 
-    Order of operations:
-    - If user passed content_type as a form param, take it as truth.
-    - Otherwise, use file.content_type (as set by the Content-Type header)
-    - If no content_type was passed and the header wasn't useful, call the library's detect_filetype
+    Always inspects the actual file bytes to determine the true file type,
+    ignoring client-provided Content-Type headers which can be misleading.
 
     Once we have a filteype, check is_partitionable and return 400 if we don't support this file.
     """
-    content_type: str | None = None
-
-    if content_type_hint is not None:
-        content_type = content_type_hint
-    else:
-        content_type = _remove_optional_info_from_mime_type(file.content_type)
-
-    filetype = FileType.from_mime_type(content_type)
-
-    # If content_type was not specified, use the library to identify the file
-    # We inspect the bytes to do this, so we need to buffer the file
-    if not filetype or filetype == FileType.UNK:
-        file_buffer = BytesIO(file.file.read())
-        file.file.seek(0)
-
-        file_buffer.name = file.filename
-
-        filetype = detect_filetype(file=file_buffer)
+    file_buffer = BytesIO(file.file.read())
+    file.file.seek(0)
+    file_buffer.name = file.filename
+    filetype = detect_filetype(file=file_buffer)
 
     if not filetype.is_partitionable:
         raise HTTPException(
diff --git a/prepline_general/api/general.py b/prepline_general/api/general.py
index 61d996d2..b2eba743 100644
--- a/prepline_general/api/general.py
+++ b/prepline_general/api/general.py
@@ -684,9 +684,7 @@ def general_partition(
 
     def response_generator(is_multipart: bool):
         for file in files:
-            file_content_type = get_validated_mimetype(
-                file, content_type_hint=form_params.content_type
-            )
+            file_content_type = get_validated_mimetype(file)
 
             _file = file.file
 
diff --git a/test_general/api/test_app.py b/test_general/api/test_app.py
index bc41708b..6a6280c6 100644
--- a/test_general/api/test_app.py
+++ b/test_general/api/test_app.py
@@ -1201,3 +1201,39 @@ def test_include_slide_notes(monkeypatch, test_default, include_slide_notes, tes
         assert "Here are important notes" == df["text"][0]
     else:
         assert "Here are important notes" != df["text"][0]
+
+
+def test_text_file_with_pdf_extension_detected_correctly():
+    """
+    Verify that a text file with a .pdf extension is correctly detected as text/plain
+    instead of failing as a malformed PDF.
+
+    This test validates that the API inspects actual file content rather than
+    trusting client-provided Content-Type headers based on file extensions.
+    """
+    client = TestClient(app)
+
+    with tempfile.NamedTemporaryFile(suffix=".pdf", mode="w", delete=False) as temp_file:
+        temp_file.write("This is simple text content, not a PDF file.")
+        temp_file_path = temp_file.name
+
+    try:
+        # Upload the file with explicit PDF content type to test that the API
+        # ignores client-provided Content-Type and inspects actual file content
+        with open(temp_file_path, "rb") as f:
+            response = client.post(
+                MAIN_API_ROUTE,
+                files=[("files", (temp_file_path, f, "application/pdf"))],
+                data={"strategy": "fast"},
+            )
+
+        assert response.status_code == 200
+
+        elements = response.json()
+        assert len(elements) > 0
+        assert any("This is simple text content" in elem["text"] for elem in elements)
+
+        assert all(elem["metadata"]["filetype"] == "text/plain" for elem in elements)
+
+    finally:
+        os.unlink(temp_file_path)