From 45965c3dea1c3e2843030dd5059a63f9512a1e32 Mon Sep 17 00:00:00 2001 From: CyMule Date: Mon, 23 Jun 2025 09:28:39 -0400 Subject: [PATCH 1/2] fix: Use content based file type detection instead of client headers --- prepline_general/api/filetypes.py | 30 +++++++-------------------- prepline_general/api/general.py | 4 +--- test_general/api/test_app.py | 34 +++++++++++++++++++++++++++++++ 3 files changed, 42 insertions(+), 26 deletions(-) diff --git a/prepline_general/api/filetypes.py b/prepline_general/api/filetypes.py index c010afaa6..6441af436 100644 --- a/prepline_general/api/filetypes.py +++ b/prepline_general/api/filetypes.py @@ -17,34 +17,18 @@ def _remove_optional_info_from_mime_type(content_type: str | None) -> str | None return content_type.split(";")[0] -def get_validated_mimetype(file: UploadFile, content_type_hint: str | None = None) -> Optional[str]: +def get_validated_mimetype(file: UploadFile) -> Optional[str]: """Given the incoming file, identify and return the correct mimetype. - Order of operations: - - If user passed content_type as a form param, take it as truth. - - Otherwise, use file.content_type (as set by the Content-Type header) - - If no content_type was passed and the header wasn't useful, call the library's detect_filetype + Always inspects the actual file bytes to determine the true file type, + ignoring client-provided Content-Type headers which can be misleading. Once we have a filteype, check is_partitionable and return 400 if we don't support this file. """ - content_type: str | None = None - - if content_type_hint is not None: - content_type = content_type_hint - else: - content_type = _remove_optional_info_from_mime_type(file.content_type) - - filetype = FileType.from_mime_type(content_type) - - # If content_type was not specified, use the library to identify the file - # We inspect the bytes to do this, so we need to buffer the file - if not filetype or filetype == FileType.UNK: - file_buffer = BytesIO(file.file.read()) - file.file.seek(0) - - file_buffer.name = file.filename - - filetype = detect_filetype(file=file_buffer) + file_buffer = BytesIO(file.file.read()) + file.file.seek(0) + file_buffer.name = file.filename + filetype = detect_filetype(file=file_buffer) if not filetype.is_partitionable: raise HTTPException( diff --git a/prepline_general/api/general.py b/prepline_general/api/general.py index 61d996d2d..b2eba7433 100644 --- a/prepline_general/api/general.py +++ b/prepline_general/api/general.py @@ -684,9 +684,7 @@ def general_partition( def response_generator(is_multipart: bool): for file in files: - file_content_type = get_validated_mimetype( - file, content_type_hint=form_params.content_type - ) + file_content_type = get_validated_mimetype(file) _file = file.file diff --git a/test_general/api/test_app.py b/test_general/api/test_app.py index bc41708bb..636a15dba 100644 --- a/test_general/api/test_app.py +++ b/test_general/api/test_app.py @@ -1201,3 +1201,37 @@ def test_include_slide_notes(monkeypatch, test_default, include_slide_notes, tes assert "Here are important notes" == df["text"][0] else: assert "Here are important notes" != df["text"][0] + + +def test_text_file_with_pdf_extension_detected_correctly(): + """ + Verify that a text file with a .pdf extension is correctly detected as text/plain + instead of failing as a malformed PDF. + + This test validates that the API inspects actual file content rather than + trusting client-provided Content-Type headers based on file extensions. + """ + client = TestClient(app) + + with tempfile.NamedTemporaryFile(suffix=".pdf", mode="w", delete=False) as temp_file: + temp_file.write("This is simple text content, not a PDF file.") + temp_file_path = temp_file.name + + try: + # Upload the file without explicitly setting content type + # The client will auto-detect Content-Type as application/pdf based on .pdf extension + with open(temp_file_path, "rb") as f: + response = client.post( + MAIN_API_ROUTE, files=[("files", (temp_file_path, f))], data={"strategy": "fast"} + ) + + assert response.status_code == 200 + + elements = response.json() + assert len(elements) > 0 + assert any("This is simple text content" in elem["text"] for elem in elements) + + assert all(elem["metadata"]["filetype"] == "text/plain" for elem in elements) + + finally: + os.unlink(temp_file_path) From b70aa4b6a8272cfaf558c39dfc366f79ab62131d Mon Sep 17 00:00:00 2001 From: CyMule Date: Mon, 23 Jun 2025 10:20:35 -0400 Subject: [PATCH 2/2] Fix test --- test_general/api/test_app.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/test_general/api/test_app.py b/test_general/api/test_app.py index 636a15dba..6a6280c65 100644 --- a/test_general/api/test_app.py +++ b/test_general/api/test_app.py @@ -1218,11 +1218,13 @@ def test_text_file_with_pdf_extension_detected_correctly(): temp_file_path = temp_file.name try: - # Upload the file without explicitly setting content type - # The client will auto-detect Content-Type as application/pdf based on .pdf extension + # Upload the file with explicit PDF content type to test that the API + # ignores client-provided Content-Type and inspects actual file content with open(temp_file_path, "rb") as f: response = client.post( - MAIN_API_ROUTE, files=[("files", (temp_file_path, f))], data={"strategy": "fast"} + MAIN_API_ROUTE, + files=[("files", (temp_file_path, f, "application/pdf"))], + data={"strategy": "fast"}, ) assert response.status_code == 200