From 45965c3dea1c3e2843030dd5059a63f9512a1e32 Mon Sep 17 00:00:00 2001
From: CyMule <nhfranck@gmail.com>
Date: Mon, 23 Jun 2025 09:28:39 -0400
Subject: [PATCH 1/2] fix: Use content based file type detection instead of
 client headers

---
 prepline_general/api/filetypes.py | 30 +++++++--------------------
 prepline_general/api/general.py   |  4 +---
 test_general/api/test_app.py      | 34 +++++++++++++++++++++++++++++++
 3 files changed, 42 insertions(+), 26 deletions(-)

diff --git a/prepline_general/api/filetypes.py b/prepline_general/api/filetypes.py
index c010afaa6..6441af436 100644
--- a/prepline_general/api/filetypes.py
+++ b/prepline_general/api/filetypes.py
@@ -17,34 +17,18 @@ def _remove_optional_info_from_mime_type(content_type: str | None) -> str | None
     return content_type.split(";")[0]
 
 
-def get_validated_mimetype(file: UploadFile, content_type_hint: str | None = None) -> Optional[str]:
+def get_validated_mimetype(file: UploadFile) -> Optional[str]:
     """Given the incoming file, identify and return the correct mimetype.
 
-    Order of operations:
-    - If user passed content_type as a form param, take it as truth.
-    - Otherwise, use file.content_type (as set by the Content-Type header)
-    - If no content_type was passed and the header wasn't useful, call the library's detect_filetype
+    Always inspects the actual file bytes to determine the true file type. The client's
+    Content-Type header and content_type form param can be misleading, so neither is used.
 
     Once we have a filteype, check is_partitionable and return 400 if we don't support this file.
     """
-    content_type: str | None = None
-
-    if content_type_hint is not None:
-        content_type = content_type_hint
-    else:
-        content_type = _remove_optional_info_from_mime_type(file.content_type)
-
-    filetype = FileType.from_mime_type(content_type)
-
-    # If content_type was not specified, use the library to identify the file
-    # We inspect the bytes to do this, so we need to buffer the file
-    if not filetype or filetype == FileType.UNK:
-        file_buffer = BytesIO(file.file.read())
-        file.file.seek(0)
-
-        file_buffer.name = file.filename
-
-        filetype = detect_filetype(file=file_buffer)
+    file_buffer = BytesIO(file.file.read())
+    file.file.seek(0)
+    file_buffer.name = file.filename
+    filetype = detect_filetype(file=file_buffer)
 
     if not filetype.is_partitionable:
         raise HTTPException(
diff --git a/prepline_general/api/general.py b/prepline_general/api/general.py
index 61d996d2d..b2eba7433 100644
--- a/prepline_general/api/general.py
+++ b/prepline_general/api/general.py
@@ -684,9 +684,7 @@ def general_partition(
 
     def response_generator(is_multipart: bool):
         for file in files:
-            file_content_type = get_validated_mimetype(
-                file, content_type_hint=form_params.content_type
-            )
+            file_content_type = get_validated_mimetype(file)
 
             _file = file.file
 
diff --git a/test_general/api/test_app.py b/test_general/api/test_app.py
index bc41708bb..636a15dba 100644
--- a/test_general/api/test_app.py
+++ b/test_general/api/test_app.py
@@ -1201,3 +1201,37 @@ def test_include_slide_notes(monkeypatch, test_default, include_slide_notes, tes
         assert "Here are important notes" == df["text"][0]
     else:
         assert "Here are important notes" != df["text"][0]
+
+
+def test_text_file_with_pdf_extension_detected_correctly():
+    """
+    Verify that a text file with a .pdf extension is correctly detected as text/plain
+    instead of failing as a malformed PDF.
+
+    This test validates that the API inspects actual file content rather than
+    trusting client-provided Content-Type headers based on file extensions.
+    """
+    client = TestClient(app)
+
+    with tempfile.NamedTemporaryFile(suffix=".pdf", mode="w", delete=False) as temp_file:
+        temp_file.write("This is simple text content, not a PDF file.")
+        temp_file_path = temp_file.name
+
+    try:
+        # Upload the file without explicitly setting content type
+        # The client will auto-detect Content-Type as application/pdf based on .pdf extension
+        with open(temp_file_path, "rb") as f:
+            response = client.post(
+                MAIN_API_ROUTE, files=[("files", (temp_file_path, f))], data={"strategy": "fast"}
+            )
+
+        assert response.status_code == 200
+
+        elements = response.json()
+        assert len(elements) > 0
+        assert any("This is simple text content" in elem["text"] for elem in elements)
+
+        assert all(elem["metadata"]["filetype"] == "text/plain" for elem in elements)
+
+    finally:
+        os.unlink(temp_file_path)

From b70aa4b6a8272cfaf558c39dfc366f79ab62131d Mon Sep 17 00:00:00 2001
From: CyMule <nhfranck@gmail.com>
Date: Mon, 23 Jun 2025 10:20:35 -0400
Subject: [PATCH 2/2] test: Send explicit PDF Content-Type in detection test

---
 test_general/api/test_app.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/test_general/api/test_app.py b/test_general/api/test_app.py
index 636a15dba..6a6280c65 100644
--- a/test_general/api/test_app.py
+++ b/test_general/api/test_app.py
@@ -1218,11 +1218,13 @@ def test_text_file_with_pdf_extension_detected_correctly():
         temp_file_path = temp_file.name
 
     try:
-        # Upload the file without explicitly setting content type
-        # The client will auto-detect Content-Type as application/pdf based on .pdf extension
+        # Upload the file with explicit PDF content type to test that the API
+        # ignores client-provided Content-Type and inspects actual file content
         with open(temp_file_path, "rb") as f:
             response = client.post(
-                MAIN_API_ROUTE, files=[("files", (temp_file_path, f))], data={"strategy": "fast"}
+                MAIN_API_ROUTE,
+                files=[("files", (temp_file_path, f, "application/pdf"))],
+                data={"strategy": "fast"},
             )
 
         assert response.status_code == 200