Skip to content

Commit 99fff81

Browse files
ds-filipknefel and Filip Knefel authored
fix: receive csv when output_format=text/csv (#373)
Fixed a bug where `output_format=text/csv` would return JSON-formatted output. The `Accept` header is no longer used to determine the output format, except in the `multipart/mixed` case. --------- Co-authored-by: Filip Knefel <[email protected]>
1 parent 0e72b67 commit 99fff81

File tree

4 files changed

+45
-34
lines changed

4 files changed

+45
-34
lines changed

README.md

Lines changed: 9 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -322,14 +322,15 @@ The output format can also be set to `text/csv` to get the data in csv format ra
322322

323323
The response will be a list of the extracted elements in csv format:
324324
```
325-
"type,text,element_id,filename,page_number,url,sent_from,sent_to,subject,sender\n
326-
UncategorizedText,\"Hi,\",bc50944723f014607ad612b6983944a7,alert.eml,1,,['Mallori Harrell <[email protected]>'],['Mallori Harrell <[email protected]>'],ALERT: Stolen Lunch,Mallori Harrell <[email protected]>\n
327-
NarrativeText,\"It has come to our attention that as of 9:00am this morning, Harold's lunch is missing. If this was done in error please return the lunch immediately to the fridge on the 2nd floor by noon.\",51944d1f63f9472edb165fb3c9e5c525,alert.eml,1,,['Mallori Harrell <[email protected]>'],['Mallori Harrell <[email protected]>'],ALERT: Stolen Lunch,Mallori Harrell <[email protected]>\n
328-
NarrativeText,\"If the lunch has not been returned by noon, we will be reviewing camera footage to determine who stole Harold's lunch.\",8e8f9e2e50e39e072fda08d277aa77b9,alert.eml,1,,['Mallori Harrell <[email protected]>'],['Mallori Harrell <[email protected]>'],ALERT: Stolen Lunch,Mallori Harrell <[email protected]>\n
329-
NarrativeText,The perpetrators will be PUNISHED to the full extent of our employee code of conduct handbook.,736a826679b971f594103fd9751e5c8f,alert.eml,1,,['Mallori Harrell <[email protected]>'],['Mallori Harrell <[email protected]>'],ALERT: Stolen Lunch,Mallori Harrell <[email protected]>\n
330-
UncategorizedText,\"Thank you for your time,\",3eeae5f64dab54c52dd5fff779808071,alert.eml,1,,['Mallori Harrell <[email protected]>'],['Mallori Harrell <[email protected]>'],ALERT: Stolen Lunch,Mallori Harrell <[email protected]>\n
331-
Title,Unstructured Technologies,d5b612de8cd918addd9569b0255b65b2,alert.eml,1,,['Mallori Harrell <[email protected]>'],['Mallori Harrell <[email protected]>'],ALERT: Stolen Lunch,Mallori Harrell <[email protected]>\n
332-
Title,Data Scientist,46b174f1ec7c25d23e5e50ffff0cc55b,alert.eml,1,,['Mallori Harrell <[email protected]>'],['Mallori Harrell <[email protected]>'],ALERT: Stolen Lunch,Mallori Harrell <[email protected]>\n"
325+
type,element_id,text,filename,sent_from,sent_to,subject,languages,filetype
326+
UncategorizedText,db1ca22813f01feda8759ff04a844e56,"Hi All,",family-day.eml,['Mallori Harrell <[email protected]>'],['Mallori Harrell <[email protected]>'],Family Day,['eng'],message/rfc822
327+
NarrativeText,a663c393a5e143c01ef2bb5c98efa2c1,Get excited for our first annual family day! ,family-day.eml,['Mallori Harrell <[email protected]>'],['Mallori Harrell <[email protected]>'],Family Day,['eng'],message/rfc822
328+
NarrativeText,ce65ca3bef59957d3f1c2bab5725c82f,"There will be face painting, a petting zoo, funnel cake and more.",family-day.eml,['Mallori Harrell <[email protected]>'],['Mallori Harrell <[email protected]>'],Family Day,['eng'],message/rfc822
329+
NarrativeText,d7bcf988af9f06042d83e25c531e5744,Make sure to RSVP!,family-day.eml,['Mallori Harrell <[email protected]>'],['Mallori Harrell <[email protected]>'],Family Day,['eng'],message/rfc822
330+
Title,5550577db69c2c8aabcd90979698120a,Best.,family-day.eml,['Mallori Harrell <[email protected]>'],['Mallori Harrell <[email protected]>'],Family Day,['eng'],message/rfc822
331+
Title,ca1c571d993b6c1ed8ef56a06c16ba22,Mallori Harrell,family-day.eml,['Mallori Harrell <[email protected]>'],['Mallori Harrell <[email protected]>'],Family Day,['eng'],message/rfc822
332+
Title,d5b612de8cd918addd9569b0255b65b2,Unstructured Technologies,family-day.eml,['Mallori Harrell <[email protected]>'],['Mallori Harrell <[email protected]>'],Family Day,['eng'],message/rfc822
333+
Title,2e0b9e8ee04b9594a9c26d8535b818ff,Data Scientist,family-day.eml,['Mallori Harrell <[email protected]>'],['Mallori Harrell <[email protected]>'],Family Day,['eng'],message/rfc822
333334
```
334335

335336
#### Parallel Mode for PDFs

prepline_general/api/general.py

Lines changed: 6 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -751,12 +751,6 @@ def general_partition(
751751
files[file_index], form_params.gz_uncompressed_content_type
752752
)
753753

754-
default_response_type = form_params.output_format or "application/json"
755-
if not content_type or content_type == "*/*" or content_type == "multipart/mixed":
756-
media_type = default_response_type
757-
else:
758-
media_type = content_type
759-
760754
def response_generator(is_multipart: bool):
761755
for file in files:
762756
file_content_type = get_validated_mimetype(file)
@@ -775,7 +769,7 @@ def response_generator(is_multipart: bool):
775769
skip_infer_table_types=form_params.skip_infer_table_types,
776770
strategy=form_params.strategy,
777771
xml_keep_tags=form_params.xml_keep_tags,
778-
response_type=media_type,
772+
response_type=form_params.output_format,
779773
filename=str(file.filename),
780774
file_content_type=file_content_type,
781775
languages=form_params.languages,
@@ -790,27 +784,12 @@ def response_generator(is_multipart: bool):
790784
overlap_all=form_params.overlap_all,
791785
)
792786

793-
if not is_compatible_response_type(media_type, type(response)):
794-
raise HTTPException(
795-
detail=(
796-
f"Conflict in media type {media_type}"
797-
f" with response type {type(response)}.\n"
798-
),
799-
status_code=status.HTTP_406_NOT_ACCEPTABLE,
800-
)
801-
802-
if media_type not in ["application/json", "text/csv", "*/*", "multipart/mixed"]:
803-
raise HTTPException(
804-
detail=f"Unsupported media type {media_type}.\n",
805-
status_code=status.HTTP_406_NOT_ACCEPTABLE,
806-
)
807-
808787
yield (
809788
json.dumps(response)
810789
if is_multipart and type(response) not in [str, bytes]
811790
else (
812791
PlainTextResponse(response)
813-
if not is_multipart and media_type == "text/csv"
792+
if not is_multipart and form_params.output_format == "text/csv"
814793
else response
815794
)
816795
)
@@ -819,7 +798,7 @@ def join_responses(
819798
responses: Sequence[str | List[Dict[str, Any]] | PlainTextResponse]
820799
) -> List[str | List[Dict[str, Any]]] | PlainTextResponse:
821800
"""Consolidate partitionings from multiple documents into single response payload."""
822-
if media_type != "text/csv":
801+
if form_params.output_format != "text/csv":
823802
return cast(List[Union[str, List[Dict[str, Any]]]], responses)
824803
responses = cast(List[PlainTextResponse], responses)
825804
data = pd.read_csv( # pyright: ignore[reportUnknownMemberType]
@@ -836,7 +815,9 @@ def join_responses(
836815
return PlainTextResponse(data.to_csv())
837816

838817
return (
839-
MultipartMixedResponse(response_generator(is_multipart=True), content_type=media_type)
818+
MultipartMixedResponse(
819+
response_generator(is_multipart=True), content_type=form_params.output_format
820+
)
840821
if content_type == "multipart/mixed"
841822
else (
842823
list(response_generator(is_multipart=False))[0]

prepline_general/api/models/form_params.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -85,7 +85,7 @@ def as_form(
8585
),
8686
] = None,
8787
output_format: Annotated[
88-
str,
88+
Literal["application/json", "text/csv"],
8989
Form(
9090
title="Output Format",
9191
description="The format of the response. Supported formats are application/json and text/csv. Default: application/json.",

test_general/api/test_app.py

Lines changed: 29 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -975,3 +975,32 @@ def test_get_request():
975975
response = client.get("/general/v0/general")
976976
assert response.status_code == 405
977977
assert response.json() == {"detail": "Only POST requests are supported."}
978+
979+
980+
def test_output_format_csv():
981+
client = TestClient(app)
982+
test_file = Path("sample-docs") / "family-day.eml"
983+
response = client.post(
984+
MAIN_API_ROUTE,
985+
files=[("files", (str(test_file), open(test_file, "rb")))],
986+
data={"output_format": "text/csv"},
987+
)
988+
assert response.status_code == 200
989+
df = pd.read_csv(io.StringIO(response.text))
990+
assert len(df) == 8
991+
assert df["text"][3] == "Make sure to RSVP!"
992+
993+
994+
def test_output_format_csv_ignore_specified_accept_header():
995+
client = TestClient(app)
996+
test_file = Path("sample-docs") / "family-day.eml"
997+
response = client.post(
998+
MAIN_API_ROUTE,
999+
files=[("files", (str(test_file), open(test_file, "rb")))],
1000+
data={"output_format": "text/csv"},
1001+
headers={"accept": "application/json"},
1002+
)
1003+
assert response.status_code == 200
1004+
df = pd.read_csv(io.StringIO(response.text))
1005+
assert len(df) == 8
1006+
assert df["text"][3] == "Make sure to RSVP!"

0 commit comments

Comments
 (0)