Skip to content

Commit aa4d432

Browse files
authored
fix: partition_via_api reflects actual filetype in metadata (#696)
* fix: `partition_via_api` reflects actual filetype in metadata
* added in list length check
* changelog typo
1 parent dabda67 commit aa4d432

File tree

3 files changed

+38
-8
lines changed

3 files changed

+38
-8
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88

99
### Fixes
1010

11+
* `partition_via_api` reflects the actual filetype for the file processed in the API.
12+
1113
## 0.7.2
1214

1315
### Enhancements

test_unstructured/partition/test_api.py

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,11 +32,15 @@ def text(self):
3232
"Matthew Robinson <[email protected]>"
3333
],
3434
"subject": "Test Email",
35-
"filename": "fake-email.eml"
35+
"filename": "fake-email.eml",
36+
"filetype": "message/rfc822"
3637
}
3738
}
3839
]"""
3940

41+
def json(self):
42+
return json.loads(self.text)
43+
4044

4145
def test_partition_via_api_from_filename(monkeypatch):
4246
monkeypatch.setattr(
@@ -47,6 +51,7 @@ def test_partition_via_api_from_filename(monkeypatch):
4751
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email.eml")
4852
elements = partition_via_api(filename=filename, api_key="FAKEROO")
4953
assert elements[0] == NarrativeText("This is a test email to use for unit tests.")
54+
assert elements[0].metadata.filetype == "message/rfc822"
5055

5156

5257
def test_partition_via_api_from_file(monkeypatch):
@@ -60,6 +65,7 @@ def test_partition_via_api_from_file(monkeypatch):
6065
with open(filename, "rb") as f:
6166
elements = partition_via_api(file=f, file_filename=filename, api_key="FAKEROO")
6267
assert elements[0] == NarrativeText("This is a test email to use for unit tests.")
68+
assert elements[0].metadata.filetype == "message/rfc822"
6369

6470

6571
def test_partition_via_api_from_file_raises_without_filename(monkeypatch):
@@ -110,7 +116,8 @@ def text(self):
110116
"Matthew Robinson <[email protected]>"
111117
],
112118
"subject": "Test Email",
113-
"filename": "fake-email.eml"
119+
"filename": "fake-email.eml",
120+
"filetype": "message/rfc822"
114121
}
115122
}
116123
],
@@ -128,13 +135,27 @@ def text(self):
128135
"Matthew Robinson <[email protected]>"
129136
],
130137
"subject": "Test Email",
131-
"filename": "fake-email.eml"
138+
"filename": "fake-email.eml",
139+
"filetype": "message/rfc822"
132140
}
133141
}
134142
]
135143
]"""
136144

137145

146+
def test_partition_multiple_via_api_with_single_filename(monkeypatch):
147+
monkeypatch.setattr(
148+
requests,
149+
"post",
150+
lambda *args, **kwargs: MockResponse(status_code=200),
151+
)
152+
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email.eml")
153+
154+
elements = partition_multiple_via_api(filenames=[filename], api_key="FAKEROO")
155+
assert elements[0][0] == NarrativeText("This is a test email to use for unit tests.")
156+
assert elements[0][0].metadata.filetype == "message/rfc822"
157+
158+
138159
def test_partition_multiple_via_api_from_filenames(monkeypatch):
139160
monkeypatch.setattr(
140161
requests,
@@ -150,6 +171,7 @@ def test_partition_multiple_via_api_from_filenames(monkeypatch):
150171
elements = partition_multiple_via_api(filenames=filenames, api_key="FAKEROO")
151172
assert len(elements) == 2
152173
assert elements[0][0] == NarrativeText("This is a test email to use for unit tests.")
174+
assert elements[0][0].metadata.filetype == "message/rfc822"
153175

154176

155177
def test_partition_multiple_via_api_from_files(monkeypatch):
@@ -173,6 +195,7 @@ def test_partition_multiple_via_api_from_files(monkeypatch):
173195
)
174196
assert len(elements) == 2
175197
assert elements[0][0] == NarrativeText("This is a test email to use for unit tests.")
198+
assert elements[0][0].metadata.filetype == "message/rfc822"
176199

177200

178201
def test_partition_multiple_via_api_raises_with_bad_response(monkeypatch):

unstructured/partition/api.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import contextlib
2-
import json
32
from typing import (
43
IO,
54
List,
@@ -10,7 +9,7 @@
109

1110
from unstructured.documents.elements import Element
1211
from unstructured.partition.common import exactly_one
13-
from unstructured.partition.json import partition_json
12+
from unstructured.staging.base import dict_to_elements, elements_from_json
1413

1514

1615
def partition_via_api(
@@ -82,7 +81,7 @@ def partition_via_api(
8281
response = requests.post(api_url, headers=headers, data=data, files=files) # type: ignore
8382

8483
if response.status_code == 200:
85-
return partition_json(text=response.text)
84+
return elements_from_json(text=response.text)
8685
else:
8786
raise ValueError(
8887
f"Receive unexpected status code {response.status_code} from the API.",
@@ -172,8 +171,14 @@ def partition_multiple_via_api(
172171

173172
if response.status_code == 200:
174173
documents = []
175-
for document in response.json():
176-
documents.append(partition_json(text=json.dumps(document)))
174+
response_list = response.json()
175+
# NOTE(robinson) - this check is because if only one filename is passed, the return
176+
# type from the API is a list of objects instead of a list of lists
177+
if not isinstance(response_list[0], list):
178+
response_list = [response_list]
179+
180+
for document in response_list:
181+
documents.append(dict_to_elements(document))
177182
return documents
178183
else:
179184
raise ValueError(

0 commit comments

Comments
 (0)