Skip to content

Commit 8abf1f1

Browse files
authored
feat: partition image (#144)
Adds `partition_image` for partitioning image files (PNG and JPG) and integrates it into the `partition` brick. This requires version 0.2.2 or later of unstructured-inference.
1 parent 419c086 commit 8abf1f1

File tree

12 files changed

+277
-66
lines changed

12 files changed

+277
-66
lines changed

.github/workflows/ci.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ jobs:
9898
source .venv/bin/activate
9999
make install-nltk-models
100100
make install-detectron2
101-
sudo apt-get install -y libmagic-dev poppler-utils
101+
sudo apt-get install -y libmagic-dev poppler-utils tesseract-ocr
102102
make test
103103
make check-coverage
104104

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
## 0.4.2-dev0
2+
* Added `partition_image` to process documents in an image format.
3+
4+
15
## 0.4.1
26

37
* Added support for text files in the `partition` function

requirements/local-inference.in

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
unstructured-inference>=0.2.1
1+
unstructured-inference>=0.2.2

requirements/local-inference.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,7 @@ typing-extensions==4.4.0
150150
# starlette
151151
# torch
152152
# torchvision
153-
unstructured-inference==0.2.1
153+
unstructured-inference==0.2.2
154154
# via -r requirements/local-inference.in
155155
urllib3==1.26.13
156156
# via requests

test_unstructured/partition/test_auto.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,19 @@ def test_auto_partition_pdf_from_file():
151151
assert len(elements) > 0
152152

153153

154+
def test_auto_partition_jpg():
155+
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "..", "..", "example-docs", "example.jpg")
156+
elements = partition(filename=filename)
157+
assert len(elements) > 0
158+
159+
160+
def test_auto_partition_jpg_from_file():
161+
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "..", "..", "example-docs", "example.jpg")
162+
with open(filename, "rb") as f:
163+
elements = partition(file=f)
164+
assert len(elements) > 0
165+
166+
154167
def test_auto_partition_raises_with_bad_type(monkeypatch):
155168
monkeypatch.setattr(auto, "detect_filetype", lambda *args, **kwargs: None)
156169
with pytest.raises(ValueError):
Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
import pytest
2+
import requests
3+
from unittest import mock
4+
5+
import unstructured.partition.pdf as pdf
6+
import unstructured.partition.image as image
7+
import unstructured_inference.inference.layout as layout
8+
9+
10+
class MockResponse:
11+
def __init__(self, status_code, response):
12+
self.status_code = status_code
13+
self.response = response
14+
15+
def json(self):
16+
return self.response
17+
18+
19+
def mock_healthy_get(url, **kwargs):
20+
return MockResponse(status_code=200, response={})
21+
22+
23+
def mock_unhealthy_get(url, **kwargs):
24+
return MockResponse(status_code=500, response={})
25+
26+
27+
def mock_unsuccessful_post(url, **kwargs):
28+
return MockResponse(status_code=500, response={})
29+
30+
31+
def mock_successful_post(url, **kwargs):
32+
response = {
33+
"pages": [
34+
{
35+
"number": 0,
36+
"elements": [{"type": "Title", "text": "Charlie Brown and the Great Pumpkin"}],
37+
}
38+
]
39+
}
40+
return MockResponse(status_code=200, response=response)
41+
42+
43+
class MockPageLayout(layout.PageLayout):
44+
def __init__(self, number: int):
45+
pass
46+
47+
@property
48+
def elements(self):
49+
return [
50+
layout.LayoutElement(
51+
type="Title",
52+
coordinates=[(0, 0), (2, 2)],
53+
text="Charlie Brown and the Great Pumpkin",
54+
)
55+
]
56+
57+
58+
class MockDocumentLayout(layout.DocumentLayout):
59+
@property
60+
def pages(self):
61+
return [
62+
MockPageLayout(
63+
number=0,
64+
)
65+
]
66+
67+
68+
def test_partition_image_api(monkeypatch, filename="example-docs/example.jpg"):
69+
monkeypatch.setattr(requests, "post", mock_successful_post)
70+
monkeypatch.setattr(requests, "get", mock_healthy_get)
71+
72+
partition_image_response = pdf._partition_via_api(filename)
73+
assert partition_image_response[0]["type"] == "Title"
74+
assert partition_image_response[0]["text"] == "Charlie Brown and the Great Pumpkin"
75+
76+
77+
@pytest.mark.parametrize("filename, file", [("example-docs/example.jpg", None), (None, b"0000")])
78+
def test_partition_image_local(monkeypatch, filename, file):
79+
monkeypatch.setattr(
80+
layout, "process_data_with_model", lambda *args, **kwargs: MockDocumentLayout()
81+
)
82+
monkeypatch.setattr(
83+
layout, "process_file_with_model", lambda *args, **kwargs: MockDocumentLayout()
84+
)
85+
86+
partition_image_response = pdf._partition_pdf_or_image_local(filename, file, is_image=True)
87+
assert partition_image_response[0].type == "Title"
88+
assert partition_image_response[0].text == "Charlie Brown and the Great Pumpkin"
89+
90+
91+
@pytest.mark.skip("Needs to be fixed upstream in unstructured-inference")
92+
def test_partition_image_local_raises_with_no_filename():
93+
with pytest.raises(FileNotFoundError):
94+
pdf._partition_pdf_or_image_local(filename="", file=None, is_image=True)
95+
96+
97+
def test_partition_image_api_raises_with_failed_healthcheck(
98+
monkeypatch, filename="example-docs/example.jpg"
99+
):
100+
monkeypatch.setattr(requests, "post", mock_successful_post)
101+
monkeypatch.setattr(requests, "get", mock_unhealthy_get)
102+
103+
with pytest.raises(ValueError):
104+
pdf._partition_via_api(filename=filename, url="http://ml.unstructured.io/layout/image")
105+
106+
107+
def test_partition_image_api_raises_with_failed_api_call(
108+
monkeypatch, filename="example-docs/example.jpg"
109+
):
110+
monkeypatch.setattr(requests, "post", mock_unsuccessful_post)
111+
monkeypatch.setattr(requests, "get", mock_healthy_get)
112+
113+
with pytest.raises(ValueError):
114+
pdf._partition_via_api(filename=filename, url="http://ml.unstructured.io/layout/image")
115+
116+
117+
@pytest.mark.parametrize(
118+
"url, api_called, local_called", [("fakeurl", True, False), (None, False, True)]
119+
)
120+
def test_partition_image(url, api_called, local_called):
121+
with mock.patch.object(
122+
pdf, attribute="_partition_via_api", new=mock.MagicMock()
123+
), mock.patch.object(pdf, "_partition_pdf_or_image_local", mock.MagicMock()):
124+
image.partition_image(filename="fake.pdf", url=url)
125+
assert pdf._partition_via_api.called == api_called
126+
assert pdf._partition_pdf_or_image_local.called == local_called

test_unstructured/partition/test_pdf.py

Lines changed: 15 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ def test_partition_pdf_api(monkeypatch, filename="example-docs/layout-parser-pap
6868
monkeypatch.setattr(requests, "post", mock_successful_post)
6969
monkeypatch.setattr(requests, "get", mock_healthy_get)
7070

71-
partition_pdf_response = pdf._partition_pdf_via_api(filename)
71+
partition_pdf_response = pdf._partition_via_api(filename)
7272
assert partition_pdf_response[0]["type"] == "Title"
7373
assert partition_pdf_response[0]["text"] == "Charlie Brown and the Great Pumpkin"
7474

@@ -77,12 +77,14 @@ def test_partition_pdf_api(monkeypatch, filename="example-docs/layout-parser-pap
7777
"filename, file", [("example-docs/layout-parser-paper-fast.pdf", None), (None, b"0000")]
7878
)
7979
def test_partition_pdf_local(monkeypatch, filename, file):
80-
monkeypatch.setattr(layout, "process_data_with_model", lambda *args: MockDocumentLayout())
80+
monkeypatch.setattr(
81+
layout, "process_data_with_model", lambda *args, **kwargs: MockDocumentLayout()
82+
)
8183
monkeypatch.setattr(
8284
layout, "process_file_with_model", lambda *args, **kwargs: MockDocumentLayout()
8385
)
8486

85-
partition_pdf_response = pdf._partition_pdf_via_local(filename, file)
87+
partition_pdf_response = pdf._partition_pdf_or_image_local(filename, file)
8688
assert partition_pdf_response[0].type == "Title"
8789
assert partition_pdf_response[0].text == "Charlie Brown and the Great Pumpkin"
8890

@@ -92,15 +94,12 @@ def test_partition_pdf_api_raises_with_no_filename(monkeypatch):
9294
monkeypatch.setattr(requests, "get", mock_healthy_get)
9395

9496
with pytest.raises(FileNotFoundError):
95-
pdf._partition_pdf_via_api(filename=None, file=None)
96-
97+
pdf._partition_via_api(filename=None, file=None)
9798

98-
def test_partition_pdf_local_raises_with_no_filename(monkeypatch):
99-
monkeypatch.setattr(requests, "post", mock_successful_post)
100-
monkeypatch.setattr(requests, "get", mock_healthy_get)
10199

100+
def test_partition_pdf_local_raises_with_no_filename():
102101
with pytest.raises(FileNotFoundError):
103-
pdf._partition_pdf_via_api(filename=None, file=None)
102+
pdf._partition_pdf_or_image_local(filename="", file=None, is_image=False)
104103

105104

106105
def test_partition_pdf_api_raises_with_failed_healthcheck(
@@ -110,7 +109,7 @@ def test_partition_pdf_api_raises_with_failed_healthcheck(
110109
monkeypatch.setattr(requests, "get", mock_unhealthy_get)
111110

112111
with pytest.raises(ValueError):
113-
pdf._partition_pdf_via_api(filename=filename)
112+
pdf._partition_via_api(filename=filename)
114113

115114

116115
def test_partition_pdf_api_raises_with_failed_api_call(
@@ -120,16 +119,16 @@ def test_partition_pdf_api_raises_with_failed_api_call(
120119
monkeypatch.setattr(requests, "get", mock_healthy_get)
121120

122121
with pytest.raises(ValueError):
123-
pdf._partition_pdf_via_api(filename=filename)
122+
pdf._partition_via_api(filename=filename)
124123

125124

126125
@pytest.mark.parametrize(
127126
"url, api_called, local_called", [("fakeurl", True, False), (None, False, True)]
128127
)
129128
def test_partition_pdf(url, api_called, local_called):
130-
with mock.patch(
131-
"unstructured.partition.pdf._partition_pdf_via_api", mock.MagicMock()
132-
), mock.patch("unstructured.partition.pdf._partition_pdf_via_local", mock.MagicMock()):
129+
with mock.patch.object(
130+
pdf, attribute="_partition_via_api", new=mock.MagicMock()
131+
), mock.patch.object(pdf, "_partition_pdf_or_image_local", mock.MagicMock()):
133132
pdf.partition_pdf(filename="fake.pdf", url=url)
134-
assert pdf._partition_pdf_via_api.called == api_called
135-
assert pdf._partition_pdf_via_local.called == local_called
133+
assert pdf._partition_via_api.called == api_called
134+
assert pdf._partition_pdf_or_image_local.called == local_called

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.4.1" # pragma: no cover
1+
__version__ = "0.4.2-dev0" # pragma: no cover

unstructured/partition/__init__.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
import requests # type: ignore
2+
from typing import BinaryIO, List, Optional, Union, Tuple, Mapping
3+
4+
from unstructured.documents.elements import Element
5+
6+
7+
def _partition_via_api(
    filename: str = "",
    file: Optional[bytes] = None,
    url: str = "https://ml.unstructured.io/layout/pdf",
    token: Optional[str] = None,
    data: Optional[dict] = None,  # NOTE(alan): Remove after different models are handled by routing
) -> List[Element]:
    """Partition a document by posting it to a layout-detection API.

    Either ``filename`` or ``file`` must be provided. When ``file`` is falsy,
    the document is read from ``filename``.

    Parameters
    ----------
    filename
        Path of the document to upload. Also sent as the upload's file name.
    file
        Document content to upload instead of reading ``filename``.
    url
        Endpoint the document is posted to.
    token
        Optional bearer token. When given, it is sent in the ``Authorization``
        header and the unauthenticated healthcheck request is skipped.
    data
        Optional extra form fields forwarded unchanged to the endpoint.

    Returns
    -------
    The ``"elements"`` entries of every page in the JSON response, flattened
    into a single list in page order.

    Raises
    ------
    FileNotFoundError
        If neither ``filename`` nor ``file`` is specified.
    ValueError
        If the healthcheck or the partitioning request does not return HTTP 200.
    """
    if not filename and not file:
        raise FileNotFoundError("No filename nor file were specified")

    # The healthcheck only runs for unauthenticated calls. Previously a blank
    # ``requests.models.Response()`` (whose status_code is None) was compared
    # against 200 when a token was supplied, so every authenticated call raised
    # ValueError before the request was ever sent.
    if not token:
        # NOTE(review): the URL is concatenated without a separator, so the
        # default endpoint yields ".../layout/pdfhealthcheck" -- confirm this is
        # the intended healthcheck route.
        healthcheck_response = requests.get(url=f"{url}healthcheck")
        if healthcheck_response.status_code != 200:
            raise ValueError("endpoint api healthcheck has failed!")

    # Track any handle opened here so it is closed once the upload finishes;
    # caller-supplied content is left untouched.
    opened_file: Optional[BinaryIO] = None
    payload: Union[BinaryIO, bytes]
    if file:
        payload = file
    else:
        opened_file = open(filename, "rb")
        payload = opened_file

    try:
        file_: Mapping[str, Tuple[str, Union[BinaryIO, bytes]]] = {"file": (filename, payload)}
        response = requests.post(
            url=url,
            headers={"Authorization": f"Bearer {token}" if token else ""},
            files=file_,
            data=data,  # NOTE(alan): Remove after unstructured API is using routing
        )
    finally:
        if opened_file is not None:
            opened_file.close()

    if response.status_code != 200:
        raise ValueError(f"response status code = {response.status_code}")

    pages = response.json()["pages"]
    return [element for page in pages for element in page["elements"]]

unstructured/partition/auto.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from unstructured.partition.email import partition_email
66
from unstructured.partition.html import partition_html
77
from unstructured.partition.pdf import partition_pdf
8+
from unstructured.partition.image import partition_image
89
from unstructured.partition.text import partition_text
910

1011

@@ -34,6 +35,8 @@ def partition(filename: Optional[str] = None, file: Optional[IO] = None):
3435
return partition_html(filename=filename, file=file)
3536
elif filetype == FileType.PDF:
3637
return partition_pdf(filename=filename, file=file, url=None) # type: ignore
38+
elif (filetype == FileType.PNG) or (filetype == FileType.JPG):
39+
return partition_image(filename=filename, file=file, url=None) # type: ignore
3740
elif filetype == FileType.TXT:
3841
return partition_text(filename=filename, file=file)
3942
else:

0 commit comments

Comments
 (0)