Skip to content

Commit b8c4d94

Browse files
committed
100 percent coverage for mediadescriber.py
1 parent 2a6e604 commit b8c4d94

File tree

5 files changed

+149
-33
lines changed

5 files changed

+149
-33
lines changed

app/backend/prepdocslib/mediadescriber.py

Lines changed: 3 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -13,7 +13,7 @@
1313
class MediaDescriber(ABC):
1414

1515
async def describe_image(self, image_bytes) -> str:
16-
raise NotImplementedError
16+
raise NotImplementedError # pragma: no cover
1717

1818

1919
class ContentUnderstandingDescriber:
@@ -75,16 +75,15 @@ async def create_analyzer(self):
7575
return
7676
elif response.status != 201:
7777
data = await response.text()
78-
logger.error("Error creating analyzer: %s", data)
79-
response.raise_for_status()
78+
raise Exception("Error creating analyzer", data)
8079
else:
8180
poll_url = response.headers.get("Operation-Location")
8281

8382
with Progress() as progress:
8483
progress.add_task("Creating analyzer...", total=None, start=False)
8584
await self.poll_api(session, poll_url, headers)
8685

87-
async def describe_image(self, image_bytes) -> str:
86+
async def describe_image(self, image_bytes: bytes) -> str:
8887
logger.info("Sending image to Azure Content Understanding service...")
8988
async with aiohttp.ClientSession() as session:
9089
token = await self.credential.get_token("https://cognitiveservices.azure.com/.default")

app/backend/prepdocslib/pdfparser.py

Lines changed: 0 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,5 @@
11
import html
22
import io
3-
import json
43
import logging
54
from enum import Enum
65
from typing import IO, AsyncGenerator, Union
@@ -94,7 +93,6 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
9493
form_recognizer_results: AnalyzeResult = await poller.result()
9594

9695
offset = 0
97-
pages_json = []
9896
for page in form_recognizer_results.pages:
9997
tables_on_page = [
10098
table
@@ -162,16 +160,7 @@ class ObjectType(Enum):
162160
# We remove excess newlines at the beginning and end of the page
163161
page_text = page_text.strip()
164162
yield Page(page_num=page.page_number - 1, offset=offset, text=page_text)
165-
# Serialize the page text to a JSON and save it locally
166-
page_json = {
167-
"page_num": page.page_number - 1,
168-
"offset": offset,
169-
"text": page_text,
170-
}
171-
pages_json.append(page_json)
172163
offset += len(page_text)
173-
with open("pages.json", "w") as f:
174-
json.dump(pages_json, f)
175164

176165
@staticmethod
177166
async def figure_to_html(

tests/conftest.py

Lines changed: 0 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -34,7 +34,6 @@
3434
MockBlobClient,
3535
MockResponse,
3636
mock_computervision_response,
37-
mock_contentunderstanding_response,
3837
mock_speak_text_cancelled,
3938
mock_speak_text_failed,
4039
mock_speak_text_success,
@@ -59,8 +58,6 @@ def mock_azurehttp_calls(monkeypatch):
5958
def mock_post(*args, **kwargs):
6059
if kwargs.get("url").endswith("computervision/retrieval:vectorizeText"):
6160
return mock_computervision_response()
62-
elif kwargs.get("url").endswith("/contentunderstanding/analyzers/image_analyzer:analyze"):
63-
return mock_contentunderstanding_response()
6461
else:
6562
raise Exception("Unexpected URL for mock call to ClientSession.post()")
6663

tests/mocks.py

Lines changed: 11 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -151,21 +151,26 @@ def by_page(self):
151151

152152

153153
class MockResponse:
154-
def __init__(self, text, status):
155-
self.text = text
154+
def __init__(self, status, text=None, headers=None):
155+
self._text = text or ""
156156
self.status = status
157-
158-
async def text(self):
159-
return self._text
157+
self.headers = headers or {}
160158

161159
async def __aexit__(self, exc_type, exc, tb):
162160
pass
163161

164162
async def __aenter__(self):
165163
return self
166164

165+
async def text(self):
166+
return self._text
167+
167168
async def json(self):
168-
return json.loads(self.text)
169+
return json.loads(self._text)
170+
171+
def raise_for_status(self):
172+
if self.status != 200:
173+
raise Exception(f"HTTP status {self.status}")
169174

170175

171176
class MockEmbeddingsClient:
@@ -203,15 +208,6 @@ def mock_computervision_response():
203208
)
204209

205210

206-
def mock_contentunderstanding_response():
207-
return MockResponse(
208-
status=200,
209-
headers={
210-
"Operation-Location": "https://cu-ztmfrxlgtk3nq.cognitiveservices.azure.com/contentunderstanding/analyzers/image_analyzer/results/53e4c016-d2c0-48a9-a9f4-38891f7d45f0?api-version=2024-12-01-preview"
211-
},
212-
)
213-
214-
215211
class MockAudio:
216212
def __init__(self, audio_data):
217213
self.audio_data = audio_data

tests/test_mediadescriber.py

Lines changed: 135 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,135 @@
1+
import json
2+
import logging
3+
4+
import aiohttp
5+
import pytest
6+
7+
from prepdocslib.mediadescriber import ContentUnderstandingDescriber
8+
9+
from .mocks import MockAzureCredential, MockResponse
10+
11+
12+
@pytest.mark.asyncio
13+
async def test_contentunderstanding_analyze(monkeypatch, caplog):
14+
15+
def mock_post(*args, **kwargs):
16+
if kwargs.get("url").find("badanalyzer") > 0:
17+
return MockResponse(
18+
status=200,
19+
headers={
20+
"Operation-Location": "https://testcontentunderstanding.cognitiveservices.azure.com/contentunderstanding/analyzers/badanalyzer/operations/7f313e00-4da1-4b19-a25e-53f121c24d10?api-version=2024-12-01-preview"
21+
},
22+
)
23+
if kwargs.get("url").endswith("contentunderstanding/analyzers/image_analyzer:analyze"):
24+
return MockResponse(
25+
status=200,
26+
headers={
27+
"Operation-Location": "https://testcontentunderstanding.cognitiveservices.azure.com/contentunderstanding/analyzers/image_analyzer/results/53e4c016-d2c0-48a9-a9f4-38891f7d45f0?api-version=2024-12-01-preview"
28+
},
29+
)
30+
else:
31+
raise Exception("Unexpected URL for mock call to ClientSession.post()")
32+
33+
monkeypatch.setattr(aiohttp.ClientSession, "post", mock_post)
34+
35+
num_poll_calls = 0
36+
37+
def mock_get(self, url, **kwargs):
38+
if url.endswith(
39+
"contentunderstanding/analyzers/image_analyzer/results/53e4c016-d2c0-48a9-a9f4-38891f7d45f0?api-version=2024-12-01-preview"
40+
):
41+
return MockResponse(
42+
status=200,
43+
text=json.dumps(
44+
{
45+
"id": "f8c4c1c0-71c3-410c-a723-d223e0a84a88",
46+
"status": "Succeeded",
47+
"result": {
48+
"analyzerId": "image_analyzer",
49+
"apiVersion": "2024-12-01-preview",
50+
"createdAt": "2024-12-05T17:33:04Z",
51+
"warnings": [],
52+
"contents": [
53+
{
54+
"markdown": "![image](image)\n",
55+
"fields": {
56+
"Description": {
57+
"type": "string",
58+
"valueString": "The bar chart titled 'Prices (2024 Indexed to 100)' compares the indexed prices of Oil, Bitcoin, and S&P 500 from 2024 to 2028. Each year is represented by a set of three horizontal bars, with Oil in gray, Bitcoin in orange, and S&P 500 in blue. The index is based on the year 2024, where all values start at 100. Over the years, Bitcoin shows the most significant increase, reaching around 130 by 2028, while Oil and S&P 500 show moderate increases.\n\n<table><thead><tr><td>Year</td><td>Oil</td><td>Bitcoin</td><td>S&P 500</td></tr></thead><tbody><tr><td>2024</td><td>100</td><td>100</td><td>100</td></tr><tr><td>2025</td><td>105</td><td>110</td><td>1 08</td></tr><tr><td>2026</td><td>110</td><td>115</td><td>112</td></tr><tr><td>2027</td><td>115</td><td>120</td><td>116</td></tr><tr> <td>2028</td><td>120</td><td>130</td><td>120</td></tr></tbody></table>",
59+
}
60+
},
61+
"kind": "document",
62+
"startPageNumber": 1,
63+
"endPageNumber": 1,
64+
"unit": "pixel",
65+
"pages": [{"pageNumber": 1}],
66+
}
67+
],
68+
},
69+
}
70+
),
71+
)
72+
elif url.endswith(
73+
"https://testcontentunderstanding.cognitiveservices.azure.com/contentunderstanding/analyzers/badanalyzer/operations/7f313e00-4da1-4b19-a25e-53f121c24d10?api-version=2024-12-01-preview"
74+
):
75+
return MockResponse(status=200, text=json.dumps({"status": "Failed"}))
76+
elif url.endswith(
77+
"https://testcontentunderstanding.cognitiveservices.azure.com/contentunderstanding/analyzers/image_analyzer/operations/7f313e00-4da1-4b19-a25e-53f121c24d10?api-version=2024-12-01-preview"
78+
):
79+
nonlocal num_poll_calls
80+
num_poll_calls += 1
81+
if num_poll_calls == 1:
82+
return MockResponse(status=200, text=json.dumps({"status": "Running"}))
83+
elif num_poll_calls > 1:
84+
return MockResponse(status=200, text=json.dumps({"status": "Succeeded"}))
85+
else:
86+
raise Exception("Unexpected URL for mock call to ClientSession.get()")
87+
88+
monkeypatch.setattr(aiohttp.ClientSession, "get", mock_get)
89+
90+
def mock_put(self, *args, **kwargs):
91+
if kwargs.get("url").find("existinganalyzer") > 0:
92+
return MockResponse(status=409)
93+
if kwargs.get("url").find("wrongservicename") > 0:
94+
return MockResponse(
95+
status=404,
96+
text=json.dumps(
97+
{"error": {"code": "ResourceNotFound", "message": "The specified resource does not exist."}}
98+
),
99+
)
100+
elif kwargs.get("url").endswith("contentunderstanding/analyzers/image_analyzer"):
101+
return MockResponse(
102+
status=201,
103+
headers={
104+
"Operation-Location": "https://testcontentunderstanding.cognitiveservices.azure.com/contentunderstanding/analyzers/image_analyzer/operations/7f313e00-4da1-4b19-a25e-53f121c24d10?api-version=2024-12-01-preview"
105+
},
106+
)
107+
else:
108+
raise Exception("Unexpected URL for mock call to ClientSession.put()")
109+
110+
monkeypatch.setattr(aiohttp.ClientSession, "put", mock_put)
111+
112+
describer = ContentUnderstandingDescriber(
113+
endpoint="https://testcontentunderstanding.cognitiveservices.azure.com", credential=MockAzureCredential()
114+
)
115+
await describer.create_analyzer()
116+
await describer.describe_image(b"imagebytes")
117+
118+
describer_wrong_endpoint = ContentUnderstandingDescriber(
119+
endpoint="https://wrongservicename.cognitiveservices.azure.com", credential=MockAzureCredential()
120+
)
121+
with pytest.raises(Exception):
122+
await describer_wrong_endpoint.create_analyzer()
123+
124+
describer_existing_analyzer = ContentUnderstandingDescriber(
125+
endpoint="https://existinganalyzer.cognitiveservices.azure.com", credential=MockAzureCredential()
126+
)
127+
with caplog.at_level(logging.INFO):
128+
await describer_existing_analyzer.create_analyzer()
129+
assert "Analyzer 'image_analyzer' already exists." in caplog.text
130+
131+
describer_bad_analyze = ContentUnderstandingDescriber(
132+
endpoint="https://badanalyzer.cognitiveservices.azure.com", credential=MockAzureCredential()
133+
)
134+
with pytest.raises(Exception):
135+
await describer_bad_analyze.describe_image(b"imagebytes")

0 commit comments

Comments
 (0)