Skip to content

Commit 889af42

Browse files
committed
Added URL-in-POST logic and a test that uses a URL
1 parent 124c562 commit 889af42

File tree

4 files changed

+382
-288
lines changed

4 files changed

+382
-288
lines changed

src/unstract/llmwhisperer/client_v2.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -151,13 +151,13 @@ def whisper(
151151
file_path: str = "",
152152
stream: IO[bytes] = None,
153153
url: str = "",
154-
mode: str = "high_quality",
154+
mode: str = "form",
155155
output_mode: str = "layout_preserving",
156156
page_seperator: str = "<<<",
157157
pages_to_extract: str = "",
158158
median_filter_size: int = 0,
159159
gaussian_blur_radius: int = 0,
160-
line_splitter_tolerance: float = 0.75,
160+
line_splitter_tolerance: float = 0.4,
161161
horizontal_stretch_factor: float = 1.0,
162162
mark_vertical_lines: bool = False,
163163
mark_horizontal_lines: bool = False,
@@ -178,7 +178,7 @@ def whisper(
178178
file_path (str, optional): The path to the file to be processed. Defaults to "".
179179
stream (IO[bytes], optional): A stream of bytes to be processed. Defaults to None.
180180
url (str, optional): The URL of the file to be processed. Defaults to "".
181-
mode (str, optional): The processing mode. Can be "high_quality", "form", "low_cost" or "native_text". Defaults to "high_quality".
181+
mode (str, optional): The processing mode. Can be "high_quality", "form", "low_cost" or "native_text". Defaults to "form".
182182
output_mode (str, optional): The output mode. Can be "layout_preserving" or "text". Defaults to "layout_preserving".
183183
page_seperator (str, optional): The page separator. Defaults to "<<<".
184184
pages_to_extract (str, optional): The pages to extract. Defaults to "".
@@ -207,7 +207,6 @@ def whisper(
207207
self.logger.debug("whisper called")
208208
api_url = f"{self.base_url}/whisper"
209209
params = {
210-
"url": url,
211210
"mode": mode,
212211
"output_mode": output_mode,
213212
"page_seperator": page_seperator,
@@ -272,7 +271,8 @@ def generate():
272271
data=data,
273272
)
274273
else:
275-
req = requests.Request("POST", api_url, params=params, headers=self.headers)
274+
params["url_in_post"] = True
275+
req = requests.Request("POST", api_url, params=params, headers=self.headers, data=url)
276276
prepared = req.prepare()
277277
s = requests.Session()
278278
response = s.send(prepared, timeout=120, stream=should_stream)
@@ -340,7 +340,7 @@ def generate():
340340
return message
341341

342342
# Will not reach here if status code is 202
343-
message = response.text
343+
message = json.loads(response.text)
344344
message["status_code"] = response.status_code
345345
return message
346346

tests/conftest.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,13 @@
11
import os
22

33
import pytest
4+
from dotenv import load_dotenv
45

56
from unstract.llmwhisperer.client import LLMWhispererClient
67
from unstract.llmwhisperer.client_v2 import LLMWhispererClientV2
78

9+
load_dotenv()
10+
811

912
@pytest.fixture(name="client")
1013
def llm_whisperer_client():

tests/integration/client_v2_test.py

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,3 +69,72 @@ def test_whisper_v2(client_v2, data_dir, output_mode, mode, input_file):
6969
unified_diff(exp.splitlines(), extracted_text.splitlines(), fromfile="Expected", tofile="Extracted")
7070
)
7171
pytest.fail(f"Texts are not similar enough: {similarity * 100:.2f}% similarity. Diff:\n{diff}")
72+
73+
74+
@pytest.mark.parametrize(
    "output_mode, mode, url, input_file, page_count",
    [
        (
            "layout_preserving",
            "native_text",
            "https://unstractpocstorage.blob.core.windows.net/public/Amex.pdf",
            "credit_card.pdf",
            7,
        ),
        (
            "layout_preserving",
            "low_cost",
            "https://unstractpocstorage.blob.core.windows.net/public/Amex.pdf",
            "credit_card.pdf",
            7,
        ),
        (
            "layout_preserving",
            "high_quality",
            "https://unstractpocstorage.blob.core.windows.net/public/scanned_bill.pdf",
            "restaurant_invoice_photo.pdf",
            1,
        ),
        (
            "layout_preserving",
            "form",
            "https://unstractpocstorage.blob.core.windows.net/public/scanned_form.pdf",
            "handwritten-form.pdf",
            1,
        ),
    ],
)
def test_whisper_v2_url_in_post(client_v2, data_dir, output_mode, mode, url, input_file, page_count):
    """Run a whisper extraction on a remote ``url`` and check text and usage.

    The extraction is requested with ``wait_for_completion=True`` and its
    ``result_text`` is compared against the expected file
    ``<data_dir>/expected/<input_file stem>.<mode>.<output_mode>.txt``.
    Usage info is sampled before and after the call and handed to
    ``verify_usage`` together with ``page_count``.

    Args:
        client_v2: Client fixture exposing ``whisper`` and ``get_usage_info``.
        data_dir: Directory that contains the ``expected`` sub-directory.
        output_mode: Output mode forwarded to ``whisper``.
        mode: Processing mode forwarded to ``whisper``.
        url: URL of the document to process.
        input_file: Local file name whose stem names the expected-text file.
        page_count: Page count passed to ``verify_usage`` as the expected
            usage increase.
    """
    usage_before = client_v2.get_usage_info()
    whisper_result = client_v2.whisper(
        mode=mode, output_mode=output_mode, url=url, wait_for_completion=True
    )
    # The original message opened a quote before {input_file} and never closed it.
    logger.debug(f"Result for '{output_mode}', '{mode}', '{input_file}': {whisper_result}")

    exp_basename = f"{Path(input_file).stem}.{mode}.{output_mode}.txt"
    exp_file = os.path.join(data_dir, "expected", exp_basename)
    with open(exp_file, encoding="utf-8") as f:
        exp = f.read()

    assert isinstance(whisper_result, dict)
    assert whisper_result["status_code"] == 200

    extracted_text = whisper_result["extraction"]["result_text"]
    if mode == "native_text" and output_mode == "text":
        # For text based processing, perform a strict match
        assert extracted_text == exp
    else:
        # For OCR based processing, perform a fuzzy match
        similarity = SequenceMatcher(None, extracted_text, exp).ratio()
        threshold = 0.97

        if similarity < threshold:
            diff = "\n".join(
                unified_diff(exp.splitlines(), extracted_text.splitlines(), fromfile="Expected", tofile="Extracted")
            )
            pytest.fail(f"Texts are not similar enough: {similarity * 100:.2f}% similarity. Diff:\n{diff}")

    usage_after = client_v2.get_usage_info()
    # Verify usage after extraction
    verify_usage(usage_before, usage_after, page_count, mode)
122+
123+
124+
def verify_usage(before_extract, after_extract, page_count, mode='form'):
    """Assert that usage counters grew by ``page_count`` for ``mode`` only.

    Both mappings are indexed with the keys ``today_page_count``,
    ``current_page_count``, ``overage_page_count`` and
    ``current_page_count_<mode>`` for each of the four known modes.
    (Presumably these are the results of ``get_usage_info()`` taken before
    and after an extraction; confirm against the caller.)

    Checks performed:
      * ``today_page_count`` increased by exactly ``page_count``.
      * ``current_page_count`` increased by ``page_count``, unless the
        "after" value is ``-1``, in which case the check is skipped.
      * ``overage_page_count`` increased by ``page_count``, only when the
        "after" value is positive.
      * The per-mode counter for ``mode`` increased by ``page_count``.
      * The per-mode counters of every other mode are unchanged.

    Args:
        before_extract: Usage mapping sampled before the extraction.
        after_extract: Usage mapping sampled after the extraction.
        page_count: Expected increase in pages.
        mode: One of 'form', 'high_quality', 'low_cost', 'native_text'.

    Raises:
        ValueError: If ``mode`` is not one of the known modes.
        AssertionError: If any counter does not match the expectation.
    """
    all_modes = ('form', 'high_quality', 'low_cost', 'native_text')
    if mode not in all_modes:
        # Same exception type that list.remove() raised, but with a clear message.
        raise ValueError(f"unknown mode {mode!r}; expected one of {all_modes}")

    assert (after_extract['today_page_count'] == before_extract['today_page_count'] + page_count), \
        "today_page_count calculation is wrong"
    if after_extract['current_page_count'] != -1:
        assert (after_extract['current_page_count'] == before_extract['current_page_count'] + page_count), \
            "current_page_count calculation is wrong"
    if after_extract['overage_page_count'] > 0:
        assert (after_extract['overage_page_count'] == before_extract['overage_page_count'] + page_count), \
            "overage_page_count calculation is wrong"

    mode_key = f'current_page_count_{mode}'
    assert (after_extract[mode_key] == before_extract[mode_key] + page_count), \
        f"{mode} mode calculation is wrong"

    # Counters of the modes that were not used must stay untouched.
    for other_mode in all_modes:
        if other_mode == mode:
            continue
        other_key = f'current_page_count_{other_mode}'
        assert (after_extract[other_key] == before_extract[other_key]), \
            f"{other_mode} mode calculation is wrong"

0 commit comments

Comments
 (0)