Skip to content

Commit b5866aa

Browse files
Encoding param added for response and tests added, defaults to utf-8
1 parent 124c562 commit b5866aa

File tree

8 files changed

+396
-15
lines changed

8 files changed

+396
-15
lines changed

src/unstract/llmwhisperer/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
__version__ = "0.21.0"
1+
__version__ = "0.22.0"
22

33
from .client import LLMWhispererClient # noqa: F401
44

src/unstract/llmwhisperer/client.py

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -58,9 +58,7 @@ class LLMWhispererClient:
5858
client's activities and errors.
5959
"""
6060

61-
formatter = logging.Formatter(
62-
"%(asctime)s - %(name)s - %(levelname)s - %(message)s"
63-
)
61+
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
6462
logger = logging.getLogger(__name__)
6563
log_stream_handler = logging.StreamHandler()
6664
log_stream_handler.setFormatter(formatter)
@@ -117,9 +115,7 @@ def __init__(
117115
self.api_key = os.getenv("LLMWHISPERER_API_KEY", "")
118116
else:
119117
self.api_key = api_key
120-
self.logger.debug(
121-
"api_key set to %s", LLMWhispererUtils.redact_key(self.api_key)
122-
)
118+
self.logger.debug("api_key set to %s", LLMWhispererUtils.redact_key(self.api_key))
123119

124120
self.api_timeout = api_timeout
125121

@@ -169,6 +165,7 @@ def whisper(
169165
ocr_provider: str = "advanced",
170166
line_splitter_tolerance: float = 0.4,
171167
horizontal_stretch_factor: float = 1.0,
168+
encoding: str = "utf-8",
172169
) -> dict:
173170
"""
174171
Sends a request to the LLMWhisperer API to process a document.
@@ -190,6 +187,7 @@ def whisper(
190187
ocr_provider (str, optional): The OCR provider. Can be "advanced" or "basic". Defaults to "advanced".
191188
line_splitter_tolerance (float, optional): The line splitter tolerance. Defaults to 0.4.
192189
horizontal_stretch_factor (float, optional): The horizontal stretch factor. Defaults to 1.0.
190+
encoding (str, optional): The character encoding used to decode the API response text. Defaults to "utf-8".
193191
194192
Returns:
195193
dict: The response from the API as a dictionary.
@@ -238,12 +236,10 @@ def whisper(
238236
should_stream = False
239237
if url == "":
240238
if stream is not None:
241-
242239
should_stream = True
243240

244241
def generate():
245-
for chunk in stream:
246-
yield chunk
242+
yield from stream
247243

248244
req = requests.Request(
249245
"POST",
@@ -268,6 +264,7 @@ def generate():
268264
prepared = req.prepare()
269265
s = requests.Session()
270266
response = s.send(prepared, timeout=self.api_timeout, stream=should_stream)
267+
response.encoding = encoding
271268
if response.status_code != 200 and response.status_code != 202:
272269
message = json.loads(response.text)
273270
message["status_code"] = response.status_code
@@ -318,7 +315,7 @@ def whisper_status(self, whisper_hash: str) -> dict:
318315
message["status_code"] = response.status_code
319316
return message
320317

321-
def whisper_retrieve(self, whisper_hash: str) -> dict:
318+
def whisper_retrieve(self, whisper_hash: str, encoding: str = "utf-8") -> dict:
322319
"""Retrieves the result of the whisper operation from the LLMWhisperer
323320
API.
324321
@@ -329,6 +326,7 @@ def whisper_retrieve(self, whisper_hash: str) -> dict:
329326
330327
Args:
331328
whisper_hash (str): The hash of the whisper operation.
329+
encoding (str, optional): The character encoding used to decode the API response text. Defaults to "utf-8".
332330
333331
Returns:
334332
dict: A dictionary containing the status code and the extracted text from the whisper operation.
@@ -345,6 +343,7 @@ def whisper_retrieve(self, whisper_hash: str) -> dict:
345343
prepared = req.prepare()
346344
s = requests.Session()
347345
response = s.send(prepared, timeout=self.api_timeout)
346+
response.encoding = encoding
348347
if response.status_code != 200:
349348
err = json.loads(response.text)
350349
err["status_code"] = response.status_code

src/unstract/llmwhisperer/client_v2.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,7 @@ def whisper(
169169
use_webhook="",
170170
wait_for_completion=False,
171171
wait_timeout=180,
172+
encoding: str = "utf-8",
172173
) -> dict:
173174
"""
174175
Sends a request to the LLMWhisperer API to process a document.
@@ -196,6 +197,7 @@ def whisper(
196197
use_webhook (str, optional): Webhook name to call. Defaults to "". If not provided, no webhook will be called.
197198
wait_for_completion (bool, optional): Whether to wait for the whisper operation to complete. Defaults to False.
198199
wait_timeout (int, optional): The number of seconds to wait for the whisper operation to complete. Defaults to 180.
200+
encoding (str, optional): The character encoding used to decode the API response text. Defaults to "utf-8".
199201
200202
Returns:
201203
dict: The response from the API as a dictionary.
@@ -276,6 +278,7 @@ def generate():
276278
prepared = req.prepare()
277279
s = requests.Session()
278280
response = s.send(prepared, timeout=120, stream=should_stream)
281+
response.encoding = encoding
279282
if response.status_code != 200 and response.status_code != 202:
280283
message = json.loads(response.text)
281284
message["status_code"] = response.status_code
@@ -380,7 +383,7 @@ def whisper_status(self, whisper_hash: str) -> dict:
380383
message["status_code"] = response.status_code
381384
return message
382385

383-
def whisper_retrieve(self, whisper_hash: str) -> dict:
386+
def whisper_retrieve(self, whisper_hash: str, encoding: str = "utf-8") -> dict:
384387
"""Retrieves the result of the whisper operation from the LLMWhisperer
385388
API.
386389
@@ -391,6 +394,7 @@ def whisper_retrieve(self, whisper_hash: str) -> dict:
391394
392395
Args:
393396
whisper_hash (str): The hash of the whisper operation.
397+
encoding (str, optional): The character encoding used to decode the API response text. Defaults to "utf-8".
394398
395399
Returns:
396400
dict: A dictionary containing the status code and the extracted text from the whisper operation.
@@ -407,6 +411,7 @@ def whisper_retrieve(self, whisper_hash: str) -> dict:
407411
prepared = req.prepare()
408412
s = requests.Session()
409413
response = s.send(prepared, timeout=120)
414+
response.encoding = encoding
410415
if response.status_code != 200:
411416
err = json.loads(response.text)
412417
err["status_code"] = response.status_code
@@ -493,9 +498,8 @@ def get_highlight_rect(
493498
target_width: int,
494499
target_height: int,
495500
) -> tuple[int, int, int, int, int]:
496-
"""
497-
Given the line metadata and the line number, this function returns the bounding box of the line
498-
in the format (page,x1,y1,x2,y2)
501+
"""Given the line metadata and the line number, this function returns
502+
the bounding box of the line in the format (page,x1,y1,x2,y2)
499503
500504
Args:
501505
line_metadata (list[int]): The line metadata returned by the LLMWhisperer API.

tests/integration/client_test.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ def test_get_usage_info(client):
3535
("ocr", "text", "restaurant_invoice_photo.pdf"),
3636
("text", "line-printer", "restaurant_invoice_photo.pdf"),
3737
("text", "text", "handwritten-form.pdf"),
38+
("ocr", "line-printer", "utf_8_chars.pdf"),
3839
],
3940
)
4041
def test_whisper(client, data_dir, processing_mode, output_mode, input_file):

tests/integration/client_v2_test.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ def test_get_usage_info(client_v2):
3838
("text", "low_cost", "credit_card.pdf"),
3939
("text", "high_quality", "restaurant_invoice_photo.pdf"),
4040
("text", "form", "handwritten-form.pdf"),
41+
("layout_preserving", "high_quality", "utf_8_chars.pdf"),
4142
],
4243
)
4344
def test_whisper_v2(client_v2, data_dir, output_mode, mode, input_file):
Lines changed: 188 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,188 @@
1+
2+
3+
4+
TCPDF Example 008
5+
TCPDF by Nicola Asuni - Tecnick.com
6+
www.tcpdf.org
7+
8+
9+
Sentences that contain all letters commonly used in a language
10+
11+
12+
This file is UTF-8 encoded.
13+
14+
15+
Czech (cz)
16+
17+
18+
Příšerně žluťoučký kůň úpěl ďábelské ódy.
19+
Hleď, toť přízračný kůň v mátožné póze šíleně úpí.
20+
Zvlášť zákeřný učeň s ďolíčky běží podél zóny úlů.
21+
Loď čeří kýlem tůň obzvlášť v Grónské úžině.
22+
Ó, náhlý déšť již zvířil prach a čilá laň teď běží s houfcem gazel k úkrytům.
23+
24+
25+
Danish (da)
26+
27+
28+
Quizdeltagerne spiste jordbær med fløde, mens cirkusklovnen
29+
Wolther spillede på xylofon.
30+
(= Quiz contestants were eating strawbery with cream while Wolther
31+
the circus clown played on xylophone.)
32+
33+
34+
German (de)
35+
36+
37+
Falsches Üben von Xylophonmusik quält jeden größeren Zwerg
38+
(= Wrongful practicing of xylophone music tortures every larger dwarf)
39+
40+
41+
Zwölf Boxkämpfer jagten Eva quer über den Sylter Deich
42+
(= Twelve boxing fighters hunted Eva across the dike of Sylt)
43+
44+
45+
Heizölrückstoßabdämpfung
46+
(= fuel oil recoil absorber)
47+
(jqvwxy missing, but all non-ASCII letters in one word)
48+
49+
50+
English (en)
51+
52+
53+
The quick brown fox jumps over the lazy dog
54+
55+
56+
Spanish (es)
57+
58+
59+
El pingüino Wenceslao hizo kilómetros bajo exhaustiva lluvia y
60+
frío, añoraba a su querido cachorro.
61+
(Contains every letter and every accent, but not every combination
62+
63+
64+
page 1 / 3
65+
<<<
66+
67+
68+
69+
TCPDF Example 008
70+
TCPDF by Nicola Asuni - Tecnick.com
71+
www.tcpdf.org
72+
73+
74+
of vowel + acute.)
75+
76+
77+
French (fr)
78+
79+
80+
Portez ce vieux whisky au juge blond qui fume sur son île intérieure, à
81+
côté de l'alcôve ovoïde, où les bûches se consument dans l'âtre, ce
82+
qui lui permet de penser à la cænogenèse de l'être dont il est question
83+
dans la cause ambiguë entendue à Moÿ, dans un capharnaüm qui,
84+
pense-t-il, diminue çà et là la qualité de son œuvre.
85+
86+
87+
l'île exiguë
88+
Où l'obèse jury mûr
89+
Fête l'haï volapük,
90+
Âne ex aéquo au whist,
91+
Ôtez ce vœu déçu.
92+
93+
94+
Le cœur déçu mais l'âme plutôt naïve, Louys rêva de crapaüter en
95+
canoë au delà des îles, près du mälström où brûlent les novæ.
96+
97+
98+
Irish Gaelic (ga)
99+
100+
101+
D'fhuascail Íosa, Úrmhac na hÓighe Beannaithe, pór Éava agus Ádhaimh
102+
103+
104+
Hungarian (hu)
105+
106+
107+
Árvíztűrő tükörfúrógép
108+
(= flood-proof mirror-drilling machine, only all non-ASCII letters)
109+
110+
111+
Icelandic (is)
112+
113+
114+
Kæmi ný öxi hér ykist bjófum nú bæỗi víl og ádrepa
115+
116+
117+
Sævör grét áðan því úlpan var ónýt
118+
(some ASCII letters missing)
119+
120+
121+
Greek (el)
122+
123+
124+
Γαζέες και μυρτιές δέν θά βρώ στό χρυσαφί ξέφωτο
125+
(= No more shall I see acacias or myrtles in the golden clearing)
126+
127+
128+
Ξεσκεπάζω την ψυχοφθόρα βδελυγμία
129+
130+
131+
page 2 / 3
132+
<<<
133+
134+
135+
136+
TCPDF Example 008
137+
TCPDF by Nicola Asuni - Tecnick.com
138+
www.tcpdf.org
139+
140+
141+
(= I uncover the soul-destroying abhorrence)
142+
143+
144+
Hebrew (iw)
145+
146+
147+
הקליטה איך חברה לו מצא ולפתע מאוכזב בים שט סקרן דג ?
148+
149+
150+
Polish (pl)
151+
152+
153+
Pchnąć w tę łódź jeża lub osiem skrzyń fig
154+
(= To push a hedgehog or eight bins of figs in this boat)
155+
156+
157+
Zażółć gęślą jaźń
158+
159+
160+
Russian (ru)
161+
162+
163+
В чащах юга жил бы цитрус? Да, но фальшивый экземпляр!
164+
(= Would a citrus live in the bushes of south? Yes, but only a fake one!)
165+
166+
167+
Thai (th)
168+
169+
170+
[- -]
171+
เป็น มนุษย์ สุดประเสริฐ เลิศ คุณค่า กว่า บรรดา ฝูง สัตว์ เดรัจฉาน
172+
จง ฝ่าฟัน พัฒนา วิชาการ อย่า ล้าง ผลาญ ฤา เข่น ฆ่า บีฑา ใคร
173+
ไม่ ถือ โทษ โกรธ แช่ง ซัด ฮึดฮัด ด่า หัด อภัย เหมือน กีฬา อัชฌาสัย
174+
ปฏิบัติ ประพฤติ กฎ กำหนด ใจ พูดจา ให้ จ๊ะๆ จำๆ น่า ฟัง เอย ฯ
175+
176+
177+
[The copyright for the Thai example is owned by The Computer
178+
Association of Thailand under the Royal Patronage of His Majesty the
179+
King.]
180+
181+
182+
Please let me know if you find others! Special thanks to the people
183+
from all over the world who contributed these sentences.
184+
185+
186+
page 3 / 3
187+
<<<
188+

0 commit comments

Comments
 (0)