Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/unstract/llmwhisperer/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = "0.21.0"
__version__ = "0.22.0"

from .client import LLMWhispererClient # noqa: F401

Expand Down
19 changes: 9 additions & 10 deletions src/unstract/llmwhisperer/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,7 @@ class LLMWhispererClient:
client's activities and errors.
"""

formatter = logging.Formatter(
"%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)
log_stream_handler = logging.StreamHandler()
log_stream_handler.setFormatter(formatter)
Expand Down Expand Up @@ -117,9 +115,7 @@ def __init__(
self.api_key = os.getenv("LLMWHISPERER_API_KEY", "")
else:
self.api_key = api_key
self.logger.debug(
"api_key set to %s", LLMWhispererUtils.redact_key(self.api_key)
)
self.logger.debug("api_key set to %s", LLMWhispererUtils.redact_key(self.api_key))

self.api_timeout = api_timeout

Expand Down Expand Up @@ -169,6 +165,7 @@ def whisper(
ocr_provider: str = "advanced",
line_splitter_tolerance: float = 0.4,
horizontal_stretch_factor: float = 1.0,
encoding: str = "utf-8",
) -> dict:
"""
Sends a request to the LLMWhisperer API to process a document.
Expand All @@ -190,6 +187,7 @@ def whisper(
ocr_provider (str, optional): The OCR provider. Can be "advanced" or "basic". Defaults to "advanced".
line_splitter_tolerance (float, optional): The line splitter tolerance. Defaults to 0.4.
horizontal_stretch_factor (float, optional): The horizontal stretch factor. Defaults to 1.0.
encoding (str, optional): The character encoding used to decode the text of the API response. Defaults to "utf-8".

Returns:
dict: The response from the API as a dictionary.
Expand Down Expand Up @@ -238,12 +236,10 @@ def whisper(
should_stream = False
if url == "":
if stream is not None:

should_stream = True

def generate():
for chunk in stream:
yield chunk
yield from stream

req = requests.Request(
"POST",
Expand All @@ -268,6 +264,7 @@ def generate():
prepared = req.prepare()
s = requests.Session()
response = s.send(prepared, timeout=self.api_timeout, stream=should_stream)
response.encoding = encoding
if response.status_code != 200 and response.status_code != 202:
message = json.loads(response.text)
message["status_code"] = response.status_code
Expand Down Expand Up @@ -318,7 +315,7 @@ def whisper_status(self, whisper_hash: str) -> dict:
message["status_code"] = response.status_code
return message

def whisper_retrieve(self, whisper_hash: str) -> dict:
def whisper_retrieve(self, whisper_hash: str, encoding: str = "utf-8") -> dict:
"""Retrieves the result of the whisper operation from the LLMWhisperer
API.

Expand All @@ -329,6 +326,7 @@ def whisper_retrieve(self, whisper_hash: str) -> dict:

Args:
whisper_hash (str): The hash of the whisper operation.
encoding (str, optional): The character encoding used to decode the text of the API response. Defaults to "utf-8".

Returns:
dict: A dictionary containing the status code and the extracted text from the whisper operation.
Expand All @@ -345,6 +343,7 @@ def whisper_retrieve(self, whisper_hash: str) -> dict:
prepared = req.prepare()
s = requests.Session()
response = s.send(prepared, timeout=self.api_timeout)
response.encoding = encoding
if response.status_code != 200:
err = json.loads(response.text)
err["status_code"] = response.status_code
Expand Down
12 changes: 8 additions & 4 deletions src/unstract/llmwhisperer/client_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,7 @@ def whisper(
use_webhook="",
wait_for_completion=False,
wait_timeout=180,
encoding: str = "utf-8",
) -> dict:
"""
Sends a request to the LLMWhisperer API to process a document.
Expand Down Expand Up @@ -196,6 +197,7 @@ def whisper(
use_webhook (str, optional): Webhook name to call. Defaults to "". If not provided, no webhook will be called.
wait_for_completion (bool, optional): Whether to wait for the whisper operation to complete. Defaults to False.
wait_timeout (int, optional): The number of seconds to wait for the whisper operation to complete. Defaults to 180.
encoding (str, optional): The character encoding used to decode the text of the API response. Defaults to "utf-8".

Returns:
dict: The response from the API as a dictionary.
Expand Down Expand Up @@ -276,6 +278,7 @@ def generate():
prepared = req.prepare()
s = requests.Session()
response = s.send(prepared, timeout=120, stream=should_stream)
response.encoding = encoding
if response.status_code != 200 and response.status_code != 202:
message = json.loads(response.text)
message["status_code"] = response.status_code
Expand Down Expand Up @@ -380,7 +383,7 @@ def whisper_status(self, whisper_hash: str) -> dict:
message["status_code"] = response.status_code
return message

def whisper_retrieve(self, whisper_hash: str) -> dict:
def whisper_retrieve(self, whisper_hash: str, encoding: str = "utf-8") -> dict:
"""Retrieves the result of the whisper operation from the LLMWhisperer
API.

Expand All @@ -391,6 +394,7 @@ def whisper_retrieve(self, whisper_hash: str) -> dict:

Args:
whisper_hash (str): The hash of the whisper operation.
encoding (str, optional): The character encoding used to decode the text of the API response. Defaults to "utf-8".

Returns:
dict: A dictionary containing the status code and the extracted text from the whisper operation.
Expand All @@ -407,6 +411,7 @@ def whisper_retrieve(self, whisper_hash: str) -> dict:
prepared = req.prepare()
s = requests.Session()
response = s.send(prepared, timeout=120)
response.encoding = encoding
if response.status_code != 200:
err = json.loads(response.text)
err["status_code"] = response.status_code
Expand Down Expand Up @@ -493,9 +498,8 @@ def get_highlight_rect(
target_width: int,
target_height: int,
) -> tuple[int, int, int, int, int]:
"""
Given the line metadata and the line number, this function returns the bounding box of the line
in the format (page,x1,y1,x2,y2)
"""Given the line metadata and the line number, this function returns
the bounding box of the line in the format (page,x1,y1,x2,y2)

Args:
line_metadata (list[int]): The line metadata returned by the LLMWhisperer API.
Expand Down
1 change: 1 addition & 0 deletions tests/integration/client_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ def test_get_usage_info(client):
("ocr", "text", "restaurant_invoice_photo.pdf"),
("text", "line-printer", "restaurant_invoice_photo.pdf"),
("text", "text", "handwritten-form.pdf"),
("ocr", "line-printer", "utf_8_chars.pdf"),
],
)
def test_whisper(client, data_dir, processing_mode, output_mode, input_file):
Expand Down
1 change: 1 addition & 0 deletions tests/integration/client_v2_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ def test_get_usage_info(client_v2):
("text", "low_cost", "credit_card.pdf"),
("text", "high_quality", "restaurant_invoice_photo.pdf"),
("text", "form", "handwritten-form.pdf"),
("layout_preserving", "high_quality", "utf_8_chars.pdf"),
],
)
def test_whisper_v2(client_v2, data_dir, output_mode, mode, input_file):
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@



TCPDF Example 008
TCPDF by Nicola Asuni - Tecnick.com
www.tcpdf.org


Sentences that contain all letters commonly used in a language


This file is UTF-8 encoded.


Czech (cz)


Příšerně žluťoučký kůň úpěl ďábelské ódy.
Hleď, toť přízračný kůň v mátožné póze šíleně úpí.
Zvlášť zákeřný učeň s ďolíčky běží podél zóny úlů.
Loď čeří kýlem tůň obzvlášť v Grónské úžině.
Ó, náhlý déšť již zvířil prach a čilá laň teď běží s houfcem gazel k úkrytům.


Danish (da)


Quizdeltagerne spiste jordbær med fløde, mens cirkusklovnen
Wolther spillede på xylofon.
(= Quiz contestants were eating strawbery with cream while Wolther
the circus clown played on xylophone.)


German (de)


Falsches Üben von Xylophonmusik quält jeden größeren Zwerg
(= Wrongful practicing of xylophone music tortures every larger dwarf)


Zwölf Boxkämpfer jagten Eva quer über den Sylter Deich
(= Twelve boxing fighters hunted Eva across the dike of Sylt)


Heizölrückstoßabdämpfung
(= fuel oil recoil absorber)
(jqvwxy missing, but all non-ASCII letters in one word)


English (en)


The quick brown fox jumps over the lazy dog


Spanish (es)


El pingüino Wenceslao hizo kilómetros bajo exhaustiva lluvia y
frío, añoraba a su querido cachorro.
(Contains every letter and every accent, but not every combination


page 1 / 3
<<<



TCPDF Example 008
TCPDF by Nicola Asuni - Tecnick.com
www.tcpdf.org


of vowel + acute.)


French (fr)


Portez ce vieux whisky au juge blond qui fume sur son île intérieure, à
côté de l'alcôve ovoïde, où les bûches se consument dans l'âtre, ce
qui lui permet de penser à la cænogenèse de l'être dont il est question
dans la cause ambiguë entendue à Moÿ, dans un capharnaüm qui,
pense-t-il, diminue çà et là la qualité de son œuvre.


l'île exiguë
Où l'obèse jury mûr
Fête l'haï volapük,
Âne ex aéquo au whist,
Ôtez ce vœu déçu.


Le cœur déçu mais l'âme plutôt naïve, Louys rêva de crapaüter en
canoë au delà des îles, près du mälström où brûlent les novæ.


Irish Gaelic (ga)


D'fhuascail Íosa, Úrmhac na hÓighe Beannaithe, pór Éava agus Ádhaimh


Hungarian (hu)


Árvíztűrő tükörfúrógép
(= flood-proof mirror-drilling machine, only all non-ASCII letters)


Icelandic (is)


Kæmi ný öxi hér ykist bjófum nú bæỗi víl og ádrepa


Sævör grét áðan því úlpan var ónýt
(some ASCII letters missing)


Greek (el)


Γαζέες και μυρτιές δέν θά βρώ στό χρυσαφί ξέφωτο
(= No more shall I see acacias or myrtles in the golden clearing)


Ξεσκεπάζω την ψυχοφθόρα βδελυγμία


page 2 / 3
<<<



TCPDF Example 008
TCPDF by Nicola Asuni - Tecnick.com
www.tcpdf.org


(= I uncover the soul-destroying abhorrence)


Hebrew (iw)


הקליטה איך חברה לו מצא ולפתע מאוכזב בים שט סקרן דג ?


Polish (pl)


Pchnąć w tę łódź jeża lub osiem skrzyń fig
(= To push a hedgehog or eight bins of figs in this boat)


Zażółć gęślą jaźń


Russian (ru)


В чащах юга жил бы цитрус? Да, но фальшивый экземпляр!
(= Would a citrus live in the bushes of south? Yes, but only a fake one!)


Thai (th)


[- -]
เป็น มนุษย์ สุดประเสริฐ เลิศ คุณค่า กว่า บรรดา ฝูง สัตว์ เดรัจฉาน
จง ฝ่าฟัน พัฒนา วิชาการ อย่า ล้าง ผลาญ ฤา เข่น ฆ่า บีฑา ใคร
ไม่ ถือ โทษ โกรธ แช่ง ซัด ฮึดฮัด ด่า หัด อภัย เหมือน กีฬา อัชฌาสัย
ปฏิบัติ ประพฤติ กฎ กำหนด ใจ พูดจา ให้ จ๊ะๆ จำๆ น่า ฟัง เอย ฯ


[The copyright for the Thai example is owned by The Computer
Association of Thailand under the Royal Patronage of His Majesty the
King.]


Please let me know if you find others! Special thanks to the people
from all over the world who contributed these sentences.


page 3 / 3
<<<

Loading
Loading