diff --git a/src/unstract/llmwhisperer/__init__.py b/src/unstract/llmwhisperer/__init__.py index b37c8cc..fa12804 100644 --- a/src/unstract/llmwhisperer/__init__.py +++ b/src/unstract/llmwhisperer/__init__.py @@ -1,4 +1,4 @@ -__version__ = "2.1.0" +__version__ = "2.2.0" from .client import LLMWhispererClient # noqa: F401 from .client_v2 import LLMWhispererClientV2 # noqa: F401 diff --git a/src/unstract/llmwhisperer/client_v2.py b/src/unstract/llmwhisperer/client_v2.py index 68acfc3..3592e5d 100644 --- a/src/unstract/llmwhisperer/client_v2.py +++ b/src/unstract/llmwhisperer/client_v2.py @@ -155,6 +155,44 @@ def get_usage_info(self) -> dict: raise LLMWhispererClientException(err) return json.loads(response.text) + def get_highlight_data(self, whisper_hash: str, lines: str, extract_all_lines: bool = False) -> dict: + """Retrieves the highlight information of the LLMWhisperer API. + + This method sends a GET request to the '/highlights' endpoint of the LLMWhisperer API. + The response is a JSON object containing the highlight information. + Refer to https://docs.unstract.com/llm_whisperer/apis/llm_whisperer_usage_api + + Args: + whisper_hash (str): The hash of the whisper operation. + lines (str): Defines which lines' metadata to retrieve. + You can specify which lines' metadata to retrieve with this parameter. + Example: "1-5,7,21-" will retrieve metadata for lines 1,2,3,4,5,7,21,22,23,24... + through the last line's metadata. + Returns: + dict: A dictionary containing the highlight information. + + Raises: + LLMWhispererClientException: If the API request fails, it raises an exception with + the error message and status code returned by the API. 
+ """ + self.logger.debug("highlight called") + url = f"{self.base_url}/highlights" + params = { + "whisper_hash": whisper_hash, + "lines": lines, + "extract_all_lines": extract_all_lines, + } + self.logger.debug("url: %s", url) + req = requests.Request("GET", url, headers=self.headers, params=params) + prepared = req.prepare() + s = requests.Session() + response = s.send(prepared, timeout=self.api_timeout) + if response.status_code != 200: + err = json.loads(response.text) + err["status_code"] = response.status_code + raise LLMWhispererClientException(err) + return json.loads(response.text) + def whisper( self, file_path: str = "", @@ -171,6 +209,7 @@ def whisper( mark_vertical_lines: bool = False, mark_horizontal_lines: bool = False, line_spitter_strategy: str = "left-priority", + add_line_nos: bool = False, lang="eng", tag="default", filename="", @@ -201,6 +240,8 @@ def whisper( mark_vertical_lines (bool, optional): Whether to mark vertical lines. Defaults to False. mark_horizontal_lines (bool, optional): Whether to mark horizontal lines. Defaults to False. line_spitter_strategy (str, optional): The line splitter strategy. Defaults to "left-priority". + add_line_nos (bool, optional): Adds line numbers to the extracted text and saves line metadata, + which can be queried later using the highlights API. Defaults to False. lang (str, optional): The language of the document. Defaults to "eng". tag (str, optional): The tag for the document. Defaults to "default". filename (str, optional): The name of the file to store in reports. Defaults to "". 
@@ -235,6 +276,7 @@ def whisper( "mark_vertical_lines": mark_vertical_lines, "mark_horizontal_lines": mark_horizontal_lines, "line_spitter_strategy": line_spitter_strategy, + "add_line_nos": add_line_nos, "lang": lang, "tag": tag, "filename": filename, diff --git a/tests/integration/client_v2_test.py b/tests/integration/client_v2_test.py index f56ba3f..ed532ce 100644 --- a/tests/integration/client_v2_test.py +++ b/tests/integration/client_v2_test.py @@ -78,6 +78,49 @@ def test_whisper_v2_error(client_v2, data_dir, output_mode, mode, input_file): assert_error_message(whisper_result) +@pytest.mark.parametrize( + "input_file", + [ + ("credit_card.pdf"), + ], +) +def test_highlight(client_v2, data_dir, input_file): + file_path = os.path.join(data_dir, input_file) + + whisper_result = client_v2.whisper( + add_line_nos=True, + file_path=file_path, + wait_for_completion=True, + ) + whisper_hash = whisper_result["whisper_hash"] + highlight_data = client_v2.get_highlight_data(whisper_hash=whisper_hash, lines="1-2") + + # Assert the structure and content of highlight_data + assert isinstance(highlight_data, dict) + assert len(highlight_data) == 2 + assert "1" in highlight_data + assert "2" in highlight_data + + # Assert line 1 data + line1 = highlight_data["1"] + assert line1["base_y"] == 0 + assert line1["base_y_percent"] == 0 + assert line1["height"] == 0 + assert line1["height_percent"] == 0 + assert line1["page"] == 0 + assert line1["page_height"] == 0 + assert line1["raw"] == [0, 0, 0, 0] + + # Assert line 2 data + line2 = highlight_data["2"] + assert line2["base_y"] == 155 + assert line2["base_y_percent"] == pytest.approx(4.8927) # Using approx for float comparison + assert line2["height"] == 51 + assert line2["height_percent"] == pytest.approx(1.6098) # Using approx for float comparison + assert line2["page"] == 0 + assert line2["page_height"] == 3168 + + @pytest.mark.parametrize( "output_mode, mode, url, input_file, page_count", [