Send the URL in the POST body (url_in_post) and add an integration test that whispers by URL

nagesh-zip · nagesh-zip · commit 889af42f5224 · 2024-10-28T18:04:11.000+05:30
diff --git a/src/unstract/llmwhisperer/client_v2.py b/src/unstract/llmwhisperer/client_v2.py
@@ -151,13 +151,13 @@ def whisper(
         file_path: str = "",
         stream: IO[bytes] = None,
         url: str = "",
-        mode: str = "high_quality",
+        mode: str = "form",
         output_mode: str = "layout_preserving",
         page_seperator: str = "<<<",
         pages_to_extract: str = "",
         median_filter_size: int = 0,
         gaussian_blur_radius: int = 0,
-        line_splitter_tolerance: float = 0.75,
+        line_splitter_tolerance: float = 0.4,
         horizontal_stretch_factor: float = 1.0,
         mark_vertical_lines: bool = False,
         mark_horizontal_lines: bool = False,
@@ -178,7 +178,7 @@ def whisper(
             file_path (str, optional): The path to the file to be processed. Defaults to "".
             stream (IO[bytes], optional): A stream of bytes to be processed. Defaults to None.
             url (str, optional): The URL of the file to be processed. Defaults to "".
-            mode (str, optional): The processing mode. Can be "high_quality", "form", "low_cost" or "native_text". Defaults to "high_quality".
+            mode (str, optional): The processing mode. Can be "high_quality", "form", "low_cost" or "native_text". Defaults to "form".
             output_mode (str, optional): The output mode. Can be "layout_preserving" or "text". Defaults to "layout_preserving".
             page_seperator (str, optional): The page separator. Defaults to "<<<".
             pages_to_extract (str, optional): The pages to extract. Defaults to "".
@@ -207,7 +207,6 @@ def whisper(
         self.logger.debug("whisper called")
         api_url = f"{self.base_url}/whisper"
         params = {
-            "url": url,
             "mode": mode,
             "output_mode": output_mode,
             "page_seperator": page_seperator,
@@ -272,7 +271,8 @@ def generate():
                     data=data,
                 )
         else:
-            req = requests.Request("POST", api_url, params=params, headers=self.headers)
+            params["url_in_post"] = True
+            req = requests.Request("POST", api_url, params=params, headers=self.headers, data=url)
         prepared = req.prepare()
         s = requests.Session()
         response = s.send(prepared, timeout=120, stream=should_stream)
@@ -340,7 +340,7 @@ def generate():
             return message
 
         # Will not reach here if status code is 202
-        message = response.text
+        message = json.loads(response.text)
         message["status_code"] = response.status_code
         return message
 
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -1,10 +1,13 @@
 import os
 
 import pytest
+from dotenv import load_dotenv
 
 from unstract.llmwhisperer.client import LLMWhispererClient
 from unstract.llmwhisperer.client_v2 import LLMWhispererClientV2
 
+load_dotenv()
+
 
 @pytest.fixture(name="client")
 def llm_whisperer_client():
diff --git a/tests/integration/client_v2_test.py b/tests/integration/client_v2_test.py
@@ -69,3 +69,72 @@ def test_whisper_v2(client_v2, data_dir, output_mode, mode, input_file):
                 unified_diff(exp.splitlines(), extracted_text.splitlines(), fromfile="Expected", tofile="Extracted")
             )
             pytest.fail(f"Texts are not similar enough: {similarity * 100:.2f}% similarity. Diff:\n{diff}")
+
+
+@pytest.mark.parametrize(
+    "output_mode, mode, url, input_file, page_count",  # input_file only names the expected-output file; the document is fetched from url
+    [
+        ("layout_preserving", "native_text", "https://unstractpocstorage.blob.core.windows.net/public/Amex.pdf",
+         "credit_card.pdf", 7),
+        ("layout_preserving", "low_cost", "https://unstractpocstorage.blob.core.windows.net/public/Amex.pdf",
+         "credit_card.pdf", 7),
+        (
+                "layout_preserving", "high_quality",
+                "https://unstractpocstorage.blob.core.windows.net/public/scanned_bill.pdf",
+                "restaurant_invoice_photo.pdf", 1),
+        ("layout_preserving", "form", "https://unstractpocstorage.blob.core.windows.net/public/scanned_form.pdf",
+         "handwritten-form.pdf", 1),
+    ]
+)
+def test_whisper_v2_url_in_post(client_v2, data_dir, output_mode, mode, url, input_file, page_count):
+    usage_before = client_v2.get_usage_info()  # snapshot taken before the call so the usage delta can be checked at the end
+    whisper_result = client_v2.whisper(
+        mode=mode, output_mode=output_mode, url=url, wait_for_completion=True
+    )
+    logger.debug(f"Result for '{output_mode}', '{mode}', " f"'{input_file}: {whisper_result}")
+
+    exp_basename = f"{Path(input_file).stem}.{mode}.{output_mode}.txt"  # expected file is named <stem>.<mode>.<output_mode>.txt
+    exp_file = os.path.join(data_dir, "expected", exp_basename)
+    with open(exp_file, encoding="utf-8") as f:
+        exp = f.read()
+
+    assert isinstance(whisper_result, dict)
+    assert whisper_result["status_code"] == 200
+
+    # For text based processing, perform a strict match (no case parametrized above uses output_mode "text")
+    if mode == "native_text" and output_mode == "text":
+        assert whisper_result["extraction"]["result_text"] == exp
+    # For OCR based processing, perform a fuzzy match
+    else:
+        extracted_text = whisper_result["extraction"]["result_text"]
+        similarity = SequenceMatcher(None, extracted_text, exp).ratio()
+        threshold = 0.97  # minimum SequenceMatcher ratio accepted before the test fails
+
+        if similarity < threshold:
+            diff = "\n".join(
+                unified_diff(exp.splitlines(), extracted_text.splitlines(), fromfile="Expected", tofile="Extracted")
+            )
+            pytest.fail(f"Texts are not similar enough: {similarity * 100:.2f}% similarity. Diff:\n{diff}")
+
+    usage_after = client_v2.get_usage_info()
+    # Verify that the usage counters moved by page_count between the two snapshots
+    verify_usage(usage_before, usage_after, page_count, mode)
+
+
+def verify_usage(before_extract, after_extract, page_count, mode='form'):  # Assert the usage snapshot grew by page_count for `mode` and stayed flat for the other modes.
+    all_modes = ['form', 'high_quality', 'low_cost', 'native_text']
+    all_modes.remove(mode)  # what remains are the modes whose counters must not move
+    assert (after_extract['today_page_count'] == before_extract['today_page_count'] + page_count), \
+        "today_page_count calculation is wrong"
+    if after_extract['current_page_count'] != -1:  # -1 skips this check; presumably an "untracked/unlimited" sentinel — TODO confirm
+        assert (after_extract['current_page_count'] == before_extract['current_page_count'] + page_count), \
+            "current_page_count calculation is wrong"
+    if after_extract['overage_page_count'] > 0:  # NOTE(review): if overage starts during this extraction the delta may be below page_count — confirm
+        assert (after_extract['overage_page_count'] == before_extract['overage_page_count'] + page_count), \
+            "overage_page_count calculation is wrong"
+    assert (after_extract[f'current_page_count_{mode}'] == before_extract[f'current_page_count_{mode}'] + page_count), \
+        f"{mode} mode calculation is wrong"
+    for i in range(len(all_modes)):  # every remaining mode's per-mode counter must be unchanged
+        assert (after_extract[f'current_page_count_{all_modes[i]}'] ==
+                before_extract[f'current_page_count_{all_modes[i]}']), \
+            f"{all_modes[i]} mode calculation is wrong"
diff --git a/tests/test_data/expected/credit_card.low_cost.layout_preserving.txt b/tests/test_data/expected/credit_card.low_cost.layout_preserving.txt