URL in post support and test case (#13)

nagesh-zip · chandrasekharan-zipstack · web-flow · commit f031453b655f · 2024-10-30T17:30:06.000+05:30
* added logic to send the document URL in the POST body (`url_in_post`) and a test that uses a URL

* added common function for extracted text assertion

* renamed assert function

---------

Signed-off-by: Chandrasekharan M <117059509+chandrasekharan-zipstack@users.noreply.github.com>
Co-authored-by: Chandrasekharan M <117059509+chandrasekharan-zipstack@users.noreply.github.com>
diff --git a/src/unstract/llmwhisperer/client_v2.py b/src/unstract/llmwhisperer/client_v2.py
@@ -152,13 +152,13 @@ def whisper(
         file_path: str = "",
         stream: IO[bytes] = None,
         url: str = "",
-        mode: str = "high_quality",
+        mode: str = "form",
         output_mode: str = "layout_preserving",
         page_seperator: str = "<<<",
         pages_to_extract: str = "",
         median_filter_size: int = 0,
         gaussian_blur_radius: int = 0,
-        line_splitter_tolerance: float = 0.75,
+        line_splitter_tolerance: float = 0.4,
         horizontal_stretch_factor: float = 1.0,
         mark_vertical_lines: bool = False,
         mark_horizontal_lines: bool = False,
@@ -216,7 +216,6 @@ def whisper(
         self.logger.debug("whisper called")
         api_url = f"{self.base_url}/whisper"
         params = {
-            "url": url,
             "mode": mode,
             "output_mode": output_mode,
             "page_seperator": page_seperator,
@@ -281,7 +280,8 @@ def generate():
                     data=data,
                 )
         else:
-            req = requests.Request("POST", api_url, params=params, headers=self.headers)
+            params["url_in_post"] = True
+            req = requests.Request("POST", api_url, params=params, headers=self.headers, data=url)
         prepared = req.prepare()
         s = requests.Session()
         response = s.send(prepared, timeout=wait_timeout, stream=should_stream)
@@ -350,7 +350,7 @@ def generate():
             return message
 
         # Will not reach here if status code is 202
-        message = response.text
+        message = json.loads(response.text)
         message["status_code"] = response.status_code
         return message
 
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -1,10 +1,13 @@
 import os
 
 import pytest
+from dotenv import load_dotenv
 
 from unstract.llmwhisperer.client import LLMWhispererClient
 from unstract.llmwhisperer.client_v2 import LLMWhispererClientV2
 
+load_dotenv()
+
 
 @pytest.fixture(name="client")
 def llm_whisperer_client():
diff --git a/tests/integration/client_v2_test.py b/tests/integration/client_v2_test.py
@@ -50,23 +50,75 @@ def test_whisper_v2(client_v2, data_dir, output_mode, mode, input_file):
 
     exp_basename = f"{Path(input_file).stem}.{mode}.{output_mode}.txt"
     exp_file = os.path.join(data_dir, "expected", exp_basename)
-    with open(exp_file, encoding="utf-8") as f:
+    # verify extracted text
+    assert_extracted_text(exp_file, whisper_result, mode, output_mode)
+
+
+@pytest.mark.parametrize(
+    "output_mode, mode, url, input_file, page_count",
+    [
+        ("layout_preserving", "native_text", "https://unstractpocstorage.blob.core.windows.net/public/Amex.pdf",
+         "credit_card.pdf", 7),
+        ("layout_preserving", "low_cost", "https://unstractpocstorage.blob.core.windows.net/public/Amex.pdf",
+         "credit_card.pdf", 7),
+        ("layout_preserving", "high_quality", "https://unstractpocstorage.blob.core.windows.net/public/scanned_bill.pdf",
+         "restaurant_invoice_photo.pdf", 1),
+        ("layout_preserving", "form", "https://unstractpocstorage.blob.core.windows.net/public/scanned_form.pdf",
+         "handwritten-form.pdf", 1),
+    ]
+)
+def test_whisper_v2_url_in_post(client_v2, data_dir, output_mode, mode, url, input_file, page_count):
+    usage_before = client_v2.get_usage_info()
+    whisper_result = client_v2.whisper(
+        mode=mode, output_mode=output_mode, url=url, wait_for_completion=True
+    )
+    logger.debug(f"Result for '{output_mode}', '{mode}', " f"'{input_file}: {whisper_result}")
+
+    exp_basename = f"{Path(input_file).stem}.{mode}.{output_mode}.txt"
+    exp_file = os.path.join(data_dir, "expected", exp_basename)
+    # verify extracted text
+    assert_extracted_text(exp_file, whisper_result, mode, output_mode)
+    usage_after = client_v2.get_usage_info()
+    # Verify usage after extraction
+    verify_usage(usage_before, usage_after, page_count, mode)
+
+
+def assert_extracted_text(file_path, whisper_result, mode, output_mode):
+    with open(file_path, encoding="utf-8") as f:
         exp = f.read()
 
     assert isinstance(whisper_result, dict)
     assert whisper_result["status_code"] == 200
 
-    # For text based processing, perform a strict match
+    # For OCR based processing
+    threshold = 0.97
+
+    # For text based processing
     if mode == "native_text" and output_mode == "text":
-        assert whisper_result["extraction"]["result_text"] == exp
-    # For OCR based processing, perform a fuzzy match
-    else:
-        extracted_text = whisper_result["extraction"]["result_text"]
-        similarity = SequenceMatcher(None, extracted_text, exp).ratio()
-        threshold = 0.97
-
-        if similarity < threshold:
-            diff = "\n".join(
-                unified_diff(exp.splitlines(), extracted_text.splitlines(), fromfile="Expected", tofile="Extracted")
-            )
-            pytest.fail(f"Texts are not similar enough: {similarity * 100:.2f}% similarity. Diff:\n{diff}")
+        threshold = 0.99
+    extracted_text = whisper_result["extraction"]["result_text"]
+    similarity = SequenceMatcher(None, extracted_text, exp).ratio()
+
+    if similarity < threshold:
+        diff = "\n".join(
+            unified_diff(exp.splitlines(), extracted_text.splitlines(), fromfile="Expected", tofile="Extracted")
+        )
+        pytest.fail(f"Texts are not similar enough: {similarity * 100:.2f}% similarity. Diff:\n{diff}")
+
+
+def verify_usage(before_extract, after_extract, page_count, mode='form'):
+    all_modes = ['form', 'high_quality', 'low_cost', 'native_text']
+    all_modes.remove(mode)
+    assert (after_extract['today_page_count'] == before_extract['today_page_count'] + page_count), \
+        "today_page_count calculation is wrong"
+    assert (after_extract['current_page_count'] == before_extract['current_page_count'] + page_count), \
+        "current_page_count calculation is wrong"
+    if after_extract['overage_page_count'] > 0:
+        assert (after_extract['overage_page_count'] == before_extract['overage_page_count'] + page_count), \
+            "overage_page_count calculation is wrong"
+    assert (after_extract[f'current_page_count_{mode}'] == before_extract[f'current_page_count_{mode}'] + page_count), \
+        f"{mode} mode calculation is wrong"
+    for i in range(len(all_modes)):
+        assert (after_extract[f'current_page_count_{all_modes[i]}'] ==
+                before_extract[f'current_page_count_{all_modes[i]}']), \
+            f"{all_modes[i]} mode calculation is wrong"
diff --git a/tests/test_data/expected/credit_card.low_cost.layout_preserving.txt b/tests/test_data/expected/credit_card.low_cost.layout_preserving.txt