[FEAT] Adapter for LLMWhisperer v2 (#110)

harini-venkataraman · gaya3-zipstack · hari-kuriakose · web-flow · commit df73510f0335 · 2024-10-17T17:10:38.000+05:30
* Exception handling for Prompt Service

* LLM Whisperer adapter v2

* Support for LLMWhisperer v2 adapter

* Marked v1 as deprecated

* Marked v1 as deprecated

* Update json_schema.json

Signed-off-by: Jaseem Jas <89440144+jaseemjaskp@users.noreply.github.com>

* Minor code standardization changes

* Refactor exception handling

* Adding dev comments

---------

Signed-off-by: Jaseem Jas <89440144+jaseemjaskp@users.noreply.github.com>
Co-authored-by: Gayathri <142381512+gaya3-zipstack@users.noreply.github.com>
Co-authored-by: Hari John Kuriakose <hari@zipstack.com>
Co-authored-by: Jaseem Jas <89440144+jaseemjaskp@users.noreply.github.com>
diff --git a/src/unstract/sdk/adapters/x2text/constants.py b/src/unstract/sdk/adapters/x2text/constants.py
@@ -5,3 +5,4 @@ class X2TextConstants:
     ENABLE_HIGHLIGHT = "enable_highlight"
     EXTRACTED_TEXT = "extracted_text"
     WHISPER_HASH = "whisper-hash"
+    WHISPER_HASH_V2 = "whisper_hash"
diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer/src/static/json_schema.json b/src/unstract/sdk/adapters/x2text/llm_whisperer/src/static/json_schema.json
@@ -18,7 +18,7 @@
       "title": "URL",
       "format": "uri",
       "default": "https://llmwhisperer-api.unstract.com",
-      "description": "Provide the URL of the LLM Whisperer service."
+      "description": "Provide the URL of the LLM Whisperer service. Please note that this version of LLM Whisperer is deprecated."
     },
     "unstract_key": {
       "type": "string",
diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/README.md b/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/README.md
@@ -0,0 +1,58 @@
+# Unstract LLM Whisperer v2 X2Text Adapter
+
+## Env variables
+
+The following environment variables are resolved by the LLM Whisperer v2 adapter:
+
+| Variable                     | Description                                                                                  |
+| ---------------------------- | -------------------------------------------------------------------------------------------- |
+| `ADAPTER_LLMW_POLL_INTERVAL` | Time in seconds to wait before polling LLMWhisperer's status API. Defaults to 30s            |
+| `ADAPTER_LLMW_MAX_POLLS`     | Total number of times to poll the status API. Defaults to 30                                 |
+
+
+---
+id: llm_whisperer_apis_changelog
+---
+
+# Changelog
+
+## Version 2.0.0
+
+:::warning
+This version of the API is not backward compatible with the previous version.
+:::
+
+### API endpoint
+
+- The base URL for the **V2** APIs is `https://llmwhisperer-api.unstract.com/api/v2`
+
+### Global change in parameter naming
+
+- All use of `whisper-hash` as a parameter has been replaced with `whisper_hash` for consistency. 
+
+### Whisper parameters
+
+#### Added
+- `mode` (str, optional): The processing mode. 
+- `mark_vertical_lines` (bool, optional): Whether to reproduce vertical lines in the document.
+- `mark_horizontal_lines` (bool, optional): Whether to reproduce horizontal lines in the document. 
+- `line_splitter_strategy` (str, optional): The line splitter strategy to use. An advanced option for customizing the line splitting process. 
+- `lang` (str, optional): The language of the document. 
+- `tag` (str, optional): A tag to associate with the document. Used for auditing and tracking purposes.
+- `file_name` (str, optional): The name of the file being processed. Used for auditing and tracking purposes.
+- `use_webhook` (str, optional): The name of the webhook to call after the document is processed.
+- `webhook_metadata` (str, optional): Metadata to send to the webhook after the document is processed.
+
+#### Removed
+- `timeout` (int, optional): The timeout for API requests. *There is no sync mode now. All requests are async.*
+- `force_text_processing` (bool, optional): Whether to force text processing. *This feature is removed.*
+- `ocr_provider` (str, optional): The OCR provider to use. *This is superseded by `mode`*
+- `processing_mode` (str, optional): The processing mode. *This is superseded by `mode`*
+- `store_metadata_for_highlighting` (bool, optional): Whether to store metadata for highlighting. *This feature is removed. The data is still available and is sent back when retrieve is called.*
+
+
+### New features
+
+#### Webhooks
+
+- Added support for webhooks. You can now register a webhook and use it to receive the processed document.
diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/pyproject.toml b/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/pyproject.toml
@@ -0,0 +1,25 @@
+[build-system]
+requires = ["pdm-backend"]
+build-backend = "pdm.backend"
+
+
+[project]
+name = "unstract-llm_whisperer-x2text-v2"
+version = "0.0.1"
+description = "V2 of LLMWhisperer X2Text Adapter"
+authors = [
+    {name = "Zipstack Inc.", email = "devsupport@zipstack.com"},
+]
+dependencies = [
+]
+requires-python = ">=3.9"
+readme = "README.md"
+classifiers = [
+  "Programming Language :: Python"
+]
+license = {text = "MIT"}
+
+[tool.pdm.build]
+includes = ["src"]
+package-dir = "src"
+# source-includes = ["tests"]
diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/__init__.py b/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/__init__.py
@@ -0,0 +1,9 @@
+from .llm_whisperer_v2 import LLMWhispererV2
+
+metadata = {
+    "name": LLMWhispererV2.__name__,
+    "version": "1.0.0",
+    "adapter": LLMWhispererV2,
+    "description": "LLMWhispererV2 X2Text adapter",
+    "is_active": True,
+}
diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/constants.py b/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/constants.py
@@ -0,0 +1,103 @@
+import os
+from enum import Enum
+
+
+class Modes(Enum):
+    NATIVE_TEXT = "native_text"
+    LOW_COST = "low_cost"
+    HIGH_QUALITY = "high_quality"
+    FORM = "form"
+
+
+class OutputModes(Enum):
+    LAYOUT_PRESERVING = "layout_preserving"
+    TEXT = "text"
+
+
+class HTTPMethod(Enum):
+    GET = "GET"
+    POST = "POST"
+
+
+class WhispererHeader:
+    UNSTRACT_KEY = "unstract-key"
+
+
+class WhispererEndpoint:
+    """Endpoints available at LLMWhisperer service."""
+
+    TEST_CONNECTION = "test-connection"
+    WHISPER = "whisper"
+    STATUS = "whisper-status"
+    RETRIEVE = "whisper-retrieve"
+
+
+class WhispererEnv:
+    """Env variables for LLM whisperer.
+
+    Can be used to alter behaviour at runtime.
+
+    Attributes:
+        POLL_INTERVAL: Time in seconds to wait before polling
+            LLMWhisperer's status API. Defaults to 30s
+        MAX_POLLS: Total number of times to poll the status API.
+            Set to -1 to poll indefinitely. Defaults to 30
+    """
+
+    POLL_INTERVAL = "ADAPTER_LLMW_POLL_INTERVAL"
+    MAX_POLLS = "ADAPTER_LLMW_MAX_POLLS"
+
+
+class WhispererConfig:
+    """Dictionary keys used to configure LLMWhisperer service."""
+
+    URL = "url"
+    MODE = "mode"
+    OUTPUT_MODE = "output_mode"
+    UNSTRACT_KEY = "unstract_key"
+    MEDIAN_FILTER_SIZE = "median_filter_size"
+    GAUSSIAN_BLUR_RADIUS = "gaussian_blur_radius"
+    LINE_SPLITTER_TOLERANCE = "line_splitter_tolerance"
+    LINE_SPLITTER_STRATEGY = "line_splitter_strategy"
+    HORIZONTAL_STRETCH_FACTOR = "horizontal_stretch_factor"
+    PAGES_TO_EXTRACT = "pages_to_extract"
+    MARK_VERTICAL_LINES = "mark_vertical_lines"
+    MARK_HORIZONTAL_LINES = "mark_horizontal_lines"
+    PAGE_SEPARATOR = "page_seperator"
+    URL_IN_POST = "url_in_post"
+    TAG = "tag"
+    USE_WEBHOOK = "use_webhook"
+    WEBHOOK_METADATA = "webhook_metadata"
+    TEXT_ONLY = "text_only"
+
+
+class WhisperStatus:
+    """Values returned / used by /whisper-status endpoint."""
+
+    PROCESSING = "processing"
+    PROCESSED = "processed"
+    DELIVERED = "delivered"
+    UNKNOWN = "unknown"
+    # Used for async processing
+    WHISPER_HASH = "whisper_hash"
+    STATUS = "status"
+
+
+class WhispererDefaults:
+    """Defaults meant for LLM whisperer."""
+
+    MEDIAN_FILTER_SIZE = 0
+    GAUSSIAN_BLUR_RADIUS = 0.0
+    FORCE_TEXT_PROCESSING = False
+    LINE_SPLITTER_TOLERANCE = 0.75
+    LINE_SPLITTER_STRATEGY = "left-priority"
+    HORIZONTAL_STRETCH_FACTOR = 1.0
+    POLL_INTERVAL = int(os.getenv(WhispererEnv.POLL_INTERVAL, 30))
+    MAX_POLLS = int(os.getenv(WhispererEnv.MAX_POLLS, 30))
+    PAGES_TO_EXTRACT = ""
+    PAGE_SEPARATOR = "<<<"
+    MARK_VERTICAL_LINES = False
+    MARK_HORIZONTAL_LINES = False
+    URL_IN_POST = False
+    TAG = "default"
+    TEXT_ONLY = False
diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/helper.py b/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/helper.py
diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py b/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py
diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/static/json_schema.json b/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/static/json_schema.json