Changes introduce new mode parameter (#103)

johnyrahul · chandrasekharan-zipstack · web-flow · commit 2a0d17bb6ddf · 2024-09-23T13:42:50.000+05:30
* Changes introduce new mode parameter

* Updated the SDK version

* Updated the SDK version

* Update src/unstract/sdk/__init__.py

Co-authored-by: Chandrasekharan M <117059509+chandrasekharan-zipstack@users.noreply.github.com>
Signed-off-by: Rahul Johny <116638720+johnyrahul@users.noreply.github.com>

* Updated the SDK version

* Updated the comments

---------

Signed-off-by: Rahul Johny <116638720+johnyrahul@users.noreply.github.com>
Co-authored-by: Chandrasekharan M <117059509+chandrasekharan-zipstack@users.noreply.github.com>
diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer/src/constants.py b/src/unstract/sdk/adapters/x2text/llm_whisperer/src/constants.py
@@ -7,6 +7,13 @@ class ProcessingModes(Enum):
     TEXT = "text"
 
 
+class Modes(Enum):
+    NATIVE_TEXT = "native_text"
+    LOW_COST = "low_cost"
+    HIGH_QUALITY = "high_quality"
+    FORM = "form"
+
+
 class OutputModes(Enum):
     LINE_PRINTER = "line-printer"
     DUMP_TEXT = "dump-text"
@@ -52,6 +59,7 @@ class WhispererConfig:
 
     URL = "url"
     PROCESSING_MODE = "processing_mode"
+    MODE = "mode"
     OUTPUT_MODE = "output_mode"
     UNSTRACT_KEY = "unstract_key"
     MEDIAN_FILTER_SIZE = "median_filter_size"
diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer/src/llm_whisperer.py b/src/unstract/sdk/adapters/x2text/llm_whisperer/src/llm_whisperer.py
@@ -145,6 +145,10 @@ def _get_whisper_params(self, enable_highlight: bool = False) -> dict[str, Any]:
             WhispererConfig.PROCESSING_MODE: self.config.get(
                 WhispererConfig.PROCESSING_MODE, ProcessingModes.TEXT.value
             ),
+            # No default value is provided, to maintain legacy compatibility:
+            # providing a default value would override the params
+            # processing_mode and force_text_processing.
+            WhispererConfig.MODE: self.config.get(WhispererConfig.MODE),
             WhispererConfig.OUTPUT_MODE: self.config.get(
                 WhispererConfig.OUTPUT_MODE, OutputModes.LINE_PRINTER.value
             ),
diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer/src/static/json_schema.json b/src/unstract/sdk/adapters/x2text/llm_whisperer/src/static/json_schema.json
@@ -26,21 +26,17 @@
       "format": "password",
       "description": "API key obtained from the Unstract developer portal (https://unstract-api-resource.developer.azure-api.net)"
     },
-    "processing_mode": {
+    "mode": {
       "type": "string",
-      "title": "Processing Mode",
+      "title": "Mode",
       "enum": [
-        "text",
-        "ocr"
+        "native_text",
+        "low_cost",
+        "high_quality",
+        "form"
       ],
-      "default": "text",
-      "description": "Text mode tries to extract text from PDF and falls to OCR if the PDF is a scanned image PDF. This should be your default selection. Use OCR mode if you want to force OCR to extract text. This could be useful if you are dealing with malformed PDFs."
-    },
-    "force_text_processing": {
-      "type": "boolean",
-      "title": "Force Text Processing",
-      "default": false,
-      "description": "If checked, ensures that only text processing runs and there is no OCR involved. This differs from the default behaviour where we fall back to OCR processing in case of failures with text processing."
+      "default": "form",
+      "description": "Native text: Extracts text from PDF without OCR. This is very fast and cost effective. Use this mode if you are sure all your PDFs are native text PDFs (not scanned documents). Note that some scanned PDFs are \"searchable\" PDFs. Use the OCR modes for these PDFs as the quality of text in these documents is often poor. \n Low cost: Extracts text from scanned and native PDFs, images and office documents. This OCR mode cannot handle handwriting and low quality scanned PDFs and images. \n High quality: Extracts text from scanned and native PDFs, images and office documents. This OCR mode can handle handwriting and low quality scanned PDFs and images. \n Form: Extracts text from scanned and native PDFs, images and office documents. This OCR mode can handle handwriting and low quality scanned PDFs and images. Can also extract information about checkboxes and radio buttons."
     },
     "output_mode": {
       "type": "string",
@@ -53,18 +49,7 @@
       "default": "line-printer",
       "description": "The output format. Valid options are line-printer, dump-text and text. The line-printer mode tries to maintain the layout of the original text and works very well as inputs to LLMs. dump-text just dumps each page as paragraphs. text extracts text into groups as it sees in the original page. text and dump-text are treated as same in ocr processing mode."
     },
-    "median_filter_size": {
-      "type": "integer",
-      "title": "Median Filter Size",
-      "default": 0,
-      "description": "The size of the median filter to use for pre-processing the image during OCR based extraction. Useful to eliminate scanning artifacts and low quality JPEG artifacts. Default is 0 if the value is not explicitly set. Available only in the Enterprise version."
-    },
-    "gaussian_blur_radius": {
-      "type": "number",
-      "title": "Gaussian Blur Radius",
-      "default": 0.0,
-      "description": "The radius of the gaussian blur to use for pre-processing the image during OCR based extraction. Useful to eliminate noise from the image. Default is 0.0 if the value is not explicitly set. Available only in the Enterprise version."
-    },
+
     "line_splitter_tolerance": {
       "type": "number",
       "title": "Line Splitter Tolerance",
@@ -92,18 +77,48 @@
     }
   },
   "if": {
-    "properties": {
-      "force_text_processing": {
-        "const": "false"
+    "anyOf": [
+      {
+        "properties": {
+          "mode": {
+            "const": "low_cost"
+          }
+        }
+      },
+      {
+        "properties": {
+          "mode": {
+            "const": "high_quality"
+          }
+        }
+      },
+      {
+        "properties": {
+          "mode": {
+            "const": "form"
+          }
+        }
       }
-    }
+    ]
   },
   "then": {
     "properties": {
-      "required": [
-        "median_filter_size",
-        "gaussian_blur_radius"
-      ]
-    }
+      "median_filter_size": {
+        "type": "integer",
+        "title": "Median Filter Size",
+        "default": 0,
+        "description": "The size of the median filter to use for pre-processing the image during OCR based extraction. Useful to eliminate scanning artifacts and low quality JPEG artifacts. Default is 0 if the value is not explicitly set. Available only in the Enterprise version."
+      },
+      "gaussian_blur_radius": {
+        "type": "number",
+        "title": "Gaussian Blur Radius",
+        "default": 0.0,
+        "description": "The radius of the gaussian blur to use for pre-processing the image during OCR based extraction. Useful to eliminate noise from the image. Default is 0.0 if the value is not explicitly set. Available only in the Enterprise version."
+      }
+    },
+    "required": [
+      "median_filter_size",
+      "gaussian_blur_radius"
+    ]
   }
 }