Skip to content

Commit 2a0d17b

Browse files
Changes introduce new mode parameter (#103)
* Changes introduce new mode parameter * Updated the SDK version * Updated the SDK version * Update src/unstract/sdk/__init__.py Co-authored-by: Chandrasekharan M <[email protected]> Signed-off-by: Rahul Johny <[email protected]> * Updated the SDK version * Updated the comments --------- Signed-off-by: Rahul Johny <[email protected]> Co-authored-by: Chandrasekharan M <[email protected]>
1 parent 84f4985 commit 2a0d17b

File tree

3 files changed

+60
-33
lines changed

3 files changed

+60
-33
lines changed

src/unstract/sdk/adapters/x2text/llm_whisperer/src/constants.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,13 @@ class ProcessingModes(Enum):
77
TEXT = "text"
88

99

10+
class Modes(Enum):
11+
NATIVE_TEXT = "native_text"
12+
LOW_COST = "low_cost"
13+
HIGH_QUALITY = "high_quality"
14+
FORM = "form"
15+
16+
1017
class OutputModes(Enum):
1118
LINE_PRINTER = "line-printer"
1219
DUMP_TEXT = "dump-text"
@@ -52,6 +59,7 @@ class WhispererConfig:
5259

5360
URL = "url"
5461
PROCESSING_MODE = "processing_mode"
62+
MODE = "mode"
5563
OUTPUT_MODE = "output_mode"
5664
UNSTRACT_KEY = "unstract_key"
5765
MEDIAN_FILTER_SIZE = "median_filter_size"

src/unstract/sdk/adapters/x2text/llm_whisperer/src/llm_whisperer.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,10 @@ def _get_whisper_params(self, enable_highlight: bool = False) -> dict[str, Any]:
145145
WhispererConfig.PROCESSING_MODE: self.config.get(
146146
WhispererConfig.PROCESSING_MODE, ProcessingModes.TEXT.value
147147
),
148+
# Not providing a default value, to maintain legacy compatibility.
149+
# Providing a default value will override the params
150+
# processing_mode, force_text_processing
151+
WhispererConfig.MODE: self.config.get(WhispererConfig.MODE),
148152
WhispererConfig.OUTPUT_MODE: self.config.get(
149153
WhispererConfig.OUTPUT_MODE, OutputModes.LINE_PRINTER.value
150154
),

src/unstract/sdk/adapters/x2text/llm_whisperer/src/static/json_schema.json

Lines changed: 48 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -26,21 +26,17 @@
2626
"format": "password",
2727
"description": "API key obtained from the Unstract developer portal (https://unstract-api-resource.developer.azure-api.net)"
2828
},
29-
"processing_mode": {
29+
"mode": {
3030
"type": "string",
31-
"title": "Processing Mode",
31+
"title": "Mode",
3232
"enum": [
33-
"text",
34-
"ocr"
33+
"native_text",
34+
"low_cost",
35+
"high_quality",
36+
"form"
3537
],
36-
"default": "text",
37-
"description": "Text mode tries to extract text from PDF and falls to OCR if the PDF is a scanned image PDF. This should be your default selection. Use OCR mode if you want to force OCR to extract text. This could be useful if you are dealing with malformed PDFs."
38-
},
39-
"force_text_processing": {
40-
"type": "boolean",
41-
"title": "Force Text Processing",
42-
"default": false,
43-
"description": "If checked, ensures that only text processing runs and there is no OCR involved. This differs from the default behaviour where we fall back to OCR processing in case of failures with text processing."
38+
"default": "form",
39+
"description": "Native text : Extracts text from PDF without OCR. This is very fast and cost effective. Use this mode if you are sure all your PDFs are native text pdfs (not scanned documents). Note that some scanned PDFs are \"searchable\" PDFs. Use the OCR modes for these PDFs as the quality of text in these documents are often poor. \n Low cost : Extracts text from scanned and native PDFs, images and office documents. This OCR mode cannot handle handwriting and low quality scanned pdfs and images. \n High quality : Extracts text from scanned and native PDFs, images and office documents. This OCR mode can handle handwriting and low quality scanned pdfs and images. \n Form: Extracts text from scanned and native PDFs, images and office documents. This OCR mode can handle handwriting and low quality scanned pdfs and images. Can also extract information about checkboxes and radio button"
4440
},
4541
"output_mode": {
4642
"type": "string",
@@ -53,18 +49,7 @@
5349
"default": "line-printer",
5450
"description": "The output format. Valid options are line-printer, dump-text and text. The line-printer mode tries to maintain the layout of the original text and works very well as inputs to LLMs. dump-text just dumps each page as paragraphs. text extracts text into groups as it sees in the original page. text and dump-text are treated as same in ocr processing mode."
5551
},
56-
"median_filter_size": {
57-
"type": "integer",
58-
"title": "Median Filter Size",
59-
"default": 0,
60-
"description": "The size of the median filter to use for pre-processing the image during OCR based extraction. Useful to eliminate scanning artifacts and low quality JPEG artifacts. Default is 0 if the value is not explicitly set. Available only in the Enterprise version."
61-
},
62-
"gaussian_blur_radius": {
63-
"type": "number",
64-
"title": "Gaussian Blur Radius",
65-
"default": 0.0,
66-
"description": "The radius of the gaussian blur to use for pre-processing the image during OCR based extraction. Useful to eliminate noise from the image. Default is 0.0 if the value is not explicitly set. Available only in the Enterprise version."
67-
},
52+
6853
"line_splitter_tolerance": {
6954
"type": "number",
7055
"title": "Line Splitter Tolerance",
@@ -92,18 +77,48 @@
9277
}
9378
},
9479
"if": {
95-
"properties": {
96-
"force_text_processing": {
97-
"const": "false"
80+
"anyOf": [
81+
{
82+
"properties": {
83+
"mode": {
84+
"const": "low_cost"
85+
}
86+
}
87+
},
88+
{
89+
"properties": {
90+
"mode": {
91+
"const": "high_quality"
92+
}
93+
}
94+
},
95+
{
96+
"properties": {
97+
"mode": {
98+
"const": "form"
99+
}
100+
}
98101
}
99-
}
102+
]
100103
},
101104
"then": {
102105
"properties": {
103-
"required": [
104-
"median_filter_size",
105-
"gaussian_blur_radius"
106-
]
107-
}
106+
"median_filter_size": {
107+
"type": "integer",
108+
"title": "Median Filter Size",
109+
"default": 0,
110+
"description": "The size of the median filter to use for pre-processing the image during OCR based extraction. Useful to eliminate scanning artifacts and low quality JPEG artifacts. Default is 0 if the value is not explicitly set. Available only in the Enterprise version."
111+
},
112+
"gaussian_blur_radius": {
113+
"type": "number",
114+
"title": "Gaussian Blur Radius",
115+
"default": 0.0,
116+
"description": "The radius of the gaussian blur to use for pre-processing the image during OCR based extraction. Useful to eliminate noise from the image. Default is 0.0 if the value is not explicitly set. Available only in the Enterprise version."
117+
}
118+
},
119+
"required": [
120+
"median_filter_size",
121+
"gaussian_blur_radius"
122+
]
108123
}
109124
}

0 commit comments

Comments
 (0)