Skip to content

Commit 6c1178d

Browse files
Feat/dynamic page number llm whisperer (#82)
* adding dynamic page loading
* adding page separator llm-whisperer
* adding regex pattern
* removing pattern
* PR changes
* upgrade sdk version
1 parent e2a2595 commit 6c1178d

File tree

4 files changed

+17
-2
lines changed

4 files changed

+17
-2
lines changed

src/unstract/sdk/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
__version__ = "0.41.1"
1+
__version__ = "0.42.0"
22

33

44
def get_sdk_version():

src/unstract/sdk/adapters/x2text/llm_whisperer/src/constants.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ class WhispererConfig:
6161
HORIZONTAL_STRETCH_FACTOR = "horizontal_stretch_factor"
6262
PAGES_TO_EXTRACT = "pages_to_extract"
6363
STORE_METADATA_FOR_HIGHLIGHTING = "store_metadata_for_highlighting"
64+
PAGE_SEPARATOR = "page_seperator"
6465

6566

6667
class WhisperStatus:
@@ -86,3 +87,4 @@ class WhispererDefaults:
8687
POLL_INTERVAL = int(os.getenv(WhispererEnv.POLL_INTERVAL, 30))
8788
MAX_POLLS = int(os.getenv(WhispererEnv.MAX_POLLS, 30))
8889
PAGES_TO_EXTRACT = ""
90+
PAGE_SEPARATOR = "<<< >>>"

src/unstract/sdk/adapters/x2text/llm_whisperer/src/llm_whisperer.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,10 @@
1010
from unstract.sdk.adapters.exceptions import ExtractorError
1111
from unstract.sdk.adapters.utils import AdapterUtils
1212
from unstract.sdk.adapters.x2text.constants import X2TextConstants
13-
from unstract.sdk.adapters.x2text.dto import TextExtractionMetadata, TextExtractionResult
13+
from unstract.sdk.adapters.x2text.dto import (
14+
TextExtractionMetadata,
15+
TextExtractionResult,
16+
)
1417
from unstract.sdk.adapters.x2text.llm_whisperer.src.constants import (
1518
HTTPMethod,
1619
OutputModes,
@@ -159,6 +162,10 @@ def _get_whisper_params(self, enable_highlight: bool = False) -> dict[str, Any]:
159162
WhispererConfig.PAGES_TO_EXTRACT,
160163
WhispererDefaults.PAGES_TO_EXTRACT,
161164
),
165+
WhispererConfig.PAGE_SEPARATOR: self.config.get(
166+
WhispererConfig.PAGE_SEPARATOR,
167+
WhispererDefaults.PAGE_SEPARATOR,
168+
),
162169
}
163170
if not params[WhispererConfig.FORCE_TEXT_PROCESSING]:
164171
params.update(

src/unstract/sdk/adapters/x2text/llm_whisperer/src/static/json_schema.json

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,12 @@
8383
"default": "",
8484
"pattern": "^(\\s*\\d+-\\d+|\\s*\\d+-|\\s*\\d+|^$)(,\\d+-\\d+|,\\d+-|,\\d+)*$",
8585
"description": "Specify the range of pages to extract (e.g., 1-5, 7, 10-12, 50-). Leave it empty to extract all pages."
86+
},
87+
"page_seperator": {
88+
"type": "string",
89+
"title": "Page separator",
90+
"default": "<<< >>>",
91+
"description": "Specify a pattern to separate the pages in the document (e.g., <<< {{page_no}} >>>, <<< >>>). This pattern will be inserted at the end of every page. Omit {{page_no}} if you don't want to include the page number in the separator."
8692
}
8793
},
8894
"if": {

0 commit comments

Comments
 (0)