Skip to content

Commit 983eef5

Browse files
Index and LLM fixes (#36)
* Bypass x2text for text file
* Regex fix for JSON
* Version bump
1 parent f2c9bf1 commit 983eef5

File tree

5 files changed

20 additions and 16 deletions (+20 / -16 lines changed)

5 files changed

20 additions and 16 deletions (+20 / -16 lines changed)

pdm.lock

Lines changed: 4 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,7 @@ dependencies = [
1212
"python-magic~=0.4.27",
1313
"python-dotenv==1.0.0",
1414
# LLM Triad
15-
"unstract-adapters~=0.9.0",
15+
"unstract-adapters~=0.10.0",
1616
"llama-index==0.10.28",
1717
"tiktoken~=0.4.0",
1818
"transformers==4.37.0",

src/unstract/sdk/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
__version__ = "0.19.0"
1+
__version__ = "0.20.0"
22

33

44
def get_sdk_version():

src/unstract/sdk/index.py

Lines changed: 11 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -236,13 +236,17 @@ def index_file(
236236
full_text = []
237237
extracted_text = ""
238238
try:
239-
x2text = X2Text(tool=self.tool)
240-
x2text_adapter_inst: X2TextAdapter = x2text.get_x2text(
241-
adapter_instance_id=x2text_adapter
242-
)
243-
extracted_text = x2text_adapter_inst.process(
244-
input_file_path=file_path, output_file_path=output_file_path
245-
)
239+
if not output_file_path:
240+
with open(file_path, encoding="utf-8") as file:
241+
extracted_text = file.read()
242+
else:
243+
x2text = X2Text(tool=self.tool)
244+
x2text_adapter_inst: X2TextAdapter = x2text.get_x2text(
245+
adapter_instance_id=x2text_adapter
246+
)
247+
extracted_text = x2text_adapter_inst.process(
248+
input_file_path=file_path, output_file_path=output_file_path
249+
)
246250
except AdapterError as e:
247251
# Wrapping AdapterErrors with SdkError
248252
raise IndexingError(str(e)) from e

src/unstract/sdk/llm.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -22,7 +22,7 @@
2222
class ToolLLM:
2323
"""Class to handle LLMs for Unstract Tools."""
2424

25-
code_block_regex = re.compile(r"```.*?\n(.*?)\n```", re.DOTALL)
25+
json_regex = re.compile(r"\{(?:.|\n)*\}")
2626

2727
def __init__(
2828
self,
@@ -62,9 +62,9 @@ def run_completion(
6262
for i in range(retries):
6363
try:
6464
response: CompletionResponse = llm.complete(prompt, **kwargs)
65-
match = cls.code_block_regex.search(response.text)
65+
match = cls.json_regex.search(response.text)
6666
if match:
67-
response.text = match.group(1)
67+
response.text = match.group(0)
6868

6969
usage = {}
7070
llm_token_counts = llm.callback_manager.handlers[

0 commit comments

Comments
 (0)