Skip to content

Commit 1c23ee6

Browse files
feat: Bumped SDK to 0.13.0 which uses adapters 0.3.0 (#12)
Bumped SDK to 0.13.0, which uses adapters 0.3.0. Also includes minor fixes to a return type annotation and docstring in the index file.
1 parent 4952c41 commit 1c23ee6

File tree

5 files changed

+55
-35
lines changed

5 files changed

+55
-35
lines changed

pdm.lock

Lines changed: 26 additions & 26 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,7 @@ dependencies = [
1515
"python-magic~=0.4.27",
1616
"python-dotenv==1.0.0",
1717
# LLM Triad
18-
"unstract-adapters~=0.2.2",
18+
"unstract-adapters~=0.3.0",
1919
"llama-index==0.9.28",
2020
"tiktoken~=0.4.0",
2121
"transformers==4.37.0",

src/unstract/sdk/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
__version__ = "0.12.1"
1+
__version__ = "0.13.0"
22

33

44
def get_sdk_version():

src/unstract/sdk/index.py

Lines changed: 26 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,4 @@
11
from typing import Optional
2-
import os
32

43
from llama_index import Document, StorageContext, VectorStoreIndex
54
from llama_index.node_parser import SimpleNodeParser
@@ -106,7 +105,29 @@ def index_file(
106105
reindex: bool = False,
107106
file_hash: Optional[str] = None,
108107
output_file_path: Optional[str] = None,
109-
):
108+
) -> str:
109+
"""Indexes an individual file using the passed arguments.
110+
111+
Args:
112+
tool_id (str): UUID of the tool (workflow_id in case its called
113+
from workflow)
114+
embedding_type (str): UUID of the embedding service configured
115+
vector_db (str): UUID of the vector DB configured
116+
x2text_adapter (str): UUID of the x2text adapter configured.
117+
This is to extract text from documents.
118+
file_path (str): Path to the file that needs to be indexed.
119+
chunk_size (int): Chunk size to be used for indexing
120+
chunk_overlap (int): Overlap in chunks to be used for indexing
121+
reindex (bool, optional): Flag to denote if document should be
122+
re-indexed if its already indexed. Defaults to False.
123+
file_hash (Optional[str], optional): SHA256 hash of the file.
124+
Defaults to None. If None, the hash is generated.
125+
output_file_path (Optional[str], optional): File path to write
126+
the extracted contents into. Defaults to None.
127+
128+
Returns:
129+
str: A unique ID for the file and indexing arguments combination
130+
"""
110131
# Make file content hash if not available
111132
if not file_hash:
112133
file_hash = ToolUtils.get_hash_from_file(file_path=file_path)
@@ -117,7 +138,9 @@ def index_file(
117138
x2text_adapter_inst: X2TextAdapter = x2text.get_x2text(
118139
adapter_instance_id=x2text_adapter
119140
)
120-
extracted_text = x2text_adapter_inst.process(input_file_path=file_path, output_file_path=output_file_path)
141+
extracted_text = x2text_adapter_inst.process(
142+
input_file_path=file_path, output_file_path=output_file_path
143+
)
121144
full_text.append(
122145
{
123146
"section": "full",

tests/test_x2text.py

Lines changed: 1 addition & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,6 @@
66

77
from dotenv import load_dotenv
88
from parameterized import parameterized
9-
from unstract.adapters.x2text.constants import LLMWhispererSupportedModes
109

1110
from unstract.sdk.tool.base import BaseTool
1211
from unstract.sdk.x2txt import X2Text
@@ -53,9 +52,7 @@ def test_get_x2text(self, adapter_instance_id):
5352

5453
if os.path.isfile(output_file):
5554
os.remove(output_file)
56-
file_content = x2text.process(
57-
input_file, output_file, mode=LLMWhispererSupportedModes.OCR.value
58-
)
55+
file_content = x2text.process(input_file, output_file)
5956
file_size = os.path.getsize(output_file)
6057
self.assertGreater(file_size, 0)
6158

0 commit comments

Comments (0)