Skip to content

Commit 41525a4

Browse files
fix: Introduced new index_document() to fix chunking-related issue (#96)
* Introduced new index_document() to fix chunking-related issue
* Minor deprecated version correction
1 parent 0a6ebd2 commit 41525a4

File tree

3 files changed

+36
-15
lines changed

3 files changed

+36
-15
lines changed

src/unstract/sdk/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
__version__ = "0.47.0"
1+
__version__ = "0.48.0"
22

33

44
def get_sdk_version():

src/unstract/sdk/index.py

Lines changed: 7 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,7 @@
44

55
from deprecated import deprecated
66
from llama_index.core import Document
7-
from llama_index.core.node_parser import SimpleNodeParser
7+
from llama_index.core.node_parser import SentenceSplitter
88
from llama_index.core.vector_stores import (
99
FilterOperator,
1010
MetadataFilter,
@@ -199,7 +199,8 @@ def index(
199199
self.tool.stream_log(f"No nodes found for {doc_id}")
200200
except Exception as e:
201201
self.tool.stream_log(
202-
f"Error querying {vector_db_instance_id}: {e}", level=LogLevel.ERROR
202+
f"Error querying {vector_db_instance_id}: {e}, proceeding to index",
203+
level=LogLevel.ERROR,
203204
)
204205

205206
if doc_id_found and reindex:
@@ -288,7 +289,7 @@ def index(
288289

289290
try:
290291
if chunk_size == 0:
291-
parser = SimpleNodeParser.from_defaults(
292+
parser = SentenceSplitter.from_defaults(
292293
chunk_size=len(documents[0].text) + 10,
293294
chunk_overlap=0,
294295
callback_manager=embedding.get_callback_manager(),
@@ -301,12 +302,6 @@ def index(
301302
vector_db.add(doc_id, nodes=[node])
302303
self.tool.stream_log("Added node to vector db")
303304
else:
304-
storage_context = vector_db.get_storage_context()
305-
parser = SimpleNodeParser.from_defaults(
306-
chunk_size=chunk_size,
307-
chunk_overlap=chunk_overlap,
308-
callback_manager=embedding.get_callback_manager(),
309-
)
310305
self.tool.stream_log("Adding nodes to vector db...")
311306
# TODO: Phase 2:
312307
# Post insertion to VDB, use query using doc_id and
@@ -318,13 +313,11 @@ def index(
318313
# Once this is in place, the overridden implementation
319314
# of prefixing ids with doc_id before adding to VDB
320315
# can be removed
321-
vector_db.get_vector_store_index_from_storage_context(
316+
vector_db.index_document(
322317
documents,
323-
storage_context=storage_context,
318+
chunk_size=chunk_size,
319+
chunk_overlap=chunk_overlap,
324320
show_progress=True,
325-
embed_model=embedding,
326-
node_parser=parser,
327-
callback_manager=embedding.get_callback_manager(),
328321
)
329322
except Exception as e:
330323
self.tool.stream_log(

src/unstract/sdk/vector_db.py

Lines changed: 28 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,7 @@
55
from deprecated import deprecated
66
from llama_index.core import StorageContext, VectorStoreIndex
77
from llama_index.core.indices.base import IndexType
8+
from llama_index.core.node_parser import SentenceSplitter
89
from llama_index.core.schema import BaseNode, Document
910
from llama_index.core.vector_stores.types import (
1011
BasePydanticVectorStore,
@@ -119,6 +120,33 @@ def _get_vector_db(self) -> Union[BasePydanticVectorStore, VectorStore]:
119120
)
120121
raise VectorDBError(f"Error getting vectorDB instance: {e}") from e
121122

123+
def index_document(
124+
self,
125+
documents: Sequence[Document],
126+
chunk_size: int = 1024,
127+
chunk_overlap: int = 128,
128+
show_progress: bool = False,
129+
**index_kwargs,
130+
) -> IndexType:
131+
if not self._embedding_instance:
132+
raise VectorDBError(self.EMBEDDING_INSTANCE_ERROR)
133+
storage_context = self.get_storage_context()
134+
parser = SentenceSplitter.from_defaults(
135+
chunk_size=chunk_size,
136+
chunk_overlap=chunk_overlap,
137+
callback_manager=self._embedding_instance.callback_manager,
138+
)
139+
return VectorStoreIndex.from_documents(
140+
documents,
141+
storage_context=storage_context,
142+
show_progress=show_progress,
143+
embed_model=self._embedding_instance,
144+
transformations=[parser],
145+
callback_manager=self._embedding_instance.callback_manager,
146+
**index_kwargs,
147+
)
148+
149+
@deprecated(version="0.47.0", reason="Use index_document() instead")
122150
def get_vector_store_index_from_storage_context(
123151
self,
124152
documents: Sequence[Document],

0 commit comments

Comments
 (0)