Skip to content

Commit c48e21f

Browse files
feat: Measure indexing time separately (#107)
* Measure indexing time separately * Measure time for answer prompt API * README update on development with SDK, minor PR comment addressed
1 parent 4b44c02 commit c48e21f

File tree

3 files changed

+136
-74
lines changed

3 files changed

+136
-74
lines changed

README.md

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,50 @@ Supported commands:
5757
Unstract SDK 0.3.2 uses the following version of Llama
5858
Index Version **0.9.28** as of January 14th, 2024
5959

60+
### Developing with the SDK
61+
62+
Ensure that you have all the required dependencies and pre-commit hooks installed
63+
```shell
64+
pdm install
65+
pre-commit install
66+
```
67+
68+
Once the changes have been made, they can be tested with [Unstract](https://github.com/Zipstack/unstract) in any of the following ways.
69+
70+
#### With PDM
71+
Specify the SDK as a dependency of a project using a tool like `pdm` by adding the following to your `pyproject.toml`:
72+
73+
```toml
74+
[tool.pdm.dev-dependencies]
75+
local_copies = [
76+
"-e unstract-adapters @ file:///${UNSTRACT_ADAPTERS_PATH}",
77+
"-e unstract-sdk @ file:///${UNSTRACT_SDK_PATH}",
78+
]
79+
```
80+
Or by running the command below:
81+
```shell
82+
pdm add -e /path/to/unstract-sdk --dev
83+
```
84+
85+
#### With pip
86+
- If the project uses `pip`, it might be possible to add the SDK as a dependency in `requirements.txt`:
87+
```
88+
-e /path/to/unstract-sdk
89+
```
90+
NOTE: Building locally might require replacing the `[build-system]` section of the `unstract-sdk` build configuration with the following:
91+
```
92+
[build-system]
93+
requires = ["setuptools", "wheel"]
94+
build-backend = "setuptools.build_meta"
95+
```
96+
- Another option is to provide a git URL in `requirements.txt`; this can come in handy while building tool
97+
Docker images. Don't forget to run `apt install git` within the `Dockerfile` for this.
98+
```shell
99+
unstract-sdk @ git+https://github.com/Zipstack/unstract-sdk@feature-branch
100+
```
101+
102+
- Or try installing a [local PyPI server](https://pypi.org/project/pypiserver/) and uploading / downloading your package from this server
103+
60104
### Environment variables required for various LLMs (deprecated)
61105

62106
- Azure OpenAI

src/unstract/sdk/index.py

Lines changed: 89 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -178,7 +178,7 @@ def extract_text(
178178
logger.error(f"Error occured inside function 'process_text': {e}")
179179
return extracted_text
180180

181-
@log_elapsed(operation="INDEXING(might include EXTRACTION)")
181+
@log_elapsed(operation="CHECK_AND_INDEX(overall)")
182182
def index(
183183
self,
184184
tool_id: str,
@@ -293,82 +293,101 @@ def index(
293293
if not extracted_text:
294294
raise IndexingError("No text available to index")
295295

296-
full_text = [
297-
{
298-
"section": "full",
299-
"text_contents": extracted_text,
300-
}
301-
]
302-
303-
# Check if chunking is required
304-
documents = []
305-
for item in full_text:
306-
text = item["text_contents"]
307-
self.tool.stream_log("Indexing file...")
308-
document = Document(
309-
text=text,
310-
doc_id=doc_id,
311-
metadata={"section": item["section"]},
312-
)
313-
document.id_ = doc_id
314-
documents.append(document)
315-
self.tool.stream_log(f"Number of documents: {len(documents)}")
316-
317-
if doc_id_found:
318-
# Delete the nodes for the doc_id
319-
try:
320-
vector_db.delete(ref_doc_id=doc_id)
321-
self.tool.stream_log(f"Deleted nodes for {doc_id}")
322-
except Exception as e:
323-
self.tool.stream_log(
324-
f"Error deleting nodes for {doc_id}: {e}",
325-
level=LogLevel.ERROR,
326-
)
327-
raise SdkError(f"Error deleting nodes for {doc_id}: {e}") from e
296+
self.index_to_vector_db(
297+
vector_db=vector_db,
298+
embedding=embedding,
299+
chunk_size=chunk_size,
300+
chunk_overlap=chunk_overlap,
301+
doc_id=doc_id,
302+
text_to_idx=extracted_text,
303+
doc_id_found=doc_id_found,
304+
)
305+
return doc_id
306+
finally:
307+
vector_db.close()
308+
309+
@log_elapsed(operation="INDEXING")
310+
def index_to_vector_db(
311+
self,
312+
vector_db: VectorDB,
313+
embedding: Embedding,
314+
chunk_size: int,
315+
chunk_overlap: int,
316+
text_to_idx: str,
317+
doc_id: str,
318+
doc_id_found: bool,
319+
):
320+
self.tool.stream_log("Indexing file...")
321+
full_text = [
322+
{
323+
"section": "full",
324+
"text_contents": text_to_idx,
325+
}
326+
]
327+
# Check if chunking is required
328+
documents = []
329+
for item in full_text:
330+
text = item["text_contents"]
331+
document = Document(
332+
text=text,
333+
doc_id=doc_id,
334+
metadata={"section": item["section"]},
335+
)
336+
document.id_ = doc_id
337+
documents.append(document)
338+
self.tool.stream_log(f"Number of documents: {len(documents)}")
328339

340+
if doc_id_found:
341+
# Delete the nodes for the doc_id
329342
try:
330-
if chunk_size == 0:
331-
parser = SentenceSplitter.from_defaults(
332-
chunk_size=len(documents[0].text) + 10,
333-
chunk_overlap=0,
334-
callback_manager=embedding.get_callback_manager(),
335-
)
336-
nodes = parser.get_nodes_from_documents(
337-
documents, show_progress=True
338-
)
339-
node = nodes[0]
340-
node.embedding = embedding.get_query_embedding(" ")
341-
vector_db.add(doc_id, nodes=[node])
342-
self.tool.stream_log("Added node to vector db")
343-
else:
344-
self.tool.stream_log("Adding nodes to vector db...")
345-
# TODO: Phase 2:
346-
# Post insertion to VDB, use query using doc_id and
347-
# store all the VDB ids to a table against the doc_id
348-
# During deletion for cases where metadata filtering
349-
# does not work, these ids can be used for direct deletion
350-
# This new table will also act like an audit trail for
351-
# all nodes that were added to the VDB by Unstract
352-
# Once this is in place, the overridden implementation
353-
# of prefixing ids with doc_id before adding to VDB
354-
# can be removed
355-
vector_db.index_document(
356-
documents,
357-
chunk_size=chunk_size,
358-
chunk_overlap=chunk_overlap,
359-
show_progress=True,
360-
)
343+
vector_db.delete(ref_doc_id=doc_id)
344+
self.tool.stream_log(f"Deleted nodes for {doc_id}")
361345
except Exception as e:
362346
self.tool.stream_log(
363-
f"Error adding nodes to vector db: {e}",
347+
f"Error deleting nodes for {doc_id}: {e}",
364348
level=LogLevel.ERROR,
365349
)
366-
raise IndexingError(str(e)) from e
350+
raise SdkError(f"Error deleting nodes for {doc_id}: {e}") from e
367351

368-
self.tool.stream_log("File has been indexed successfully")
369-
return doc_id
370-
finally:
371-
vector_db.close()
352+
try:
353+
if chunk_size == 0:
354+
parser = SentenceSplitter.from_defaults(
355+
chunk_size=len(documents[0].text) + 10,
356+
chunk_overlap=0,
357+
callback_manager=embedding.get_callback_manager(),
358+
)
359+
nodes = parser.get_nodes_from_documents(documents, show_progress=True)
360+
node = nodes[0]
361+
node.embedding = embedding.get_query_embedding(" ")
362+
vector_db.add(doc_id, nodes=[node])
363+
self.tool.stream_log("Added node to vector db")
364+
else:
365+
self.tool.stream_log("Adding nodes to vector db...")
366+
# TODO: Phase 2:
367+
# Post insertion to VDB, use query using doc_id and
368+
# store all the VDB ids to a table against the doc_id
369+
# During deletion for cases where metadata filtering
370+
# does not work, these ids can be used for direct deletion
371+
# This new table will also act like an audit trail for
372+
# all nodes that were added to the VDB by Unstract
373+
# Once this is in place, the overridden implementation
374+
# of prefixing ids with doc_id before adding to VDB
375+
# can be removed
376+
vector_db.index_document(
377+
documents,
378+
chunk_size=chunk_size,
379+
chunk_overlap=chunk_overlap,
380+
show_progress=True,
381+
)
382+
except Exception as e:
383+
self.tool.stream_log(
384+
f"Error adding nodes to vector db: {e}",
385+
level=LogLevel.ERROR,
386+
)
387+
raise IndexingError(str(e)) from e
388+
389+
self.tool.stream_log("File has been indexed successfully")
390+
return
372391

373392
def generate_index_key(
374393
self,

src/unstract/sdk/prompt.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from unstract.sdk.constants import LogLevel, PromptStudioKeys, ToolEnv
88
from unstract.sdk.helper import SdkHelper
99
from unstract.sdk.tool.base import BaseTool
10+
from unstract.sdk.utils.common_utils import log_elapsed
1011

1112
logger = logging.getLogger(__name__)
1213

@@ -33,6 +34,7 @@ def __init__(
3334
if not is_public_call:
3435
self.bearer_token = tool.get_env_or_die(ToolEnv.PLATFORM_API_KEY)
3536

37+
@log_elapsed(operation="ANSWER_PROMPTS")
3638
def answer_prompt(
3739
self, payload: dict[str, Any], params: Optional[dict[str, str]] = None
3840
) -> dict[str, Any]:
@@ -97,10 +99,7 @@ def _post_call(
9799
response: Response = Response()
98100
try:
99101
response = requests.post(
100-
url=url,
101-
json=payload,
102-
params=params,
103-
headers=headers
102+
url=url, json=payload, params=params, headers=headers
104103
)
105104
response.raise_for_status()
106105
result["status"] = "OK"

0 commit comments

Comments
 (0)