Skip to content

Commit 5914bd9

Browse files
feat/vectara-destination-to-v2 (#158)
* vectara v2 still work in progress * taking errors out * fix lint * linting * order imports * ruff --fix * fix version * fixing PR issues by Roman * ruff * working get metadata on stager * remove comments * changing conform dict to the correct way and including vdoc on stager part * taking path out from file by Potter chat to leave only url, even though if empty * fix potter comments * change version * version update * make tidy * add secret to access config * make tidy * . * get secret value * change version * add async * change import * vectara requirements * make tidy * mt * fix PR comments * vectara example to be able to debug async * no wait worn * improving logging * precheck without async. * make tidy * lint * linting * some fixes vectara * migrate to vectara v2 api * add integration test for vectara * fix syntax * divide elements to batches * Add retry logic to document query * change integration test to regular function * clean up corpus after integration test * fix syntax error * update connection config in example * Remove unnecessary var * remove batch_size because Vectara api does not support batch indexing documents * remove asyncio.run to avoid conflict with async context * update stager to reflect new structure * update uploader to reflect new structure * fix syntax --------- Co-authored-by: Bryan Chen <[email protected]>
1 parent a0923db commit 5914bd9

File tree

8 files changed

+688
-2
lines changed

8 files changed

+688
-2
lines changed

CHANGELOG.md

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,9 @@
1+
## 0.3.12-dev3
2+
3+
### Enhancements
4+
5+
* **Migrate Vectara Destination Connector to v2**
6+
17
## 0.3.12-dev2
28

39
### Enhancements
@@ -20,7 +26,6 @@
2026
* **Create more reflective custom errors** Provide errors to indicate if the error was due to something user provided or due to a provider issue, applicable to all steps in the pipeline.
2127
* **Bypass asyncio exception grouping to return more meaningful errors from OneDrive indexer**
2228

23-
2429
## 0.3.11
2530

2631
### Enhancements

requirements/connectors/vectara.in

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
11
-c ../common/constraints.txt
22

33
requests
4+
aiofiles
5+
httpx

requirements/connectors/vectara.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@ idna==3.10
88
# via requests
99
requests==2.32.3
1010
# via -r ./connectors/vectara.in
11+
aiofiles==24.1.0
12+
# via -r ./connectors/vectara.in
1113
urllib3==1.26.20
1214
# via
1315
# -c ./connectors/../common/constraints.txt
Lines changed: 270 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,270 @@
1+
import json
2+
import os
3+
import time
4+
from pathlib import Path
5+
from typing import Generator
6+
from uuid import uuid4
7+
8+
import pytest
9+
import requests
10+
11+
from test.integration.connectors.utils.constants import DESTINATION_TAG
12+
from test.integration.utils import requires_env
13+
from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
14+
from unstructured_ingest.v2.logger import logger
15+
from unstructured_ingest.v2.processes.connectors.vectara import (
16+
CONNECTOR_TYPE as VECTARA_CONNECTOR_TYPE,
17+
)
18+
from unstructured_ingest.v2.processes.connectors.vectara import (
19+
VectaraAccessConfig,
20+
VectaraConnectionConfig,
21+
VectaraUploader,
22+
VectaraUploaderConfig,
23+
VectaraUploadStager,
24+
VectaraUploadStagerConfig,
25+
)
26+
27+
28+
def validate_upload(response: dict, expected_data: dict):
    """Assert that the first search result matches the element that was uploaded.

    Args:
        response: Parsed JSON body of a corpus query. It must contain a
            ``search_results`` list whose items carry ``text`` and
            ``part_metadata``.
        expected_data: The source element, with ``element_id``, ``text`` and a
            ``metadata`` dict holding ``filename``, ``filetype`` and
            ``page_number``.

    Raises:
        AssertionError: If no search results were returned or any compared
            field differs from the expected element.
    """
    search_results = response["search_results"]
    # Fail with a readable message rather than an IndexError on an empty list.
    assert search_results, "query returned no search results"

    top_hit = search_results[0]
    assert top_hit is not None
    assert top_hit["text"] == expected_data["text"]

    part_metadata = top_hit["part_metadata"]
    assert part_metadata["element_id"] == expected_data["element_id"]

    expected_metadata = expected_data["metadata"]
    for key in ("filename", "filetype", "page_number"):
        assert part_metadata[key] == expected_metadata[key], f"mismatch on {key}"
43+
44+
45+
@requires_env("VECTARA_OAUTH_CLIENT_ID", "VECTARA_OAUTH_SECRET", "VECTARA_CUSTOMER_ID")
def _get_jwt_token():
    """Request an access token from the customer's OAuth token endpoint.

    The customer id, client id and client secret are read from the
    environment and posted as a ``client_credentials`` grant.

    Returns:
        The ``access_token`` field of the JSON response, or ``None`` when the
        response does not contain one.

    Raises:
        requests.HTTPError: If the token endpoint answers with an error status.
    """
    customer_id = os.environ["VECTARA_CUSTOMER_ID"]
    form_fields = {
        "grant_type": "client_credentials",
        "client_id": os.environ["VECTARA_OAUTH_CLIENT_ID"],
        "client_secret": os.environ["VECTARA_OAUTH_SECRET"],
    }

    token_response = requests.post(
        f"https://vectara-prod-{customer_id}.auth.us-west-2.amazoncognito.com/oauth2/token",
        headers={"Content-Type": "application/x-www-form-urlencoded"},
        data=form_fields,
    )
    token_response.raise_for_status()

    return token_response.json().get("access_token")
66+
67+
68+
def query_data(corpus_key: str, element_id: str) -> dict:
    """Query a corpus for the document parts tagged with ``element_id``.

    Args:
        corpus_key: Key of the corpus to search.
        element_id: Value matched against the ``part.element_id`` filter.

    Returns:
        The parsed JSON body of the query response.

    Raises:
        requests.HTTPError: If the query endpoint answers with an error status.
    """
    # The metadata filter only works when the corpus declares a filter
    # attribute for element_id.
    search_options = {
        "metadata_filter": f"part.element_id = '{element_id}'",
        "lexical_interpolation": 1,
        "limit": 10,
    }
    payload = json.dumps({"query": "string", "search": search_options})

    token = _get_jwt_token()
    query_response = requests.post(
        f"https://api.vectara.io/v2/corpora/{corpus_key}/query",
        headers={
            "Content-Type": "application/json",
            "Accept": "application/json",
            "Authorization": f"Bearer {token}",
            "X-source": "unstructured",
        },
        data=payload,
    )
    query_response.raise_for_status()

    return query_response.json()
98+
99+
100+
def create_corpora(corpus_key: str, corpus_name: str) -> None:
    """Create a corpus with the given key and name.

    Args:
        corpus_key: Key to assign to the new corpus.
        corpus_name: Display name of the new corpus.

    Raises:
        requests.HTTPError: If the create request answers with an error status.
    """
    corpus_spec = {"key": corpus_key, "name": corpus_name, "description": "integration test"}
    payload = json.dumps(corpus_spec)

    token = _get_jwt_token()
    create_response = requests.post(
        "https://api.vectara.io/v2/corpora",
        headers={
            "Content-Type": "application/json",
            "Accept": "application/json",
            "Authorization": f"Bearer {token}",
            "X-source": "unstructured",
        },
        data=payload,
    )
    create_response.raise_for_status()
113+
114+
115+
def replace_filter_attributes(corpus_key: str) -> None:
    """Set the corpus filter attributes to a single indexed ``element_id``.

    The attribute is declared at ``part`` level with type ``text``.

    Args:
        corpus_key: Key of the corpus to update.

    Raises:
        requests.HTTPError: If the request answers with an error status.
    """
    element_id_attribute = {
        "name": "element_id",
        "level": "part",
        "indexed": True,
        "type": "text",
    }
    payload = json.dumps({"filter_attributes": [element_id_attribute]})

    token = _get_jwt_token()
    replace_response = requests.post(
        f"https://api.vectara.io/v2/corpora/{corpus_key}/replace_filter_attributes",
        headers={
            "Content-Type": "application/json",
            "Accept": "application/json",
            "Authorization": f"Bearer {token}",
            "X-source": "unstructured",
        },
        data=payload,
    )
    replace_response.raise_for_status()
134+
135+
136+
def delete_corpora(corpus_key: str) -> None:
    """Delete the corpus identified by ``corpus_key``.

    Args:
        corpus_key: Key of the corpus to delete.

    Raises:
        requests.HTTPError: If the delete request answers with an error status.
    """
    token = _get_jwt_token()
    delete_response = requests.delete(
        f"https://api.vectara.io/v2/corpora/{corpus_key}",
        headers={
            "Content-Type": "application/json",
            "Accept": "application/json",
            "Authorization": f"Bearer {token}",
            "X-source": "unstructured",
        },
    )
    delete_response.raise_for_status()
149+
150+
151+
def list_corpora() -> list:
    """Return the keys of the corpora listed by the API.

    Only one request is made, with ``limit=100``, so at most that many
    corpora are reported.

    Returns:
        A list of corpus keys; empty when the response has no ``corpora``.

    Raises:
        requests.HTTPError: If the list request answers with an error status.
    """
    token = _get_jwt_token()
    list_response = requests.get(
        "https://api.vectara.io/v2/corpora?limit=100",
        headers={
            "Content-Type": "application/json",
            "Accept": "application/json",
            "Authorization": f"Bearer {token}",
            "X-source": "unstructured",
        },
    )
    list_response.raise_for_status()

    # A missing, null or empty "corpora" entry all collapse to an empty list.
    corpora = list_response.json().get("corpora") or []
    return [corpus["key"] for corpus in corpora]
167+
168+
169+
def wait_for_ready(corpus_key: str, timeout=60, interval=2) -> None:
    """Poll the corpus listing until ``corpus_key`` appears.

    The listing is checked at least once, even when ``timeout`` is zero.

    Args:
        corpus_key: Key of the corpus to wait for.
        timeout: Seconds after which polling stops.
        interval: Seconds to sleep between checks.

    Raises:
        TimeoutError: If the corpus is still not listed once the timeout elapses.
    """
    began = time.time()
    while True:
        if corpus_key in list_corpora():
            return
        # Stop polling once the time budget is used up.
        if not (time.time() - began < timeout):
            break
        time.sleep(interval)
    raise TimeoutError("time out waiting for corpus to be ready")
181+
182+
183+
def wait_for_delete(corpus_key: str, timeout=60, interval=2) -> None:
    """Poll the corpus listing until ``corpus_key`` is no longer present.

    Args:
        corpus_key: Key of the corpus expected to disappear.
        timeout: Seconds after which polling stops.
        interval: Seconds to sleep between checks.

    Raises:
        TimeoutError: If the corpus is still listed once the timeout elapses.
    """
    began = time.time()
    while True:
        # The time budget is checked before every listing request.
        if not (time.time() - began < timeout):
            raise TimeoutError("time out waiting for corpus to delete")
        if corpus_key not in list_corpora():
            return
        time.sleep(interval)
192+
193+
194+
@pytest.fixture
def corpora_util() -> Generator[str, None, None]:
    """Provide a freshly created corpus and delete it after the test.

    A corpus with a random ``ingest-test-<id>`` key is created, given the
    ``element_id`` filter attribute, and awaited until it is listed. The key
    is yielded to the test; afterwards the corpus is deleted and its
    disappearance awaited.

    Yields:
        The key of the temporary corpus.

    Raises:
        Exception: Any setup failure is logged and re-raised so the test
            errors with the real cause instead of a "did not yield" error.
    """
    random_id = str(uuid4()).split("-")[0]
    corpus_key = f"ingest-test-{random_id}"
    corpus_name = "ingest-test"
    logger.info(f"Creating corpus with key: {corpus_key}")

    # Records whether there is anything to clean up: deleting a corpus that
    # was never created would raise and mask the original setup error.
    created = False
    try:
        try:
            create_corpora(corpus_key, corpus_name)
            created = True
            replace_filter_attributes(corpus_key)
            wait_for_ready(corpus_key=corpus_key)
        except Exception as e:
            logger.error(f"failed to create corpus {corpus_key}: {e}")
            raise
        yield corpus_key
    finally:
        if created:
            logger.info(f"deleting corpus: {corpus_key}")
            delete_corpora(corpus_key)
            wait_for_delete(corpus_key=corpus_key)
211+
212+
213+
@pytest.mark.asyncio
@pytest.mark.tags(VECTARA_CONNECTOR_TYPE, DESTINATION_TAG, "vectara")
@requires_env("VECTARA_OAUTH_CLIENT_ID", "VECTARA_OAUTH_SECRET", "VECTARA_CUSTOMER_ID")
async def test_vectara_destination(
    upload_file: Path, tmp_path: Path, corpora_util: str, retries=30, interval=10
):
    """Stage and upload a file to a temporary corpus, then check it is queryable.

    Args:
        upload_file: Path to the JSON file of elements to upload.
        tmp_path: Directory the stager writes its output into.
        corpora_util: Key of the temporary corpus provided by the fixture.
        retries: Maximum number of query attempts while waiting for indexing.
        interval: Seconds to sleep between query attempts.
    """
    corpus_key = corpora_util
    connection_kwargs = {
        "customer_id": os.environ["VECTARA_CUSTOMER_ID"],
        "corpus_key": corpus_key,
    }

    oauth_client_id = os.environ["VECTARA_OAUTH_CLIENT_ID"]
    oauth_secret = os.environ["VECTARA_OAUTH_SECRET"]

    file_data = FileData(
        source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
        connector_type=VECTARA_CONNECTOR_TYPE,
        identifier="mock-file-data",
    )

    stager_config = VectaraUploadStagerConfig(batch_size=10)
    stager = VectaraUploadStager(upload_stager_config=stager_config)
    new_upload_file = stager.run(
        elements_filepath=upload_file,
        output_dir=tmp_path,
        output_filename=upload_file.name,
        file_data=file_data,
    )

    uploader = VectaraUploader(
        connection_config=VectaraConnectionConfig(
            **connection_kwargs,
            access_config=VectaraAccessConfig(
                oauth_client_id=oauth_client_id, oauth_secret=oauth_secret
            ),
        ),
        upload_config=VectaraUploaderConfig(),
    )

    with new_upload_file.open() as new_upload_fp:
        elements_stager = json.load(new_upload_fp)

    # NOTE(review): nothing is uploaded when the uploader is not async, so the
    # query below would then exhaust its retries — confirm no sync path is needed.
    if uploader.is_async():
        await uploader.run_data_async(data=elements_stager, file_data=file_data)

    with upload_file.open() as upload_fp:
        elements = json.load(upload_fp)
    first_element = elements[0]

    # Indexing is not immediate, so poll until the element becomes searchable.
    # The placeholder keeps `response` bound even when retries is 0.
    response = {"search_results": []}
    for attempt in range(retries):
        response = query_data(corpus_key, first_element["element_id"])
        if response["search_results"]:
            break
        # No point in sleeping after the final attempt.
        if attempt < retries - 1:
            time.sleep(interval)

    assert response[
        "search_results"
    ], f"element {first_element['element_id']} not found after {retries} attempts"
    validate_upload(response=response, expected_data=first_element)

unstructured_ingest/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.3.12-dev2" # pragma: no cover
1+
__version__ = "0.3.12-dev3" # pragma: no cover
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
from pathlib import Path
2+
3+
from unstructured_ingest.v2.interfaces import ProcessorConfig
4+
from unstructured_ingest.v2.logger import logger
5+
from unstructured_ingest.v2.pipeline.pipeline import Pipeline
6+
from unstructured_ingest.v2.processes.chunker import ChunkerConfig
7+
from unstructured_ingest.v2.processes.connectors.local import (
8+
LocalConnectionConfig,
9+
LocalDownloaderConfig,
10+
LocalIndexerConfig,
11+
)
12+
from unstructured_ingest.v2.processes.connectors.vectara import (
13+
CONNECTOR_TYPE,
14+
VectaraAccessConfig,
15+
VectaraConnectionConfig,
16+
VectaraUploaderConfig,
17+
VectaraUploadStagerConfig,
18+
)
19+
from unstructured_ingest.v2.processes.embedder import EmbedderConfig
20+
from unstructured_ingest.v2.processes.partitioner import PartitionerConfig
21+
22+
# Directory layout used by the example: inputs are read from "example-docs"
# and all pipeline artefacts are written below "tmp_ingest/<connector type>".
base_path = Path(__file__).parent.parent.parent.parent
docs_path = base_path / "example-docs"
work_dir = base_path / "tmp_ingest" / CONNECTOR_TYPE
output_path = work_dir / "output"
download_path = work_dir / "download"

if __name__ == "__main__":
    logger.info(f"writing all content in: {work_dir.resolve()}")

    # Source side: index and "download" local files from the multisimple folder.
    processor_config = ProcessorConfig(work_dir=str(work_dir.resolve()))
    indexer_config = LocalIndexerConfig(input_path=f"{docs_path.resolve()}/multisimple/")
    downloader_config = LocalDownloaderConfig(download_dir=download_path)

    # Processing steps: partition, chunk by title, then embed.
    chunker_config = ChunkerConfig(
        chunking_strategy="by_title",
        chunk_include_orig_elements=False,
        chunk_max_characters=1500,
        chunk_multipage_sections=True,
    )

    # Destination side: replace every "fill ..." placeholder with real
    # credentials and corpus details before running.
    vectara_access = VectaraAccessConfig(
        oauth_client_id="fill oauth_client_id", oauth_secret="fill oauth_secret"
    )
    vectara_connection = VectaraConnectionConfig(
        access_config=vectara_access,
        customer_id="fill customer_id",
        corpus_name="fill corpus_name",
        corpus_key="fill corpus_key",
        token_url="fill token_url",
    )

    pipeline = Pipeline.from_configs(
        context=processor_config,
        indexer_config=indexer_config,
        downloader_config=downloader_config,
        source_connection_config=LocalConnectionConfig(),
        partitioner_config=PartitionerConfig(strategy="fast"),
        chunker_config=chunker_config,
        embedder_config=EmbedderConfig(embedding_provider="huggingface"),
        destination_connection_config=vectara_connection,
        stager_config=VectaraUploadStagerConfig(batch_size=10),
        uploader_config=VectaraUploaderConfig(),
    )
    pipeline.run()

unstructured_ingest/v2/processes/connectors/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,8 @@
5656
from .sharepoint import sharepoint_source_entry
5757
from .slack import CONNECTOR_TYPE as SLACK_CONNECTOR_TYPE
5858
from .slack import slack_source_entry
59+
from .vectara import CONNECTOR_TYPE as VECTARA_CONNECTOR_TYPE
60+
from .vectara import vectara_destination_entry
5961

6062
add_source_entry(source_type=ASTRA_DB_CONNECTOR_TYPE, entry=astra_db_source_entry)
6163
add_destination_entry(destination_type=ASTRA_DB_CONNECTOR_TYPE, entry=astra_db_destination_entry)
@@ -103,6 +105,7 @@
103105

104106
add_source_entry(source_type=SLACK_CONNECTOR_TYPE, entry=slack_source_entry)
105107

108+
add_destination_entry(destination_type=VECTARA_CONNECTOR_TYPE, entry=vectara_destination_entry)
106109
add_source_entry(source_type=CONFLUENCE_CONNECTOR_TYPE, entry=confluence_source_entry)
107110

108111
add_destination_entry(destination_type=REDIS_CONNECTOR_TYPE, entry=redis_destination_entry)

0 commit comments

Comments
 (0)