|
| 1 | +import json |
| 2 | +import os |
| 3 | +import time |
| 4 | +from pathlib import Path |
| 5 | +from typing import Generator |
| 6 | +from uuid import uuid4 |
| 7 | + |
| 8 | +import pytest |
| 9 | +import requests |
| 10 | + |
| 11 | +from test.integration.connectors.utils.constants import DESTINATION_TAG |
| 12 | +from test.integration.utils import requires_env |
| 13 | +from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers |
| 14 | +from unstructured_ingest.v2.logger import logger |
| 15 | +from unstructured_ingest.v2.processes.connectors.vectara import ( |
| 16 | + CONNECTOR_TYPE as VECTARA_CONNECTOR_TYPE, |
| 17 | +) |
| 18 | +from unstructured_ingest.v2.processes.connectors.vectara import ( |
| 19 | + VectaraAccessConfig, |
| 20 | + VectaraConnectionConfig, |
| 21 | + VectaraUploader, |
| 22 | + VectaraUploaderConfig, |
| 23 | + VectaraUploadStager, |
| 24 | + VectaraUploadStagerConfig, |
| 25 | +) |
| 26 | + |
| 27 | + |
def validate_upload(response: dict, expected_data: dict):
    """Assert that the first search hit in ``response`` matches ``expected_data``.

    Args:
        response: Decoded query response. Must contain a ``search_results`` list whose
            entries carry ``text`` and a ``part_metadata`` dict.
        expected_data: The element that was uploaded, with ``element_id``, ``text`` and a
            ``metadata`` dict holding ``filename``, ``filetype`` and ``page_number``.

    Raises:
        AssertionError: If no search results were returned, or if any compared field of
            the first result differs from the expected element.
    """
    element_id = expected_data["element_id"]
    expected_text = expected_data["text"]
    expected_metadata = expected_data["metadata"]

    search_results = response["search_results"]
    # Fail with a readable message instead of an IndexError when nothing was indexed.
    assert search_results, f"no search results returned for element_id {element_id!r}"

    first_result = search_results[0]
    part_metadata = first_result["part_metadata"]

    assert first_result["text"] == expected_text
    assert part_metadata["element_id"] == element_id
    assert part_metadata["filename"] == expected_metadata["filename"]
    assert part_metadata["filetype"] == expected_metadata["filetype"]
    assert part_metadata["page_number"] == expected_metadata["page_number"]
| 43 | + |
| 44 | + |
@requires_env("VECTARA_OAUTH_CLIENT_ID", "VECTARA_OAUTH_SECRET", "VECTARA_CUSTOMER_ID")
def _get_jwt_token():
    """Request an OAuth client-credentials token and return its ``access_token``.

    The token endpoint is derived from ``VECTARA_CUSTOMER_ID``; the client id and secret
    are read from ``VECTARA_OAUTH_CLIENT_ID`` and ``VECTARA_OAUTH_SECRET``.

    Returns:
        The ``access_token`` field of the JSON response, or ``None`` if the field is absent.

    Raises:
        requests.HTTPError: If the token endpoint answers with an error status.
        requests.Timeout: If the endpoint does not respond within the timeout.
    """
    customer_id = os.environ["VECTARA_CUSTOMER_ID"]
    token_endpoint = (
        f"https://vectara-prod-{customer_id}.auth.us-west-2.amazoncognito.com/oauth2/token"
    )
    headers = {
        "Content-Type": "application/x-www-form-urlencoded",
    }
    data = {
        "grant_type": "client_credentials",
        "client_id": os.environ["VECTARA_OAUTH_CLIENT_ID"],
        "client_secret": os.environ["VECTARA_OAUTH_SECRET"],
    }

    # requests has no default timeout; bound the call so a stalled endpoint cannot hang CI.
    response = requests.post(token_endpoint, headers=headers, data=data, timeout=60)
    response.raise_for_status()
    response_json = response.json()

    return response_json.get("access_token")
| 66 | + |
| 67 | + |
def query_data(corpus_key: str, element_id: str) -> dict:
    """Query a corpus for parts whose ``element_id`` metadata equals ``element_id``.

    Args:
        corpus_key: Key of the corpus to query.
        element_id: Value matched against the ``part.element_id`` filter attribute.

    Returns:
        The decoded JSON body of the query response.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If the API does not respond within the timeout.
    """
    url = f"https://api.vectara.io/v2/corpora/{corpus_key}/query"

    # the query below requires the corpus to have filter attributes for element_id
    data = json.dumps(
        {
            "query": "string",
            "search": {
                "metadata_filter": f"part.element_id = '{element_id}'",
                "lexical_interpolation": 1,
                "limit": 10,
            },
        }
    )

    jwt_token = _get_jwt_token()
    headers = {
        "Content-Type": "application/json",
        "Accept": "application/json",
        "Authorization": f"Bearer {jwt_token}",
        "X-source": "unstructured",
    }

    # requests has no default timeout; bound the call so a stalled API cannot hang the test.
    response = requests.post(url, headers=headers, data=data, timeout=60)
    response.raise_for_status()

    return response.json()
| 98 | + |
| 99 | + |
def create_corpora(corpus_key: str, corpus_name: str) -> None:
    """Create a corpus with the given key and name, described as "integration test".

    Args:
        corpus_key: Key assigned to the new corpus.
        corpus_name: Display name assigned to the new corpus.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If the API does not respond within the timeout.
    """
    url = "https://api.vectara.io/v2/corpora"
    data = json.dumps({"key": corpus_key, "name": corpus_name, "description": "integration test"})
    jwt_token = _get_jwt_token()
    headers = {
        "Content-Type": "application/json",
        "Accept": "application/json",
        "Authorization": f"Bearer {jwt_token}",
        "X-source": "unstructured",
    }

    # requests has no default timeout; bound the call so a stalled API cannot hang the test.
    response = requests.post(url, headers=headers, data=data, timeout=60)
    response.raise_for_status()
| 113 | + |
| 114 | + |
def replace_filter_attributes(corpus_key: str) -> None:
    """Replace the corpus filter attributes with a single indexed ``element_id`` attribute.

    The attribute is declared at ``part`` level with type ``text``; ``query_data`` filters
    on it via ``part.element_id``.

    Args:
        corpus_key: Key of the corpus whose filter attributes are replaced.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If the API does not respond within the timeout.
    """
    url = f"https://api.vectara.io/v2/corpora/{corpus_key}/replace_filter_attributes"
    data = json.dumps(
        {
            "filter_attributes": [
                {"name": "element_id", "level": "part", "indexed": True, "type": "text"}
            ]
        }
    )
    jwt_token = _get_jwt_token()
    headers = {
        "Content-Type": "application/json",
        "Accept": "application/json",
        "Authorization": f"Bearer {jwt_token}",
        "X-source": "unstructured",
    }

    # requests has no default timeout; bound the call so a stalled API cannot hang the test.
    response = requests.post(url, headers=headers, data=data, timeout=60)
    response.raise_for_status()
| 134 | + |
| 135 | + |
def delete_corpora(corpus_key: str) -> None:
    """Delete the corpus identified by ``corpus_key``.

    Args:
        corpus_key: Key of the corpus to delete.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If the API does not respond within the timeout.
    """
    url = f"https://api.vectara.io/v2/corpora/{corpus_key}"

    jwt_token = _get_jwt_token()
    headers = {
        "Content-Type": "application/json",
        "Accept": "application/json",
        "Authorization": f"Bearer {jwt_token}",
        "X-source": "unstructured",
    }

    # requests has no default timeout; bound the call so a stalled API cannot hang teardown.
    response = requests.delete(url, headers=headers, timeout=60)
    response.raise_for_status()
| 149 | + |
| 150 | + |
def list_corpora() -> list:
    """Return the keys of the corpora listed by the API (a single request, ``limit=100``).

    Returns:
        A list with the ``key`` of every corpus in the response, or an empty list when the
        response has no (or an empty) ``corpora`` field.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If the API does not respond within the timeout.
    """
    # NOTE(review): only one page of up to 100 corpora is requested; an account holding
    # more corpora than that may not see every key here - confirm against the API paging.
    url = "https://api.vectara.io/v2/corpora?limit=100"
    jwt_token = _get_jwt_token()
    headers = {
        "Content-Type": "application/json",
        "Accept": "application/json",
        "Authorization": f"Bearer {jwt_token}",
        "X-source": "unstructured",
    }
    # requests has no default timeout; bound the call so a stalled API cannot hang polling.
    response = requests.get(url, headers=headers, timeout=60)
    response.raise_for_status()
    corpora = response.json().get("corpora") or []
    return [item["key"] for item in corpora]
| 167 | + |
| 168 | + |
def wait_for_ready(corpus_key: str, timeout=60, interval=2) -> None:
    """Poll ``list_corpora`` until ``corpus_key`` is listed.

    The listing is checked at least once, then re-checked every ``interval`` seconds for
    as long as fewer than ``timeout`` seconds have elapsed.

    Raises:
        TimeoutError: If the corpus is still not listed once the timeout has elapsed.
    """
    began_at = time.time()
    while True:
        if corpus_key in list_corpora():
            return
        # Give up only after a check has failed and the time budget is spent.
        if time.time() - began_at >= timeout:
            break
        time.sleep(interval)
    raise TimeoutError("time out waiting for corpus to be ready")
| 181 | + |
| 182 | + |
def wait_for_delete(corpus_key: str, timeout=60, interval=2) -> None:
    """Poll ``list_corpora`` until ``corpus_key`` is no longer listed.

    While fewer than ``timeout`` seconds have elapsed, the listing is checked and, if the
    corpus is still present, the function sleeps ``interval`` seconds before retrying.

    Raises:
        TimeoutError: If the corpus is still listed when the timeout elapses.
    """
    began_at = time.time()
    while time.time() - began_at < timeout:
        still_listed = corpus_key in list_corpora()
        if still_listed:
            time.sleep(interval)
            continue
        return

    raise TimeoutError("time out waiting for corpus to delete")
| 192 | + |
| 193 | + |
@pytest.fixture
def corpora_util() -> Generator[str, None, None]:
    """Create a uniquely keyed corpus for one test and delete it afterwards.

    Yields:
        The key of the freshly created corpus (``ingest-test-<random hex>``).

    Raises:
        Exception: Any error raised while creating or preparing the corpus is logged and
            re-raised, so the test reports the real setup failure.
    """
    random_id = str(uuid4()).split("-")[0]
    corpus_key = f"ingest-test-{random_id}"
    corpus_name = "ingest-test"
    logger.info(f"Creating corpus with key: {corpus_key}")
    try:
        create_corpora(corpus_key, corpus_name)
        replace_filter_attributes(corpus_key)
        wait_for_ready(corpus_key=corpus_key)
    except Exception as e:
        logger.error(f"failed to create corpus {corpus_key}: {e}")
        # The corpus may or may not exist at this point: try to remove it, but never let
        # a cleanup failure hide the original setup error.
        try:
            delete_corpora(corpus_key)
        except Exception as cleanup_error:
            logger.warning(
                f"cleanup after failed setup of corpus {corpus_key} failed: {cleanup_error}"
            )
        raise

    try:
        yield corpus_key
    finally:
        logger.info(f"deleting corpus: {corpus_key}")
        delete_corpora(corpus_key)
        wait_for_delete(corpus_key=corpus_key)
| 211 | + |
| 212 | + |
@pytest.mark.asyncio
@pytest.mark.tags(VECTARA_CONNECTOR_TYPE, DESTINATION_TAG, "vectara")
@requires_env("VECTARA_OAUTH_CLIENT_ID", "VECTARA_OAUTH_SECRET", "VECTARA_CUSTOMER_ID")
async def test_vectara_destination(
    upload_file: Path, tmp_path: Path, corpora_util: str, retries=30, interval=10
):
    """Stage and upload ``upload_file`` to a fresh corpus, then verify its first element.

    The staged file is uploaded through ``VectaraUploader.run_data_async``. The corpus is
    then queried for the first element's ``element_id`` up to ``retries`` times, sleeping
    ``interval`` seconds between attempts, and the first hit is validated.
    """
    corpus_key = corpora_util
    connection_kwargs = {
        "customer_id": os.environ["VECTARA_CUSTOMER_ID"],
        "corpus_key": corpus_key,
    }

    oauth_client_id = os.environ["VECTARA_OAUTH_CLIENT_ID"]
    oauth_secret = os.environ["VECTARA_OAUTH_SECRET"]

    file_data = FileData(
        source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
        connector_type=VECTARA_CONNECTOR_TYPE,
        identifier="mock-file-data",
    )

    stager_config = VectaraUploadStagerConfig(batch_size=10)
    stager = VectaraUploadStager(upload_stager_config=stager_config)
    new_upload_file = stager.run(
        elements_filepath=upload_file,
        output_dir=tmp_path,
        output_filename=upload_file.name,
        file_data=file_data,
    )

    uploader = VectaraUploader(
        connection_config=VectaraConnectionConfig(
            **connection_kwargs,
            access_config=VectaraAccessConfig(
                oauth_client_id=oauth_client_id, oauth_secret=oauth_secret
            ),
        ),
        upload_config=VectaraUploaderConfig(),
    )

    with new_upload_file.open() as new_upload_fp:
        elements_stager = json.load(new_upload_fp)

    if uploader.is_async():
        await uploader.run_data_async(data=elements_stager, file_data=file_data)
    else:
        # Without the async path nothing is uploaded; fail now rather than after polling.
        pytest.fail("VectaraUploader is not async, so no data was uploaded")

    with upload_file.open() as upload_fp:
        elements = json.load(upload_fp)
    first_element = elements[0]

    for attempt in range(retries):
        response = query_data(corpus_key, first_element["element_id"])
        if response["search_results"]:
            break
        # No point sleeping after the final attempt.
        if attempt < retries - 1:
            time.sleep(interval)
    else:
        pytest.fail(
            f"element {first_element['element_id']} not found in corpus {corpus_key} "
            f"after {retries} attempts"
        )

    validate_upload(response=response, expected_data=first_element)
0 commit comments