11import json
2+ import math
23import os
34import re
45import time
1920from unstructured_ingest .v2 .logger import logger
2021from unstructured_ingest .v2 .processes .connectors .pinecone import (
2122 CONNECTOR_TYPE ,
23+ MAX_QUERY_RESULTS ,
2224 PineconeAccessConfig ,
2325 PineconeConnectionConfig ,
2426 PineconeUploader ,
@@ -118,7 +120,10 @@ def validate_pinecone_index(
118120 f"retry attempt { i } : expected { expected_num_of_vectors } != vector count { vector_count } "
119121 )
120122 time .sleep (interval )
121- assert vector_count == expected_num_of_vectors
123+ assert vector_count == expected_num_of_vectors , (
124+ f"vector count from index ({ vector_count } ) doesn't "
125+ f"match expected number: { expected_num_of_vectors } "
126+ )
122127
123128
124129@requires_env (API_KEY )
@@ -147,10 +152,7 @@ async def test_pinecone_destination(pinecone_index: str, upload_file: Path, temp
147152 uploader = PineconeUploader (connection_config = connection_config , upload_config = upload_config )
148153 uploader .precheck ()
149154
150- if uploader .is_async ():
151- await uploader .run_async (path = new_upload_file , file_data = file_data )
152- else :
153- uploader .run (path = new_upload_file , file_data = file_data )
155+ uploader .run (path = new_upload_file , file_data = file_data )
154156 with new_upload_file .open () as f :
155157 staged_content = json .load (f )
156158 expected_num_of_vectors = len (staged_content )
@@ -160,10 +162,59 @@ async def test_pinecone_destination(pinecone_index: str, upload_file: Path, temp
160162 )
161163
162164 # Rerun uploader and make sure no duplicates exist
163- if uploader .is_async ():
164- await uploader .run_async (path = new_upload_file , file_data = file_data )
165- else :
166- uploader .run (path = new_upload_file , file_data = file_data )
165+ uploader .run (path = new_upload_file , file_data = file_data )
166+ logger .info ("validating second upload" )
167+ validate_pinecone_index (
168+ index_name = pinecone_index , expected_num_of_vectors = expected_num_of_vectors
169+ )
170+
171+
@requires_env(API_KEY)
@pytest.mark.asyncio
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
@pytest.mark.skip(reason="TODO: get this to work")
async def test_pinecone_destination_large_index(
    pinecone_index: str, upload_file: Path, temp_dir: Path
):
    """Exercise the uploader against an index larger than MAX_QUERY_RESULTS.

    Builds a payload of exactly 2 * MAX_QUERY_RESULTS records by repeating the
    fixture content, uploads it, and checks the index reports the expected
    vector count. The uploader is then rerun with the same file to confirm the
    second pass does not create duplicate vectors.
    """
    # Repeat the fixture content enough times to exceed the target size,
    # then trim to an exact record count.
    new_file = temp_dir / "large_file.json"
    with upload_file.open() as f:
        upload_content = json.load(f)

    min_entries = math.ceil((MAX_QUERY_RESULTS * 2) / len(upload_content))
    new_content = (upload_content * min_entries)[: (2 * MAX_QUERY_RESULTS)]
    # Use the module logger rather than print() for consistency with the
    # rest of this file's progress reporting.
    logger.info(f"Creating large index content with {len(new_content)} records")
    with new_file.open("w") as f:
        json.dump(new_content, f)

    expected_num_of_vectors = len(new_content)
    file_data = FileData(
        source_identifiers=SourceIdentifiers(fullpath=new_file.name, filename=new_file.name),
        connector_type=CONNECTOR_TYPE,
        identifier="pinecone_mock_id",
    )
    connection_config = PineconeConnectionConfig(
        index_name=pinecone_index,
        access_config=PineconeAccessConfig(api_key=get_api_key()),
    )
    # Stage the raw elements into the upload format expected by the uploader.
    stager_config = PineconeUploadStagerConfig()
    stager = PineconeUploadStager(upload_stager_config=stager_config)
    new_upload_file = stager.run(
        elements_filepath=new_file,
        output_dir=temp_dir,
        output_filename=new_file.name,
        file_data=file_data,
    )

    upload_config = PineconeUploaderConfig()
    uploader = PineconeUploader(connection_config=connection_config, upload_config=upload_config)
    uploader.precheck()

    uploader.run(path=new_upload_file, file_data=file_data)
    validate_pinecone_index(
        index_name=pinecone_index, expected_num_of_vectors=expected_num_of_vectors
    )
    # Rerun uploader and make sure no duplicates exist
    uploader.run(path=new_upload_file, file_data=file_data)
    logger.info("validating second upload")
    validate_pinecone_index(
        index_name=pinecone_index, expected_num_of_vectors=expected_num_of_vectors
    )
0 commit comments