11import json
2+ import re
23from dataclasses import dataclass , field
3- from typing import TYPE_CHECKING , Any , Optional
4+ from typing import TYPE_CHECKING , Any , Literal , Optional
45
56from pydantic import Field , Secret
67
1314 AccessConfig ,
1415 ConnectionConfig ,
1516 FileData ,
16- Uploader ,
1717 UploaderConfig ,
1818 UploadStager ,
1919 UploadStagerConfig ,
20+ VectorDBUploader ,
2021)
2122from unstructured_ingest .v2 .logger import logger
2223from unstructured_ingest .v2 .processes .connector_registry import DestinationRegistryEntry
@@ -41,7 +42,7 @@ class PineconeAccessConfig(AccessConfig):
4142
4243
4344class PineconeConnectionConfig (ConnectionConfig ):
44- index_name : str = Field (description = "Name of the index to connect to." )
45+ index_name : Optional [ str ] = Field (description = "Name of the index to connect to." , default = None )
4546 access_config : Secret [PineconeAccessConfig ] = Field (
4647 default = PineconeAccessConfig (), validate_default = True
4748 )
@@ -160,18 +161,101 @@ def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
160161
161162
162163@dataclass
163- class PineconeUploader (Uploader ):
164+ class PineconeUploader (VectorDBUploader ):
164165 upload_config : PineconeUploaderConfig
165166 connection_config : PineconeConnectionConfig
166167 connector_type : str = CONNECTOR_TYPE
167168
def init(self, **kwargs: Any) -> None:
    """Prepare the destination by delegating to ``create_destination``.

    Args:
        **kwargs: Forwarded unchanged to ``create_destination``.
    """
    self.create_destination(**kwargs)
    return None
171+
def index_exists(self, index_name: Optional[str]) -> bool:
    """Report whether a Pinecone index is visible to the configured client.

    Args:
        index_name: Index to look up. When falsy, the connection config's
            ``index_name`` is used instead.

    Returns:
        True if ``describe_index`` succeeds, False if it raises
        ``NotFoundException``.

    Raises:
        DestinationConnectionError: If the lookup fails for any other reason.
    """
    from pinecone.exceptions import NotFoundException

    index_name = index_name or self.connection_config.index_name
    pc = self.connection_config.get_client()
    try:
        # Only the remote call is guarded; the success return lives below.
        pc.describe_index(index_name)
    except NotFoundException:
        return False
    except Exception as e:
        logger.error(f"failed to check if pinecone index exists : {e}")
        # Chain explicitly so the underlying client error stays attached as
        # the direct cause of the connector-level error.
        raise DestinationConnectionError(
            f"failed to check if pinecone index exists : {e}"
        ) from e
    return True
185+
def precheck(self):
    """Validate that the client can connect and the configured index exists.

    Raises:
        DestinationConnectionError: If the connectivity probe fails, or if
            an index name is configured but that index is not found.
    """
    try:
        # Connectivity probe only: the result for this placeholder name is
        # ignored; the call is made so connection failures surface here.
        self.index_exists("just-checking-our-connection")

        configured_index = self.connection_config.index_name
        if configured_index:
            if not self.index_exists(configured_index):
                raise DestinationConnectionError(
                    f"index {configured_index} does not exist"
                )
    except Exception as err:
        logger.error(f"failed to validate connection: {err}", exc_info=True)
        raise DestinationConnectionError(f"failed to validate connection: {err}")
174200
def format_destination_name(self, destination_name: str) -> str:
    """Normalize a destination name to Pinecone's allowed character set.

    The name is lowercased and every character outside ``a-z`` / ``0-9`` is
    replaced with a hyphen.

    Args:
        destination_name: Requested index name.

    Returns:
        The normalized name. Its length is left unchanged.
    """
    # Pinecone naming requirements:
    #   can only contain lowercase letters, numbers, and hyphens
    #   must be 45 characters or less
    # NOTE(review): the 45-character limit is not enforced here.
    lowered = destination_name.lower()
    return re.sub(r"[^a-z0-9]", "-", lowered)
207+
def create_destination(
    self,
    vector_length: int,
    destination_name: str = "elements",
    destination_type: Literal["pod", "serverless"] = "serverless",
    serverless_cloud: str = "aws",
    serverless_region: str = "us-west-2",
    pod_environment: str = "us-east1-gcp",
    pod_type: str = "p1.x1",
    pod_count: int = 1,
    **kwargs: Any,
) -> bool:
    """Create the Pinecone index if it does not already exist.

    The resolved, formatted index name is written back to
    ``self.connection_config.index_name`` whether or not an index is created.

    Args:
        vector_length: Dimension passed to ``create_index``.
        destination_name: Requested index name; falls back to the connection
            config's ``index_name`` when falsy.
        destination_type: ``"serverless"`` or ``"pod"``; selects the spec.
        serverless_cloud: Cloud for ``ServerlessSpec``.
        serverless_region: Region for ``ServerlessSpec``.
        pod_environment: Environment for ``PodSpec``.
        pod_type: Pod type for ``PodSpec``.
        pod_count: Number of pods for ``PodSpec``.
        **kwargs: Forwarded to ``create_index``.

    Returns:
        True if an index was created, False if it already existed.

    Raises:
        ValueError: If no index name can be resolved, or if the index must be
            created and ``destination_type`` is not a supported value.
    """
    from pinecone import PodSpec, ServerlessSpec

    # NOTE(review): because ``destination_name`` has a non-empty default, it
    # takes precedence over a configured ``connection_config.index_name``
    # unless the caller passes a falsy value — confirm this is intended.
    index_name = destination_name or self.connection_config.index_name
    if not index_name:
        raise ValueError("No index name specified")
    index_name = self.format_destination_name(index_name)
    self.connection_config.index_name = index_name

    if self.index_exists(index_name):
        logger.debug(f"index {index_name} already exists, skipping creation")
        return False

    logger.info(f"creating pinecone index {index_name}")
    pc = self.connection_config.get_client()

    if destination_type == "serverless":
        spec = ServerlessSpec(cloud=serverless_cloud, region=serverless_region)
    elif destination_type == "pod":
        spec = PodSpec(environment=pod_environment, pod_type=pod_type, pods=pod_count)
    else:
        raise ValueError(f"unexpected destination type: {destination_type}")

    # Create under the formatted name: it is the name that was checked for
    # existence above and stored on the connection config for later use.
    pc.create_index(name=index_name, dimension=vector_length, spec=spec, **kwargs)
    return True
258+
175259 def pod_delete_by_record_id (self , file_data : FileData ) -> None :
176260 logger .debug (
177261 f"deleting any content with metadata "
@@ -266,6 +350,10 @@ def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None
266350 )
267351 # Determine if serverless or pod based index
268352 pinecone_client = self .connection_config .get_client ()
353+
354+ if not self .connection_config .index_name :
355+ raise ValueError ("No index name specified" )
356+
269357 index_description = pinecone_client .describe_index (name = self .connection_config .index_name )
270358 if "serverless" in index_description .get ("spec" ):
271359 self .serverless_delete_by_record_id (file_data = file_data )
0 commit comments