Skip to content

Commit db5913b

Browse files
committed
add pinecone
1 parent 1663aa4 commit db5913b

File tree

3 files changed

+207
-57
lines changed

3 files changed

+207
-57
lines changed

llm-complete-guide/constants.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,5 +82,7 @@
8282
] # Important: large to small
8383
USE_ARGILLA_ANNOTATIONS = False
8484

85+
# Vector store secrets
8586
SECRET_NAME = os.getenv("ZENML_PROJECT_SECRET_NAME", "llm-complete")
8687
SECRET_NAME_ELASTICSEARCH = "elasticsearch-zenml"
88+
SECRET_NAME_PINECONE = "pinecone-zenml"

llm-complete-guide/steps/populate_index.py

Lines changed: 89 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242
EMBEDDINGS_MODEL,
4343
SECRET_NAME,
4444
SECRET_NAME_ELASTICSEARCH,
45+
SECRET_NAME_PINECONE,
4546
)
4647
from pgvector.psycopg2 import register_vector
4748
from PIL import Image, ImageDraw, ImageFont
@@ -51,7 +52,8 @@
5152
from zenml import ArtifactConfig, log_metadata, step
5253
from zenml.client import Client
5354
from zenml.metadata.metadata_types import Uri
54-
55+
import pinecone
56+
from pinecone import Pinecone, ServerlessSpec
5557
logging.basicConfig(level=logging.INFO)
5658
logger = logging.getLogger(__name__)
5759

@@ -626,6 +628,7 @@ def generate_embeddings(
626628
class IndexType(Enum):
    """Vector store backends that the index generation step can target.

    Each member's value is the lowercase backend name.
    """

    ELASTICSEARCH = "elasticsearch"
    POSTGRES = "postgres"
    PINECONE = "pinecone"
629632

630633

631634
@step(enable_cache=False)
@@ -635,27 +638,20 @@ def index_generator(
635638
) -> None:
636639
"""Generates an index for the given documents.
637640
638-
This function creates a database connection, installs the pgvector extension if not already installed,
639-
creates an embeddings table if it doesn't exist, and inserts the embeddings and document metadata into the table.
640-
It then calculates the index parameters according to best practices and creates an index on the embeddings
641-
using the cosine distance measure.
642-
643641
Args:
644-
documents (str): A JSON string containing the Document objects with generated embeddings.
645-
index_type (IndexType): The type of index to use. Defaults to Elasticsearch.
646-
647-
Raises:
648-
Exception: If an error occurs during the index generation.
642+
documents (str): JSON string containing the documents to index.
643+
index_type (IndexType, optional): Type of index to generate. Defaults to IndexType.POSTGRES.
649644
"""
650-
try:
651-
if index_type == IndexType.ELASTICSEARCH:
652-
_index_generator_elastic(documents)
653-
else:
654-
_index_generator_postgres(documents)
645+
if index_type == IndexType.ELASTICSEARCH:
646+
_index_generator_elastic(documents)
647+
elif index_type == IndexType.POSTGRES:
648+
_index_generator_postgres(documents)
649+
elif index_type == IndexType.PINECONE:
650+
_index_generator_pinecone(documents)
651+
else:
652+
raise ValueError(f"Unknown index type: {index_type}")
655653

656-
except Exception as e:
657-
logger.error(f"Error in index_generator: {e}")
658-
raise
654+
_log_metadata(index_type)
659655

660656

661657
def _index_generator_elastic(documents: str) -> None:
@@ -826,6 +822,73 @@ def _index_generator_postgres(documents: str) -> None:
826822
conn.close()
827823

828824

825+
def _index_generator_pinecone(documents: str) -> None:
    """Generates a Pinecone index for the given documents.

    Reads the Pinecone API key (and optionally the index name) from the
    ZenML secret named by ``SECRET_NAME_PINECONE``, creates the serverless
    index if it does not exist yet, and upserts one vector per document in
    batches.

    Args:
        documents (str): JSON string containing the documents to index. Each
            document must provide the keys ``filename``, ``parent_section``,
            ``url``, ``page_content``, ``token_count`` and ``embedding``.

    Raises:
        KeyError: If the secret has no ``pinecone_api_key`` entry or a
            document is missing one of the required keys.
    """
    # Fetch the secret once and read both values from the same response
    # instead of issuing a separate get_secret() call per value.
    secret_values = Client().get_secret(SECRET_NAME_PINECONE).secret_values
    pinecone_api_key = secret_values["pinecone_api_key"]
    index_name = secret_values.get("pinecone_index", "zenml-docs")

    # Initialize Pinecone
    pc = Pinecone(api_key=pinecone_api_key)

    # Create index if it doesn't exist
    if index_name not in pc.list_indexes().names():
        pc.create_index(
            name=index_name,
            dimension=EMBEDDING_DIMENSIONALITY,
            metric="cosine",
            spec=ServerlessSpec(cloud="aws", region="us-east-1"),
        )

    index = pc.Index(index_name)
    docs = json.loads(documents)

    # Upsert in fixed-size slices; records are built per batch so only one
    # batch of vector payloads is held in memory at a time.
    batch_size = 100
    for start in range(0, len(docs), batch_size):
        batch = [
            _pinecone_vector_record(doc)
            for doc in docs[start : start + batch_size]
        ]
        index.upsert(vectors=batch)

    logger.info(
        "Successfully indexed %d documents to Pinecone index '%s'",
        len(docs),
        index_name,
    )


def _pinecone_vector_record(doc: dict) -> dict:
    """Builds the Pinecone upsert record for a single document dict."""
    # The ID is a content hash, so re-indexing an unchanged document
    # overwrites its existing vector instead of adding a duplicate.
    doc_id = hashlib.sha256(
        f"{doc['filename']}:{doc['parent_section']}:{doc['page_content']}".encode()
    ).hexdigest()

    return {
        "id": doc_id,
        "values": doc["embedding"],
        "metadata": {
            "filename": doc["filename"],
            "parent_section": doc["parent_section"],
            "url": doc["url"],
            "page_content": doc["page_content"],
            "token_count": doc["token_count"],
        },
    }
890+
891+
829892
def _log_metadata(index_type: IndexType) -> None:
830893
"""Log metadata about the indexing process."""
831894
prompt = """
@@ -848,9 +911,8 @@ def _log_metadata(index_type: IndexType) -> None:
848911
"api_key": "*********",
849912
}
850913
store_name = "elasticsearch"
851-
else:
914+
elif index_type == IndexType.POSTGRES:
852915
store_name = "pgvector"
853-
854916
connection_details = {
855917
"user": client.get_secret(SECRET_NAME).secret_values[
856918
"supabase_user"
@@ -864,6 +926,12 @@ def _log_metadata(index_type: IndexType) -> None:
864926
],
865927
"dbname": "postgres",
866928
}
929+
elif index_type == IndexType.PINECONE:
930+
store_name = "pinecone"
931+
connection_details = {
932+
"api_key": "**********",
933+
"environment": client.get_secret(SECRET_NAME_PINECONE).secret_values["pinecone_env"],
934+
}
867935

868936
log_metadata(
869937
metadata={

llm-complete-guide/utils/llm_utils.py

Lines changed: 116 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,8 @@
2626
from zenml.client import Client
2727

2828
from utils.openai_utils import get_openai_api_key
29-
29+
import pinecone
30+
from pinecone import Pinecone
3031
# Configure logging levels for specific modules
3132
logging.getLogger("pytorch").setLevel(logging.CRITICAL)
3233
logging.getLogger("sentence-transformers").setLevel(logging.CRITICAL)
@@ -37,7 +38,7 @@
3738
logging.getLogger().setLevel(logging.ERROR)
3839

3940
import re
40-
from typing import List, Tuple
41+
from typing import List, Tuple, Optional
4142

4243
import litellm
4344
import numpy as np
@@ -49,6 +50,7 @@
4950
OPENAI_MODEL,
5051
SECRET_NAME,
5152
SECRET_NAME_ELASTICSEARCH,
53+
SECRET_NAME_PINECONE,
5254
ZENML_CHATBOT_MODEL_NAME,
5355
ZENML_CHATBOT_MODEL_VERSION,
5456
)
@@ -277,6 +279,20 @@ def get_db_conn() -> connection:
277279
raise
278280

279281

282+
def get_pinecone_client() -> pinecone.Index:
    """Get a Pinecone index client.

    The API key is read from the ``pinecone_api_key`` entry of the ZenML
    secret named by ``SECRET_NAME_PINECONE``. The index name comes from the
    optional ``pinecone_index`` entry and falls back to ``"zenml-docs"``.

    Returns:
        pinecone.Index: A Pinecone index client.

    Raises:
        KeyError: If the secret has no ``pinecone_api_key`` entry.
    """
    # Fetch the secret once and read both values from the same response
    # instead of issuing a separate get_secret() call per value.
    secret_values = Client().get_secret(SECRET_NAME_PINECONE).secret_values
    pinecone_api_key = secret_values["pinecone_api_key"]
    index_name = secret_values.get("pinecone_index", "zenml-docs")

    pc = Pinecone(api_key=pinecone_api_key)
    return pc.Index(index_name)
294+
295+
280296
def get_topn_similar_docs_pgvector(
281297
query_embedding: List[float],
282298
conn: psycopg2.extensions.connection,
@@ -384,39 +400,89 @@ def get_topn_similar_docs_elasticsearch(
384400
return results
385401

386402

387-
def get_topn_similar_docs(
403+
def get_topn_similar_docs_pinecone(
    query_embedding: List[float],
    pinecone_index: pinecone.Index,
    n: int = 5,
    include_metadata: bool = False,
    only_urls: bool = False,
) -> List[Tuple]:
    """Get the top N most similar documents from Pinecone.

    Args:
        query_embedding (List[float]): The query embedding vector.
        pinecone_index (pinecone.Index): The Pinecone index client.
        n (int, optional): Number of similar documents to return. Defaults to 5.
        include_metadata (bool, optional): When True, each content string is
            prefixed with the document's filename and parent section.
            Defaults to False.
        only_urls (bool, optional): When True, return the document URL in
            place of its content. Defaults to False.

    Returns:
        List[Tuple]: One ``(content_or_url, score)`` tuple per match, in the
        order Pinecone returned them.
    """
    # Metadata is always requested: the page content and URL are read from
    # each match's metadata, regardless of the include_metadata flag.
    response = pinecone_index.query(
        vector=query_embedding,
        top_k=n,
        include_metadata=True,
    )

    def _as_result(hit) -> Tuple:
        """Converts one Pinecone match into a (text, score) tuple."""
        meta = hit.metadata
        if only_urls:
            return (meta["url"], hit.score)

        text = meta["page_content"]
        if include_metadata:
            text = f"{meta['filename']} - {meta['parent_section']}: {text}"
        return (text, hit.score)

    return [_as_result(hit) for hit in response.matches]
444+
445+
446+
def get_topn_similar_docs(
    query_embedding: List[float],
    conn: Optional[psycopg2.extensions.connection] = None,
    es_client: Optional[Elasticsearch] = None,
    pinecone_index: Optional[pinecone.Index] = None,
    n: int = 5,
    include_metadata: bool = False,
    only_urls: bool = False,
) -> List[Tuple]:
    """Get the top N most similar documents from the vector store.

    Dispatches to the backend-specific lookup for the first client that was
    supplied, checked in this order: Elasticsearch, PostgreSQL (pgvector),
    Pinecone.

    Args:
        query_embedding (List[float]): The query embedding vector.
        conn (Optional[psycopg2.extensions.connection], optional): PostgreSQL connection. Defaults to None.
        es_client (Optional[Elasticsearch], optional): Elasticsearch client. Defaults to None.
        pinecone_index (Optional[pinecone.Index], optional): Pinecone index client. Defaults to None.
        n (int, optional): Number of similar documents to return. Defaults to 5.
        include_metadata (bool, optional): Whether to include metadata in results. Defaults to False.
        only_urls (bool, optional): Whether to return only URLs. Defaults to False.

    Returns:
        List[Tuple]: The result list produced by the selected backend lookup.

    Raises:
        ValueError: If no valid vector store client is provided.
    """
    # Options forwarded unchanged (positionally) to every backend lookup.
    lookup_options = (n, include_metadata, only_urls)

    if es_client is not None:
        return get_topn_similar_docs_elasticsearch(
            query_embedding, es_client, *lookup_options
        )

    if conn is not None:
        return get_topn_similar_docs_pgvector(
            query_embedding, conn, *lookup_options
        )

    if pinecone_index is not None:
        return get_topn_similar_docs_pinecone(
            query_embedding, pinecone_index, *lookup_options
        )

    raise ValueError("No valid vector store client provided")
420486

421487

422488
def get_completion_from_messages(
@@ -525,32 +591,46 @@ def process_input_with_retrieval(
525591
str: The processed output.
526592
"""
527593
delimiter = "```"
528-
es_client = None
529-
conn = None
594+
# Get embeddings for the query
595+
query_embedding = get_embeddings(input)
530596

531-
vector_store_name = find_vectorstore_name()
532-
if vector_store_name == "pgvector":
533-
conn = get_db_conn()
534-
else:
597+
# Get similar documents based on the vector store being used
598+
vector_store = find_vectorstore_name()
599+
if vector_store == "elasticsearch":
535600
es_client = get_es_client()
601+
similar_docs = get_topn_similar_docs(
602+
query_embedding=query_embedding,
603+
es_client=es_client,
604+
n=n_items_retrieved,
605+
include_metadata=True,
606+
)
607+
elif vector_store == "pinecone":
608+
pinecone_index = get_pinecone_client()
609+
similar_docs = get_topn_similar_docs(
610+
query_embedding=query_embedding,
611+
pinecone_index=pinecone_index,
612+
n=n_items_retrieved,
613+
include_metadata=True,
614+
)
615+
else: # pgvector
616+
conn = get_db_conn()
617+
similar_docs = get_topn_similar_docs(
618+
query_embedding=query_embedding,
619+
conn=conn,
620+
n=n_items_retrieved,
621+
include_metadata=True,
622+
)
623+
conn.close()
536624

537-
# Step 1: Get documents related to the user input from database
538-
related_docs = get_topn_similar_docs(
539-
get_embeddings(input),
540-
conn=conn,
541-
es_client=es_client,
542-
n=n_items_retrieved,
543-
include_metadata=use_reranking,
544-
)
545-
625+
# Rerank documents if enabled
546626
if use_reranking:
547627
# Rerank the documents based on the input
548628
# and take the top 5 only
549629
context_content = [
550-
doc[0] for doc in rerank_documents(input, related_docs)[:5]
630+
doc[0] for doc in rerank_documents(input, similar_docs)[:5]
551631
]
552632
else:
553-
context_content = [doc[0] for doc in related_docs[:5]]
633+
context_content = [doc[0] for doc in similar_docs[:5]]
554634

555635
# Step 2: Get completion from OpenAI API
556636
# Set system message to help set appropriate tone and context for model

0 commit comments

Comments
 (0)