Skip to content

Commit 5108199

Browse files
authored
Merge branch 'master' into python-double-underscore
2 parents 852d2b6 + d9a1323 commit 5108199

File tree

14 files changed

+483
-3
lines changed

14 files changed

+483
-3
lines changed
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# Embeddings and Vector Databases With ChromaDB
2+
3+
Supporting code for the Real Python tutorial [Embeddings and Vector Databases With ChromaDB](https://realpython.com/chromadb-vector-database/).
4+
5+
To run the code in this tutorial, you should have `numpy`, `spacy`, `sentence-transformers`, `chromadb`, `polars`, `more-itertools`, and `openai` installed in your environment.
6+
7+
You can install the dependencies manually, or by running:
8+
9+
```
10+
(venv) $ python -m pip install -r requirements.txt
11+
```
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
import pathlib
2+
3+
import polars as pl
4+
5+
6+
def prepare_car_reviews_data(
    data_path: "str | pathlib.Path",
    vehicle_years: "list[int] | None" = None,
):
    """Prepare the car reviews dataset for ChromaDB.

    Lazily scans the CSV file(s) at ``data_path``, derives ``Vehicle_Year``
    and ``Vehicle_Model`` from the first two space-separated tokens of
    ``Vehicle_Title``, keeps only the reviews whose year is listed in
    ``vehicle_years``, and sorts the result by model and rating.

    Args:
        data_path: Path or glob pattern handed to ``pl.scan_csv``.
        vehicle_years: Vehicle years to keep. ``None`` (the default)
            selects ``[2017]``.

    Returns:
        A dict with three equally long lists:
        ``"ids"`` (``"review0"``, ``"review1"``, ...),
        ``"documents"`` (the ``Review`` text), and
        ``"metadatas"`` (one dict per review holding every selected
        column except ``Review``).
    """
    # NOTE: the annotations above are quoted so the ``X | Y`` syntax is not
    # evaluated at definition time on interpreters older than Python 3.10.

    # Build the default per call rather than sharing one mutable list
    # object between all calls of the function.
    if vehicle_years is None:
        vehicle_years = [2017]

    # Define the schema to ensure proper data types are enforced.
    # The "" entry is a column with an empty header (presumably the
    # unnamed index column of the CSV export - confirm against the data).
    dtypes = {
        "": pl.Int64,
        "Review_Date": pl.Utf8,
        "Author_Name": pl.Utf8,
        "Vehicle_Title": pl.Utf8,
        "Review_Title": pl.Utf8,
        "Review": pl.Utf8,
        "Rating": pl.Float64,
    }

    # Scan the car reviews dataset(s) lazily; nothing is read until collect()
    car_reviews = pl.scan_csv(data_path, dtypes=dtypes)

    # The title is split once and reused for both derived columns:
    # token 0 is cast to an integer year, token 1 is used as the model.
    title_tokens = pl.col("Vehicle_Title").str.split(by=" ")

    car_review_db_data = (
        car_reviews.with_columns(
            [
                title_tokens.list.get(0).cast(pl.Int64).alias("Vehicle_Year"),
                title_tokens.list.get(1).alias("Vehicle_Model"),
            ]
        )
        # Filter on selected years
        .filter(pl.col("Vehicle_Year").is_in(vehicle_years))
        .select(
            [
                "Review_Title",
                "Review",
                "Rating",
                "Vehicle_Year",
                "Vehicle_Model",
            ]
        )
        .sort(["Vehicle_Model", "Rating"])
        .collect()
    )

    # Create ids, documents, and metadatas data in the format chromadb expects
    ids = [f"review{i}" for i in range(car_review_db_data.height)]
    documents = car_review_db_data["Review"].to_list()
    metadatas = car_review_db_data.drop("Review").to_dicts()

    return {"ids": ids, "documents": documents, "metadatas": metadatas}
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
import pathlib
2+
3+
import chromadb
4+
from chromadb.utils import embedding_functions
5+
from more_itertools import batched
6+
7+
8+
def build_chroma_collection(
    chroma_path: pathlib.Path,
    collection_name: str,
    embedding_func_name: str,
    ids: list[str],
    documents: list[str],
    metadatas: list[dict],
    distance_func_name: str = "cosine",
    batch_size: int = 166,
):
    """Create a ChromaDB collection and fill it with documents.

    Opens a persistent Chroma client at ``chroma_path``, creates a new
    collection that embeds text with the named sentence-transformers model,
    and adds ``ids``, ``documents`` and ``metadatas`` to it in consecutive
    batches.

    Args:
        chroma_path: Location handed to ``chromadb.PersistentClient``.
        collection_name: Name of the collection to create.
        embedding_func_name: Model name passed to
            ``SentenceTransformerEmbeddingFunction``.
        ids: One identifier per document.
        documents: The texts to embed and store.
        metadatas: One metadata dict per document.
        distance_func_name: Value stored under the ``"hnsw:space"``
            collection metadata key.
        batch_size: Maximum number of documents sent per ``add`` call.
            The default of 166 is the value the original code hard-coded
            (presumably chosen to respect a Chroma per-call limit -
            confirm before raising it).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    chroma_client = chromadb.PersistentClient(chroma_path)

    embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name=embedding_func_name
    )

    collection = chroma_client.create_collection(
        name=collection_name,
        embedding_function=embedding_func,
        metadata={"hnsw:space": distance_func_name},
    )

    # Walk the data in strides of ``batch_size``. Slice ends are exclusive,
    # so the end bound must be one past the last index of the batch; the
    # previous code used the last index itself and therefore silently
    # dropped the final document of every batch.
    for start_idx in range(0, len(documents), batch_size):
        end_idx = start_idx + batch_size

        collection.add(
            ids=ids[start_idx:end_idx],
            documents=documents[start_idx:end_idx],
            metadatas=metadatas[start_idx:end_idx],
        )
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
{
2+
"openai-secret-key": "your-api-key"
3+
}
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
import numpy as np
2+
3+
4+
def compute_cosine_similarity(u: np.ndarray, v: np.ndarray) -> float:
    """Return the cosine similarity of ``u`` and ``v``.

    The result is the product ``u @ v`` divided by the product of the
    two norms as computed by ``np.linalg.norm``.
    """
    dot_product = u @ v
    norm_product = np.linalg.norm(u) * np.linalg.norm(v)
    return dot_product / norm_product
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
import chromadb
2+
from chromadb.utils import embedding_functions
3+
4+
from car_data_etl import prepare_car_reviews_data
5+
from chroma_utils import build_chroma_collection
6+
7+
# Locations and names shared by the ETL, indexing, and query steps below
DATA_PATH = "data/archive/*"
CHROMA_PATH = "car_review_embeddings"
EMBEDDING_FUNC_NAME = "multi-qa-MiniLM-L6-cos-v1"
COLLECTION_NAME = "car_reviews"

# Step 1: load the reviews into the ids/documents/metadatas layout
chroma_car_reviews_dict = prepare_car_reviews_data(DATA_PATH)

# Step 2: create the persistent collection and add every review to it
build_chroma_collection(
    chroma_path=CHROMA_PATH,
    collection_name=COLLECTION_NAME,
    embedding_func_name=EMBEDDING_FUNC_NAME,
    ids=chroma_car_reviews_dict["ids"],
    documents=chroma_car_reviews_dict["documents"],
    metadatas=chroma_car_reviews_dict["metadatas"],
)

# Step 3: reopen the stored collection with the same embedding model
client = chromadb.PersistentClient(CHROMA_PATH)
embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=EMBEDDING_FUNC_NAME
)
collection = client.get_collection(
    name=COLLECTION_NAME,
    embedding_function=embedding_func,
)

# Step 4: ask for the five closest reviews to a natural-language query
great_reviews = collection.query(
    query_texts=[
        "Find me some positive reviews that discuss the car's performance"
    ],
    n_results=5,
    include=["documents", "distances", "metadatas"],
)

# Results are grouped per query text; show the first hit of the only query
first_query_documents = great_reviews["documents"][0]
print(first_query_documents[0])
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
import numpy as np
2+
3+
# Build two small vectors with NumPy and display them
vector1 = np.array([1, 0])
vector2 = np.array([0, 1])
for vector in (vector1, vector2):
    print(vector)

# Vectors used by the examples below
v1 = np.array([1, 0])
v2 = np.array([0, 1])
v3 = np.array([np.sqrt(2), np.sqrt(2)])

# Dimension: the shape tuple of the array
print(v1.shape)

# Magnitude: once by hand, then twice with np.linalg.norm
squared_sum = (v1**2).sum()
print(np.sqrt(squared_sum))
print(np.linalg.norm(v1))
print(np.linalg.norm(v3))

# Dot product: element-wise product summed up, then the @ operator
elementwise_product = v1 * v2
print(elementwise_product.sum())
print(v1 @ v3)
Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
import json
2+
import os
3+
4+
import chromadb
5+
import openai
6+
from chromadb.utils import embedding_functions
7+
8+
# Set before any embedding work happens (presumably to silence the
# Hugging Face tokenizers parallelism warning - confirm).
os.environ["TOKENIZERS_PARALLELISM"] = "false"

DATA_PATH = "data/archive/*"
CHROMA_PATH = "car_review_embeddings"
EMBEDDING_FUNC_NAME = "multi-qa-MiniLM-L6-cos-v1"
COLLECTION_NAME = "car_reviews"

# Read the OpenAI key from the local JSON config file
with open("config.json", "r", encoding="utf-8") as json_file:
    config_data = json.load(json_file)

openai.api_key = config_data.get("openai-secret-key")

# Open the existing collection with the embedding model it was built with
client = chromadb.PersistentClient(CHROMA_PATH)
embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=EMBEDDING_FUNC_NAME
)

collection = client.get_collection(
    name=COLLECTION_NAME, embedding_function=embedding_func
)

# --- Part 1: summarise what drives satisfaction in well-rated reviews ---

# The {} placeholder is filled with the retrieved review texts
context = """
You are a customer success employee at a large
car dealership. Use the following car reviews
to answer questions: {}
"""

question = """
What's the key to great customer satisfaction
based on detailed positive reviews?
"""

# Retrieve the ten reviews closest to the question with a rating of 3 or more
good_reviews = collection.query(
    query_texts=[question],
    n_results=10,
    include=["documents"],
    where={"Rating": {"$gte": 3}},
)

# Results are grouped per query text; index 0 is the only query sent
reviews_str = ",".join(good_reviews["documents"][0])

print("Good reviews: ")
print(reviews_str)
print("###########################################")

# A single request is enough: the earlier version issued this identical
# call twice and discarded the first response.
good_review_summaries = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": context.format(reviews_str)},
        {"role": "user", "content": question},
    ],
    temperature=0,
    n=1,
)

print("AI-Generated summary of good reviews: ")
print(good_review_summaries["choices"][0]["message"]["content"])
print("###########################################")


# --- Part 2: find the most damaging of the poorly rated reviews ---

context = """
You are a customer success employee at a large car dealership.
Use the following car reviews to answer questions: {}
"""
question = """
Which of these poor reviews has the worst implications about
our dealership? Explain why.
"""

# Retrieve the five reviews closest to the question with a rating of 3 or less
poor_reviews = collection.query(
    query_texts=[question],
    n_results=5,
    include=["documents"],
    where={"Rating": {"$lte": 3}},
)

reviews_str = ",".join(poor_reviews["documents"][0])

# Only the closest match is printed; the model still receives all five
print("Worst reviews: ")
print(poor_reviews["documents"][0][0])
print("###########################################")

poor_review_analysis = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": context.format(reviews_str)},
        {"role": "user", "content": question},
    ],
    temperature=0,
    n=1,
)

print("AI-Generated summary of the single worst review: ")
print(poor_review_analysis["choices"][0]["message"]["content"])
print("###########################################")
Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
aiohttp==3.8.6
2+
aiosignal==1.3.1
3+
annotated-types==0.6.0
4+
anyio==3.7.1
5+
async-timeout==4.0.3
6+
attrs==23.1.0
7+
backoff==2.2.1
8+
bcrypt==4.0.1
9+
blis==0.7.11
10+
catalogue==2.0.10
11+
certifi==2023.7.22
12+
charset-normalizer==3.3.0
13+
chroma-hnswlib==0.7.3
14+
chromadb==0.4.14
15+
click==8.1.7
16+
cloudpathlib==0.16.0
17+
coloredlogs==15.0.1
18+
confection==0.1.3
19+
cymem==2.0.8
20+
fastapi==0.104.0
21+
filelock==3.12.4
22+
flatbuffers==23.5.26
23+
frozenlist==1.4.0
24+
fsspec==2023.9.2
25+
grpcio==1.59.0
26+
h11==0.14.0
27+
httptools==0.6.1
28+
huggingface-hub==0.17.3
29+
humanfriendly==10.0
30+
idna==3.4
31+
importlib-resources==6.1.0
32+
Jinja2==3.1.2
33+
joblib==1.3.2
34+
langcodes==3.3.0
35+
MarkupSafe==2.1.3
36+
monotonic==1.6
37+
more-itertools==10.1.0
38+
mpmath==1.3.0
39+
multidict==6.0.4
40+
murmurhash==1.0.10
41+
networkx==3.2
42+
nltk==3.8.1
43+
numpy==1.26.1
44+
onnxruntime==1.16.1
45+
openai==0.28.1
46+
overrides==7.4.0
47+
packaging==23.2
48+
Pillow==10.1.0
49+
polars==0.19.9
50+
posthog==3.0.2
51+
preshed==3.0.9
52+
protobuf==4.24.4
53+
pulsar-client==3.3.0
54+
pydantic==2.4.2
55+
pydantic_core==2.10.1
56+
PyPika==0.48.9
57+
python-dateutil==2.8.2
58+
python-dotenv==1.0.0
59+
PyYAML==6.0.1
60+
regex==2023.10.3
61+
requests==2.31.0
62+
safetensors==0.4.0
63+
scikit-learn==1.3.1
64+
scipy==1.11.3
65+
sentence-transformers==2.2.2
66+
sentencepiece==0.1.99
67+
six==1.16.0
68+
smart-open==6.4.0
69+
sniffio==1.3.0
70+
spacy==3.7.2
71+
spacy-legacy==3.0.12
72+
spacy-loggers==1.0.5
73+
srsly==2.4.8
74+
starlette==0.27.0
75+
sympy==1.12
76+
thinc==8.2.1
77+
threadpoolctl==3.2.0
78+
tokenizers==0.14.1
79+
torch==2.1.0
80+
torchvision==0.16.0
81+
tqdm==4.66.1
82+
transformers==4.34.1
83+
typer==0.9.0
84+
typing_extensions==4.8.0
85+
urllib3==2.0.7
86+
uvicorn==0.23.2
87+
uvloop==0.18.0
88+
wasabi==1.1.2
89+
watchfiles==0.21.0
90+
weasel==0.3.3
91+
websockets==11.0.3
92+
yarl==1.9.2

0 commit comments

Comments
 (0)