Skip to content

Commit 463026a

Browse files
authored
[Release] Version 0.1.1 (google#134)
What's changed: - Update the version of ChromaDB to 0.4.13. - Re-generate the `poetry.lock` file. - Update Python scripts to migrate ChromaDB from 0.3.21 to 0.4.13 (See https://docs.trychroma.com/migration). - Update README to clarify when to delete an existing vector database. - Update the chatbot UI template to use "PaLM" to refer to the AI model. Sept 28, 2023
1 parent e437473 commit 463026a

File tree

8 files changed

+1790
-1975
lines changed

8 files changed

+1790
-1975
lines changed

demos/palm/python/docs-agent/README.md

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -487,15 +487,16 @@ To convert Markdown files to plain text files:
487487

488488
### 2. Populate a new vector database
489489

490-
**Important**: If the `vector_stores/chroma` directory already exists, delete
491-
(or move) the `chroma` directory before populating a new vector database. Also,
492-
if the Docs Agent chat app is already running using this `chroma` directory, shut down
493-
the app before deleting the directory.
494-
495490
Once you have plain text files processed and stored in the `output_path` directory,
496491
you can run the `populate_vector_database.py` script to populate a vector database
497492
with the contents of the plain text files and their embeddings (and metadata).
498493

494+
**Important**: For a clean setup, if the `vector_stores/chroma` directory already
495+
exists, delete (or move) the `chroma` directory before populating a new vector
496+
database. (Otherwise, new entries will be added to your existing vector database.)
497+
Also, if the Docs Agent chat app is already running using this `chroma` directory,
498+
shut down the app before deleting the directory.
499+
499500
To populate a new vector database:
500501

501502
1. Go to the Docs Agent project directory, for example:

demos/palm/python/docs-agent/chatbot/templates/chatui/result.html

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ <h4 id="rewrite-question-header">Question:</h4>
3838
<span id="rewrite-question-span">
3939
<p>{{ question | replace("+", " ") | replace("%3F", "?")}}</p>
4040
</span>
41-
<h4 id="rewrite-response-header">Bard's response:</h4>
41+
<h4 id="rewrite-response-header">PaLM's response:</h4>
4242
<span id="rewrite-original-response-span">
4343
{{ response_in_html | safe }}
4444
</span>

demos/palm/python/docs-agent/chroma.py

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -42,12 +42,7 @@ class Chroma:
4242
"""Chroma wrapper"""
4343

4444
def __init__(self, chroma_dir) -> None:
45-
self.client = chromadb.Client(
46-
Settings(
47-
chroma_db_impl="duckdb+parquet",
48-
persist_directory=chroma_dir,
49-
)
50-
)
45+
self.client = chromadb.PersistentClient(path=chroma_dir)
5146

5247
def list_collections(self):
5348
return self.client.list_collections()

demos/palm/python/docs-agent/poetry.lock

Lines changed: 1764 additions & 1952 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

demos/palm/python/docs-agent/pyproject.toml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "docs-agent"
3-
version = "0.1.0"
3+
version = "0.1.1"
44
description = ""
55
authors = ["Docs Agent contributors"]
66
readme = "README.md"
@@ -11,7 +11,7 @@ rich = "^13.3.5"
1111
Markdown = "^3.4.3"
1212
beautifulsoup4 = "^4.12.2"
1313
protobuf = ">=3.20"
14-
chromadb = "^0.3.21"
14+
chromadb = "==0.4.13"
1515
sentence-transformers = "^2.2.2"
1616
ratelimit = "^2.2.1"
1717
absl-py = "^1.4.0"
@@ -21,6 +21,7 @@ google-generativeai = "^0.1.0"
2121
grpcio = "^1.57.0"
2222
grpcio-tools = "^1.57.0"
2323
uuid = "^1.30"
24+
pytz = ">=2020.1"
2425

2526
[tool.poetry.group.dev.dependencies]
2627
ipython = "^8.13.2"

demos/palm/python/docs-agent/scripts/markdown_to_plain_text.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,9 @@ def process_page_and_section_titles(markdown_text):
153153
page_title = data["title"]
154154
markdown_text = data.content
155155
metadata = data.metadata
156+
if "URL" in data:
157+
final_url = data["URL"]
158+
metadata["URL"] = final_url
156159
for line in markdown_text.split("\n"):
157160
new_line = ""
158161
skip_this_line = False
@@ -173,7 +176,7 @@ def process_page_and_section_titles(markdown_text):
173176
# Detect Markdown heading levels
174177
if heading == "#":
175178
page_title = captured_title.strip()
176-
metadata = {"title": page_title}
179+
metadata["title"] = page_title
177180
subsection_title = ""
178181
section_title = ""
179182
elif heading == "##":

demos/palm/python/docs-agent/scripts/populate_vector_database.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -108,10 +108,7 @@
108108
MODEL = os.path.join(BASE_DIR, "models/all-mpnet-base-v2")
109109
emb_fn = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=MODEL)
110110

111-
chroma_client = chromadb.Client(
112-
Settings(chroma_db_impl="duckdb+parquet", persist_directory=LOCAL_VECTOR_DB_DIR)
113-
)
114-
111+
chroma_client = chromadb.PersistentClient(path=LOCAL_VECTOR_DB_DIR)
115112

116113
# Create embed function for PaLM
117114
# API call limit to 5 qps
@@ -175,6 +172,8 @@ def embed_function(texts: Documents) -> Embeddings:
175172
# Using the full path avoids mismatches
176173
full_file_name = FULL_BASE_DIR + clean_filename + file
177174
metadata_dict_extra = {}
175+
# Flag to see if there is a predefined URL from frontmatter
176+
final_url = False
178177
# Reads the metadata associated with files
179178
for key in index:
180179
if full_file_name in index[key]:
@@ -197,6 +196,10 @@ def embed_function(texts: Documents) -> Embeddings:
197196
index[key][full_file_name]["metadata"], delimiter="_"
198197
)
199198
metadata_dict_extra = dict(metadata_dict_extra)
199+
# Extracts user specified URL
200+
if "URL" in metadata_dict_extra:
201+
final_url = True
202+
final_url_value = metadata_dict_extra["URL"]
200203
else:
201204
metadata_dict_extra = {}
202205
if "UUID" in index[key][full_file_name]:
@@ -216,6 +219,9 @@ def embed_function(texts: Documents) -> Embeddings:
216219
# Remove .md at the end of URLs by default.
217220
match3 = re.search(r"(.*)\.md$", url)
218221
url = match3[1]
222+
# Replaces the URL if it comes from frontmatter
223+
if (final_url):
224+
url = final_url_value
219225
# Creates a dictionary with basic metadata values
220226
# (i.e. source, URL, and md_hash)
221227
metadata_dict_main = {
@@ -287,7 +293,6 @@ def embed_function(texts: Documents) -> Embeddings:
287293
print("[Warning] Empty file!")
288294
print("")
289295
auto.close()
290-
chroma_client.persist()
291296
# results = collection.query(
292297
# query_texts=["What are some differences between apples and oranges?"],
293298
# n_results=3,

demos/palm/python/docs-agent/scripts/test_vector_database.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -83,9 +83,7 @@ def embed_palm(texts: Documents) -> Embeddings:
8383
ai_console = Console(width=160)
8484
ai_console.rule("Fold")
8585

86-
chroma_client = chromadb.Client(
87-
Settings(chroma_db_impl="duckdb+parquet", persist_directory=LOCAL_VECTOR_DB_DIR)
88-
)
86+
chroma_client = chromadb.PersistentClient(path=LOCAL_VECTOR_DB_DIR)
8987

9088
if EMBEDDINGS_TYPE == "PALM":
9189
PALM_EMBEDDING_MODEL = "models/embedding-gecko-001"

0 commit comments

Comments
 (0)