Skip to content

Commit 7ac600b

Browse files
authored
feat: OpenAI vector search example (#9)
* feat (open-ai vector search): add example * feat (open-ai vector search): add some comments * feat (open-ai vector search): rename file
1 parent 11d7bf4 commit 7ac600b

File tree

4 files changed

+371
-3
lines changed

4 files changed

+371
-3
lines changed

.gitignore

Lines changed: 174 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
# Created by https://www.toptal.com/developers/gitignore/api/go
2-
# Edit at https://www.toptal.com/developers/gitignore?templates=go
1+
# Created by https://www.toptal.com/developers/gitignore/api/go,python
2+
# Edit at https://www.toptal.com/developers/gitignore?templates=go,python
33

44
### Go ###
55
# If you prefer the allow list template instead of the deny list, see community template:
@@ -24,4 +24,175 @@ vendor/
2424
# Go workspace file
2525
go.work
2626

27-
# End of https://www.toptal.com/developers/gitignore/api/go
27+
### Python ###
28+
# Byte-compiled / optimized / DLL files
29+
__pycache__/
30+
*.py[cod]
31+
*$py.class
32+
33+
# C extensions
34+
35+
# Distribution / packaging
36+
.Python
37+
build/
38+
develop-eggs/
39+
dist/
40+
downloads/
41+
eggs/
42+
.eggs/
43+
lib/
44+
lib64/
45+
parts/
46+
sdist/
47+
var/
48+
wheels/
49+
share/python-wheels/
50+
*.egg-info/
51+
.installed.cfg
52+
*.egg
53+
MANIFEST
54+
55+
# PyInstaller
56+
# Usually these files are written by a python script from a template
57+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
58+
*.manifest
59+
*.spec
60+
61+
# Installer logs
62+
pip-log.txt
63+
pip-delete-this-directory.txt
64+
65+
# Unit test / coverage reports
66+
htmlcov/
67+
.tox/
68+
.nox/
69+
.coverage
70+
.coverage.*
71+
.cache
72+
nosetests.xml
73+
coverage.xml
74+
*.cover
75+
*.py,cover
76+
.hypothesis/
77+
.pytest_cache/
78+
cover/
79+
80+
# Translations
81+
*.mo
82+
*.pot
83+
84+
# Django stuff:
85+
*.log
86+
local_settings.py
87+
db.sqlite3
88+
db.sqlite3-journal
89+
90+
# Flask stuff:
91+
instance/
92+
.webassets-cache
93+
94+
# Scrapy stuff:
95+
.scrapy
96+
97+
# Sphinx documentation
98+
docs/_build/
99+
100+
# PyBuilder
101+
.pybuilder/
102+
target/
103+
104+
# Jupyter Notebook
105+
.ipynb_checkpoints
106+
107+
# IPython
108+
profile_default/
109+
ipython_config.py
110+
111+
# pyenv
112+
# For a library or package, you might want to ignore these files since the code is
113+
# intended to run in multiple environments; otherwise, check them in:
114+
# .python-version
115+
116+
# pipenv
117+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
118+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
119+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
120+
# install all needed dependencies.
121+
#Pipfile.lock
122+
123+
# poetry
124+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
125+
# This is especially recommended for binary packages to ensure reproducibility, and is more
126+
# commonly ignored for libraries.
127+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
128+
#poetry.lock
129+
130+
# pdm
131+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
132+
#pdm.lock
133+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
134+
# in version control.
135+
# https://pdm.fming.dev/#use-with-ide
136+
.pdm.toml
137+
138+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
139+
__pypackages__/
140+
141+
# Celery stuff
142+
celerybeat-schedule
143+
celerybeat.pid
144+
145+
# SageMath parsed files
146+
*.sage.py
147+
148+
# Environments
149+
.env
150+
.venv
151+
env/
152+
venv/
153+
ENV/
154+
env.bak/
155+
venv.bak/
156+
157+
# Spyder project settings
158+
.spyderproject
159+
.spyproject
160+
161+
# Rope project settings
162+
.ropeproject
163+
164+
# mkdocs documentation
165+
/site
166+
167+
# mypy
168+
.mypy_cache/
169+
.dmypy.json
170+
dmypy.json
171+
172+
# Pyre type checker
173+
.pyre/
174+
175+
# pytype static type analyzer
176+
.pytype/
177+
178+
# Cython debug symbols
179+
cython_debug/
180+
181+
# PyCharm
182+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
183+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
184+
# and can be added to the global gitignore or merged into this file. For a more nuclear
185+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
186+
#.idea/
187+
188+
### Python Patch ###
189+
# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
190+
poetry.toml
191+
192+
# ruff
193+
.ruff_cache/
194+
195+
# LSP config files
196+
pyrightconfig.json
197+
198+
# End of https://www.toptal.com/developers/gitignore/api/go,python

search-blogs-by-vector/README.md

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
# Example: Search Blogs by Dragonfly Vector Search
2+
3+
In this example, we will build a search engine for Dragonfly blogs using Dragonfly Vector Search with the OpenAI API.
4+
5+
## Local Setup
6+
7+
- The following steps assume that you are working in the root directory of this example (dragonfly-examples/search-blogs-by-vector).
8+
- Start with a clean Python environment:
9+
10+
```shell
11+
python3 -m venv venv
12+
```
13+
14+
- After the virtual environment is created, activate it:
15+
16+
```shell
17+
# Windows
18+
venv\Scripts\activate
19+
```
20+
21+
```shell
22+
# Linux / macOS
23+
source venv/bin/activate
24+
```
25+
26+
- Install Jupyter Notebook and other dependencies:
27+
28+
```shell
29+
(venv)$> pip install notebook==7.0.6
30+
(venv)$> pip install openai==1.3.7
31+
(venv)$> pip install pandas==2.1.3
32+
(venv)$> pip install numpy==1.26.2
33+
(venv)$> pip install redis==5.0.1
34+
```
35+
36+
## Run Dragonfly & Jupyter Notebook
37+
38+
- Run a Dragonfly instance (v1.13 or above) locally using Docker:
39+
40+
```bash
41+
docker run -p 6379:6379 --ulimit memlock=-1 ghcr.io/dragonflydb/dragonfly:v1.13.0-ubuntu
42+
```
43+
44+
- Run Jupyter Notebook server locally:
45+
46+
```shell
47+
(venv)$> jupyter notebook
48+
```
49+
50+
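- Open `search-blogs-by-vector.ipynb` in the Jupyter Notebook web UI at `http://localhost:8888/` and run the cells in order from top to bottom, since later cells depend on variables defined in earlier ones.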

search-blogs-by-vector/blog-with-embeddings.csv

Lines changed: 32 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": null,
6+
"id": "4f51b900-ac24-4cf0-ae2b-6fc6f3ce5b8f",
7+
"metadata": {},
8+
"outputs": [],
9+
"source": [
10+
"import pandas as pd\n",
11+
"\n",
12+
"posts = pd.read_csv('blog-with-embeddings.csv', delimiter=',', quotechar='\"', converters={'embedding': pd.eval})\n",
13+
"posts.head()"
14+
]
15+
},
16+
{
17+
"cell_type": "code",
18+
"execution_count": null,
19+
"id": "5fa8c493-ee34-4c18-baf1-a2f0a975a25a",
20+
"metadata": {},
21+
"outputs": [],
22+
"source": [
23+
"import redis\n",
24+
"from redis.commands.search.field import (TextField, VectorField)\n",
25+
"from redis.commands.search.indexDefinition import (IndexDefinition, IndexType)\n",
26+
"\n",
27+
"# Create a Redis client communicating with the local Dragonfly instance.\n",
28+
"client = redis.Redis()\n",
29+
"\n",
30+
"# Create an index 'posts', using the TEXT type for 'title', and the VECTOR type for 'embedding'.\n",
31+
"client.ft(\"posts\").create_index(\n",
32+
" fields = [TextField(\"title\"), VectorField(\"embedding\", \"FLAT\", {\"DIM\": \"1536\"})],\n",
33+
" definition = IndexDefinition(prefix=[\"post:\"], index_type=IndexType.HASH)\n",
34+
")"
35+
]
36+
},
37+
{
38+
"cell_type": "code",
39+
"execution_count": null,
40+
"id": "b526ef68-a4eb-4e02-92f3-d494320ed660",
41+
"metadata": {},
42+
"outputs": [],
43+
"source": [
44+
"import numpy as np\n",
45+
"\n",
46+
"# Store blog post data as HASH values in Dragonfly.\n",
47+
"# Since the index is created for all keys with the 'post:' prefix, these documents will be indexed.\n",
48+
"for i, post in posts.iterrows():\n",
49+
" embedding_bytes = np.array(post['embedding']).astype(np.float32).tobytes()\n",
50+
" client.hset(f\"post:{i}\", mapping={**post, 'embedding': embedding_bytes})"
51+
]
52+
},
53+
{
54+
"cell_type": "code",
55+
"execution_count": null,
56+
"id": "6ac91061-a9a5-4224-8cf6-0a7d3f4e6411",
57+
"metadata": {},
58+
"outputs": [],
59+
"source": [
60+
"from redis.commands.search.query import Query\n",
61+
"import openai\n",
62+
"\n",
63+
"# How to get an OpenAI API key: https://platform.openai.com/docs/api-reference/introduction\n",
64+
"# NOTE: Never share your API key or commit it to git. The placeholder below is for local experimentation only; prefer loading the key from an environment variable.\n",
65+
"openai.api_key = \"{YOUR_OPENAI_API_KEY}\"\n",
66+
"EMBEDDING_MODEL = \"text-embedding-ada-002\"\n",
67+
"\n",
68+
"# Create a vector for a query string using the OpenAI API.\n",
69+
"query = \"How to switch from a multi node redis setup to Dragonfly\"\n",
70+
"query_vec = openai.embeddings.create(input=query, model=EMBEDDING_MODEL).data[0].embedding\n",
71+
"\n",
72+
"# Compose a search query for Dragonfly.\n",
73+
"query_expr = Query(\"*=>[KNN 3 @embedding $query_vector AS vector_score]\").return_fields(\"title\", \"vector_score\").paging(0, 30)\n",
74+
"params = {\"query_vector\": np.array(query_vec).astype(dtype=np.float32).tobytes()}\n",
75+
"\n",
76+
"# Search by query.\n",
77+
"docs = client.ft(\"posts\").search(query_expr, params).docs\n",
78+
"for i, doc in enumerate(docs):\n",
79+
" print(i+1, doc.vector_score, doc.title)"
80+
]
81+
},
82+
{
83+
"cell_type": "code",
84+
"execution_count": null,
85+
"id": "50bd113c-c48a-4554-bc32-6847dbe4395a",
86+
"metadata": {},
87+
"outputs": [],
88+
"source": [
89+
"# Get index information.\n",
90+
"client.ft(\"posts\").info()"
91+
]
92+
}
93+
],
94+
"metadata": {
95+
"kernelspec": {
96+
"display_name": "Python 3 (ipykernel)",
97+
"language": "python",
98+
"name": "python3"
99+
},
100+
"language_info": {
101+
"codemirror_mode": {
102+
"name": "ipython",
103+
"version": 3
104+
},
105+
"file_extension": ".py",
106+
"mimetype": "text/x-python",
107+
"name": "python",
108+
"nbconvert_exporter": "python",
109+
"pygments_lexer": "ipython3",
110+
"version": "3.11.4"
111+
}
112+
},
113+
"nbformat": 4,
114+
"nbformat_minor": 5
115+
}

0 commit comments

Comments
 (0)