Skip to content

Commit d1cf7f6

Browse files
Add directory document readers for vector search Object API (#226)
This adds directory readers that can be used to read documents from a directory structure. We introduce 2 readers: 1. Text reader: Extracts text from different types of documents based on their MIME type. It can be used for several file types (.pdf, .docx, .html, .jpg, .png, etc.) and is implemented by reusing the LangChain document reading utilities. 2. Image document reader: Reads images from image files in a directory. The readers also work for VFS directories. We also introduce multiple notebook examples in `apis/python/examples/object_api/`.
1 parent f021d44 commit d1cf7f6

File tree

9 files changed

+1226
-71
lines changed

9 files changed

+1226
-71
lines changed

apis/python/examples/object_api/image_search_from_directory.ipynb

Lines changed: 358 additions & 0 deletions
Large diffs are not rendered by default.

apis/python/examples/object_api/image_search.ipynb renamed to apis/python/examples/object_api/image_search_from_tiledb.ipynb

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -29,10 +29,10 @@
2929
"classes = np.array([\"dandelion\", \"daisy\", \"tulips\", \"sunflowers\", \"roses\"])\n",
3030
"\n",
3131
"dataset = \"tf_flowers\"\n",
32-
"base_uri = f\"/tmp/{dataset}_demo\"\n",
32+
"base_uri = f\"/tmp/{dataset}_tiledb_demo\"\n",
3333
"config = {}\n",
34-
"image_array_uri = f\"{base_uri}/tf_flowers_sparse\"\n",
35-
"metadata_array_uri = f\"{base_uri}/tf_flowers_metadata_sparse\"\n",
34+
"image_array_uri = f\"{base_uri}/tf_flowers\"\n",
35+
"metadata_array_uri = f\"{base_uri}/tf_flowers_metadata\"\n",
3636
"index_uri = f\"{base_uri}/index\"\n",
3737
"vfs = tiledb.VFS(config=config)"
3838
]
@@ -116,14 +116,14 @@
116116
"name": "stdout",
117117
"output_type": "stream",
118118
"text": [
119-
"16/16 [==============================] - 29s 2s/step\n",
120-
"16/16 [==============================] - 29s 2s/step\n",
121-
"16/16 [==============================] - 30s 2s/step\n",
122-
"16/16 [==============================] - 30s 2s/step\n",
123-
"6/6 [==============================] - 3s 522ms/step\n",
124-
"16/16 [==============================] - 21s 1s/step\n",
125-
"16/16 [==============================] - 21s 1s/step\n",
126-
"16/16 [==============================] - 20s 1s/step\n"
119+
"16/16 [==============================] - 27s 2s/step\n",
120+
"16/16 [==============================] - 28s 2s/step\n",
121+
"16/16 [==============================] - 28s 2s/step\n",
122+
"16/16 [==============================] - 28s 2s/step\n",
123+
"6/6 [==============================] - 3s 527ms/step\n",
124+
"16/16 [==============================] - 18s 1s/step\n",
125+
"16/16 [==============================] - 19s 1s/step\n",
126+
"16/16 [==============================] - 18s 1s/step\n"
127127
]
128128
}
129129
],
@@ -187,8 +187,8 @@
187187
" display(PIL.Image.fromarray(np.reshape(images[\"image\"][image_id, related_image_id], images[\"shape\"][image_id, related_image_id])))\n",
188188
"index = object_index.ObjectIndex(index_uri, config=config)\n",
189189
"\n",
190-
"rid = random.randint(0,3600)\n",
191-
"# rid = 1279\n",
190+
"# rid = random.randint(0,3600)\n",
191+
"rid = 1279\n",
192192
"with tiledb.open(image_array_uri, mode='r', config=config) as A:\n",
193193
" query_image = A[rid]\n",
194194
"\n",
@@ -216,7 +216,7 @@
216216
"name": "stdout",
217217
"output_type": "stream",
218218
"text": [
219-
"1/1 [==============================] - 1s 887ms/step\n"
219+
"1/1 [==============================] - 0s 494ms/step\n"
220220
]
221221
},
222222
{
@@ -305,7 +305,7 @@
305305
"name": "stdout",
306306
"output_type": "stream",
307307
"text": [
308-
"1/1 [==============================] - 0s 81ms/step\n"
308+
"1/1 [==============================] - 0s 59ms/step\n"
309309
]
310310
},
311311
{
Lines changed: 176 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,176 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {},
6+
"source": [
7+
"# Document text search\n",
8+
"\n",
9+
"## Setup"
10+
]
11+
},
12+
{
13+
"cell_type": "code",
14+
"execution_count": 1,
15+
"metadata": {},
16+
"outputs": [],
17+
"source": [
18+
"import warnings\n",
19+
"warnings.filterwarnings(\"ignore\")\n",
20+
"import os\n",
21+
"os.environ[\"TOKENIZERS_PARALLELISM\"]=\"true\"\n",
22+
"import tiledb\n",
23+
"from tiledb.vector_search.object_api import object_index\n",
24+
"from tiledb.vector_search.object_readers import DirectoryTextReader\n",
25+
"from tiledb.vector_search.embeddings import SentenceTransformersEmbedding\n",
26+
"\n",
27+
"dataset = \"documents\"\n",
28+
"base_uri = f\"/tmp/{dataset}_demo\"\n",
29+
"documents_uri = f\"{base_uri}/documents\"\n",
30+
"index_uri = f\"{base_uri}/index\"\n",
31+
"config = {}\n",
32+
"vfs = tiledb.VFS(config=config)"
33+
]
34+
},
35+
{
36+
"cell_type": "markdown",
37+
"metadata": {},
38+
"source": [
39+
"## Create vector search index\n",
40+
"\n",
41+
"We point to a document directory that contains multiple files of different types (.pdf, .docx, .html, .jpg, .png)."
42+
]
43+
},
44+
{
45+
"cell_type": "code",
46+
"execution_count": 2,
47+
"metadata": {},
48+
"outputs": [
49+
{
50+
"name": "stdout",
51+
"output_type": "stream",
52+
"text": [
53+
"['blogs', '.DS_Store', 'img', 'TileDB_Vector_Search_in_LangChain.docx', 'TileDB_Vector_Search_Updates.docx', 'VLDB17_TileDB.pdf']\n",
54+
"['TileDB_Vector_Search_101.html', '.DS_Store']\n",
55+
"['.DS_Store', 'TileDB_embedded_arch.png', 'TileDB_cloud_arch.jpg']\n"
56+
]
57+
}
58+
],
59+
"source": [
60+
"print(os.listdir(documents_uri))\n",
61+
"print(os.listdir(f\"{documents_uri}/blogs\"))\n",
62+
"print(os.listdir(f\"{documents_uri}/img\"))"
63+
]
64+
},
65+
{
66+
"cell_type": "markdown",
67+
"metadata": {},
68+
"source": [
69+
"Create a vector index using an open-source text embedding function from Hugging Face."
70+
]
71+
},
72+
{
73+
"cell_type": "code",
74+
"execution_count": null,
75+
"metadata": {},
76+
"outputs": [],
77+
"source": [
78+
"if vfs.is_dir(index_uri):\n",
79+
" vfs.remove_dir(index_uri)\n",
80+
"vfs.create_dir(index_uri)\n",
81+
"\n",
82+
"reader = DirectoryTextReader(\n",
83+
" uri=documents_uri, \n",
84+
" glob=\"**/[!.]*\",\n",
85+
" config=config,\n",
86+
" text_splitter=\"RecursiveCharacterTextSplitter\",\n",
87+
" text_splitter_kwargs={\"chunk_size\":1000}\n",
88+
" )\n",
89+
"embedding = SentenceTransformersEmbedding(model_name_or_path='BAAI/bge-small-en-v1.5', dimensions=384)\n",
90+
"index = object_index.create(\n",
91+
" uri=index_uri,\n",
92+
" index_type=\"IVF_FLAT\",\n",
93+
" object_reader=reader,\n",
94+
" embedding=embedding,\n",
95+
" config=config,\n",
96+
")\n",
97+
"index.update_index(\n",
98+
" files_per_partition=100,\n",
99+
" config=config,\n",
100+
")"
101+
]
102+
},
103+
{
104+
"cell_type": "markdown",
105+
"metadata": {},
106+
"source": [
107+
"## Query\n",
108+
"\n",
109+
"Text similarity query restricted by file type"
110+
]
111+
},
112+
{
113+
"cell_type": "code",
114+
"execution_count": 4,
115+
"metadata": {},
116+
"outputs": [
117+
{
118+
"name": "stdout",
119+
"output_type": "stream",
120+
"text": [
121+
"File: file:///tmp/documents_demo/documents/VLDB17_TileDB.pdf\n",
122+
"Text: 359\n",
123+
"\n",
124+
"6.2 Sparse Arrays\n",
125+
"\n",
126+
"We next focus on sparse arrays, comparing TileDB with Vertica+Z (gzip-compressed and following SRAM [19]) and SciDB on the AIS dataset. HDF5 is not optimized for sparse arrays, thus we omit it from these experiments.\n"
127+
]
128+
}
129+
],
130+
"source": [
131+
"def display_results(results):\n",
132+
" file_paths = results[\"file_path\"][0]\n",
133+
" texts = results[\"text\"][0]\n",
134+
" i = 0\n",
135+
" for text in texts:\n",
136+
" print(f\"File: {file_paths[i]}\")\n",
137+
" print(f\"Text: {text}\")\n",
138+
" i += 1\n",
139+
"\n",
140+
"def pdf_filter_fn(row):\n",
141+
" return \".pdf\" in row['file_path']\n",
142+
"\n",
143+
"distances, _, results = index.query(\n",
144+
" {\"text\": [\"sparse arrays\"]}, \n",
145+
" metadata_df_filter_fn=pdf_filter_fn,\n",
146+
" k=1,\n",
147+
" nprobe=index.index.partitions,\n",
148+
" return_objects=False,\n",
149+
" return_metadata=True,\n",
150+
" )\n",
151+
"display_results(results)"
152+
]
153+
}
154+
],
155+
"metadata": {
156+
"kernelspec": {
157+
"display_name": "tiledb_vs_10_arm",
158+
"language": "python",
159+
"name": "python3"
160+
},
161+
"language_info": {
162+
"codemirror_mode": {
163+
"name": "ipython",
164+
"version": 3
165+
},
166+
"file_extension": ".py",
167+
"mimetype": "text/x-python",
168+
"name": "python",
169+
"nbconvert_exporter": "python",
170+
"pygments_lexer": "ipython3",
171+
"version": "3.9.18"
172+
}
173+
},
174+
"nbformat": 4,
175+
"nbformat_minor": 2
176+
}

0 commit comments

Comments
 (0)