TileDB-Inc
diff --git a/‎apis/python/examples/object_api/image_search_from_directory.ipynb‎
Lines changed: 358 additions & 0 deletions b/‎apis/python/examples/object_api/image_search_from_directory.ipynb‎
Lines changed: 358 additions & 0 deletions
diff --git a/‎apis/python/examples/object_api/image_search.ipynb‎ renamed to ‎apis/python/examples/object_api/image_search_from_tiledb.ipynb‎
Lines changed: 15 additions & 15 deletions b/‎apis/python/examples/object_api/image_search.ipynb‎ renamed to ‎apis/python/examples/object_api/image_search_from_tiledb.ipynb‎
Lines changed: 15 additions & 15 deletions
diff --git a/‎apis/python/examples/object_api/text_search_documents.ipynb‎
Lines changed: 176 additions & 0 deletions b/‎apis/python/examples/object_api/text_search_documents.ipynb‎
Lines changed: 176 additions & 0 deletions
@@ -29,10 +29,10 @@
     "classes = np.array([\"dandelion\", \"daisy\", \"tulips\", \"sunflowers\", \"roses\"])\n",
     "\n",
     "dataset = \"tf_flowers\"\n",
-    "base_uri = f\"/tmp/{dataset}_demo\"\n",
+    "base_uri = f\"/tmp/{dataset}_tiledb_demo\"\n",
     "config = {}\n",
-    "image_array_uri = f\"{base_uri}/tf_flowers_sparse\"\n",
-    "metadata_array_uri = f\"{base_uri}/tf_flowers_metadata_sparse\"\n",
+    "image_array_uri = f\"{base_uri}/tf_flowers\"\n",
+    "metadata_array_uri = f\"{base_uri}/tf_flowers_metadata\"\n",
     "index_uri = f\"{base_uri}/index\"\n",
     "vfs = tiledb.VFS(config=config)"
    ]
@@ -116,14 +116,14 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "16/16 [==============================] - 29s 2s/step\n",
-      "16/16 [==============================] - 29s 2s/step\n",
-      "16/16 [==============================] - 30s 2s/step\n",
-      "16/16 [==============================] - 30s 2s/step\n",
-      "6/6 [==============================] - 3s 522ms/step\n",
-      "16/16 [==============================] - 21s 1s/step\n",
-      "16/16 [==============================] - 21s 1s/step\n",
-      "16/16 [==============================] - 20s 1s/step\n"
+      "16/16 [==============================] - 27s 2s/step\n",
+      "16/16 [==============================] - 28s 2s/step\n",
+      "16/16 [==============================] - 28s 2s/step\n",
+      "16/16 [==============================] - 28s 2s/step\n",
+      "6/6 [==============================] - 3s 527ms/step\n",
+      "16/16 [==============================] - 18s 1s/step\n",
+      "16/16 [==============================] - 19s 1s/step\n",
+      "16/16 [==============================] - 18s 1s/step\n"
      ]
     }
    ],
@@ -187,8 +187,8 @@
     "    display(PIL.Image.fromarray(np.reshape(images[\"image\"][image_id, related_image_id], images[\"shape\"][image_id, related_image_id])))\n",
     "index = object_index.ObjectIndex(index_uri, config=config)\n",
     "\n",
-    "rid = random.randint(0,3600)\n",
-    "# rid = 1279\n",
+    "# rid = random.randint(0,3600)\n",
+    "rid = 1279\n",
     "with tiledb.open(image_array_uri, mode='r', config=config) as A:\n",
     "    query_image = A[rid]\n",
     "\n",
@@ -216,7 +216,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "1/1 [==============================] - 1s 887ms/step\n"
+      "1/1 [==============================] - 0s 494ms/step\n"
      ]
     },
     {
@@ -305,7 +305,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "1/1 [==============================] - 0s 81ms/step\n"
+      "1/1 [==============================] - 0s 59ms/step\n"
      ]
     },
     {
 
@@ -0,0 +1,176 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Document text search\n",
+    "\n",
+    "## Setup"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import warnings\n",
+    "warnings.filterwarnings(\"ignore\")\n",
+    "import os\n",
+    "os.environ[\"TOKENIZERS_PARALLELISM\"]=\"true\"\n",
+    "import tiledb\n",
+    "from tiledb.vector_search.object_api import object_index\n",
+    "from tiledb.vector_search.object_readers import DirectoryTextReader\n",
+    "from tiledb.vector_search.embeddings import SentenceTransformersEmbedding\n",
+    "\n",
+    "config = {}  # TileDB config dict handed to the VFS, reader and index calls\n",
+    "vfs = tiledb.VFS(config=config)\n",
+    "dataset = \"documents\"\n",
+    "base_uri = f\"/tmp/{dataset}_demo\"  # root folder of this demo\n",
+    "documents_uri = f\"{base_uri}/documents\"  # input files to index\n",
+    "index_uri = f\"{base_uri}/index\"  # location of the vector index"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Create vector search index\n",
+    "\n",
+    "We point to a local document directory (`documents_uri`, which must already exist and be populated before running this notebook) that contains files of several types (.pdf, .docx, .html, .jpg, .png)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['blogs', '.DS_Store', 'img', 'TileDB_Vector_Search_in_LangChain.docx', 'TileDB_Vector_Search_Updates.docx', 'VLDB17_TileDB.pdf']\n",
+      "['TileDB_Vector_Search_101.html', '.DS_Store']\n",
+      "['.DS_Store', 'TileDB_embedded_arch.png', 'TileDB_cloud_arch.jpg']\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Print the contents of the document folder, then of its blogs/ and img/ sub-folders.\n",
+    "for folder in (documents_uri, f\"{documents_uri}/blogs\", f\"{documents_uri}/img\"):\n",
+    "    print(os.listdir(folder))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Create a vector index using an open-source text embedding model from Hugging Face."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if vfs.is_dir(index_uri):\n",
+    "    vfs.remove_dir(index_uri)  # clear whatever a previous run left at index_uri\n",
+    "vfs.create_dir(index_uri)\n",
+    "\n",
+    "reader = DirectoryTextReader(\n",
+    "    uri=documents_uri,\n",
+    "    glob=\"**/[!.]*\",  # presumably skips dot-files such as .DS_Store\n",
+    "    config=config,\n",
+    "    text_splitter=\"RecursiveCharacterTextSplitter\",\n",
+    "    text_splitter_kwargs={\"chunk_size\": 1000},\n",
+    ")\n",
+    "embedding = SentenceTransformersEmbedding(\n",
+    "    model_name_or_path=\"BAAI/bge-small-en-v1.5\",\n",
+    "    dimensions=384,\n",
+    ")\n",
+    "index = object_index.create(\n",
+    "    uri=index_uri,\n",
+    "    index_type=\"IVF_FLAT\",\n",
+    "    object_reader=reader,\n",
+    "    embedding=embedding,\n",
+    "    config=config,\n",
+    ")\n",
+    "index.update_index(files_per_partition=100, config=config)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Query\n",
+    "\n",
+    "Text similarity query restricted to PDF files through a metadata filter function."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "File: file:///tmp/documents_demo/documents/VLDB17_TileDB.pdf\n",
+      "Text: 359\n",
+      "\n",
+      "6.2 Sparse Arrays\n",
+      "\n",
+      "We next focus on sparse arrays, comparing TileDB with Vertica+Z (gzip-compressed and following SRAM [19]) and SciDB on the AIS dataset. HDF5 is not optimized for sparse arrays, thus we omit it from these experiments.\n"
+     ]
+    }
+   ],
+   "source": [
+    "def display_results(results):\n",
+    "    \"\"\"Print the file path and text of each hit in row 0 of `results`.\"\"\"\n",
+    "    first_row_paths = results[\"file_path\"][0]\n",
+    "    first_row_texts = results[\"text\"][0]\n",
+    "    for pos, chunk in enumerate(first_row_texts):\n",
+    "        # Paths are read by position, in step with the text chunks.\n",
+    "        print(f\"File: {first_row_paths[pos]}\")\n",
+    "        print(f\"Text: {chunk}\")\n",
+    "\n",
+    "def pdf_filter_fn(row):\n",
+    "    return row['file_path'].lower().endswith(\".pdf\")  # match the extension itself; a substring test also accepts e.g. 'a.pdf.docx'\n",
+    "\n",
+    "distances, _, results = index.query(\n",
+    "    {\"text\": [\"sparse arrays\"]},\n",
+    "    metadata_df_filter_fn=pdf_filter_fn,\n",
+    "    k=1,\n",
+    "    nprobe=index.index.partitions,  # probe count set to the index's partition count\n",
+    "    return_objects=False,\n",
+    "    return_metadata=True,\n",
+    ")\n",
+    "display_results(results)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "tiledb_vs_10_arm",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.18"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}