
Commit a49448a

Add Google Vertex AI Vector Search Hybrid Search Documentation (#29064)
Add examples to the documentation showing how to use hybrid search in Vertex AI [Vector Search](langchain-ai/langchain-google#628)
Parent: 0d226de

1 file changed (138 additions, 6 deletions):

docs/docs/integrations/vectorstores/google_vertex_ai_vector_search.ipynb
@@ -70,7 +70,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "id": "dfa92a08",
    "metadata": {},
    "outputs": [],
@@ -91,12 +91,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "id": "c795913e",
    "metadata": {},
    "outputs": [],
    "source": [
-    "embedding_model = VertexAIEmbeddings(model_name=\"textembedding-gecko@003\")"
+    "embedding_model = VertexAIEmbeddings(model_name=\"text-embedding-005\")"
    ]
   },
   {
@@ -722,7 +722,139 @@
    "cell_type": "markdown",
    "id": "31222b03",
    "metadata": {},
-   "source": []
+   "source": [
+    "## Hybrid Search"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b8a308f2",
+   "metadata": {},
+   "source": [
+    "Vector Search supports hybrid search, a popular architecture pattern in information retrieval (IR) that combines both semantic search and keyword search (also called token-based search). With hybrid search, developers can take advantage of the best of the two approaches, effectively providing higher search quality.\n",
+    "Click [here](https://cloud.google.com/vertex-ai/docs/vector-search/about-hybrid-search) to learn more.\n",
+    "\n",
+    "In order to use hybrid search, we need to fit a sparse embedding vectorizer and handle the embeddings outside of the Vector Search integration.\n",
+    "An example of a sparse embedding vectorizer is scikit-learn's TfidfVectorizer, but other techniques such as BM25 can also be used."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "e319402d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Define some sample data\n",
+    "texts = [\n",
+    "    \"The cat sat on\",\n",
+    "    \"the mat.\",\n",
+    "    \"I like to\",\n",
+    "    \"eat pizza for\",\n",
+    "    \"dinner.\",\n",
+    "    \"The sun sets\",\n",
+    "    \"in the west.\",\n",
+    "]\n",
+    "\n",
+    "# Optional IDs\n",
+    "ids = [\"i_\" + str(i + 1) for i in range(len(texts))]\n",
+    "\n",
+    "# Optional metadata\n",
+    "metadatas = [{\"my_metadata\": i} for i in range(len(texts))]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "14efefc1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+    "\n",
+    "# Fit the TF-IDF vectorizer (this is usually done on a very large corpus of data to make sure that word statistics generalize well to new data)\n",
+    "vectorizer = TfidfVectorizer()\n",
+    "vectorizer.fit(texts)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2c7206c2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Utility function to transform text into a TF-IDF sparse vector\n",
+    "def get_sparse_embedding(tfidf_vectorizer, text):\n",
+    "    tfidf_vector = tfidf_vectorizer.transform([text])\n",
+    "    values = []\n",
+    "    dims = []\n",
+    "    for i, tfidf_value in enumerate(tfidf_vector.data):\n",
+    "        values.append(float(tfidf_value))\n",
+    "        dims.append(int(tfidf_vector.indices[i]))\n",
+    "    return {\"values\": values, \"dimensions\": dims}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "0dc5b782",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Semantic (dense) embeddings\n",
+    "embeddings = embedding_model.embed_documents(texts)\n",
+    "# TF-IDF (sparse) embeddings\n",
+    "sparse_embeddings = [get_sparse_embedding(vectorizer, x) for x in texts]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3a353679",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sparse_embeddings[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2623cad9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Add the dense and sparse embeddings to Vector Search\n",
+    "\n",
+    "vector_store.add_texts_with_embeddings(\n",
+    "    texts=texts,\n",
+    "    embeddings=embeddings,\n",
+    "    sparse_embeddings=sparse_embeddings,\n",
+    "    ids=ids,\n",
+    "    metadatas=metadatas,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "29885e38",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Run hybrid search\n",
+    "query = \"the cat\"\n",
+    "embedding = embedding_model.embed_query(query)\n",
+    "sparse_embedding = get_sparse_embedding(vectorizer, query)\n",
+    "\n",
+    "vector_store.similarity_search_by_vector_with_score(\n",
+    "    embedding=embedding,\n",
+    "    sparse_embedding=sparse_embedding,\n",
+    "    k=5,\n",
+    "    rrf_ranking_alpha=0.7,  # 0.7 weight to dense and 0.3 weight to sparse\n",
+    ")"
+   ]
   }
  ],
  "metadata": {
@@ -733,7 +865,7 @@
    "uri": "gcr.io/deeplearning-platform-release/base-cpu:m107"
   },
   "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
+   "display_name": "langchain-google-community-3Os9yvMd-py3.10",
    "language": "python",
    "name": "python3"
   },
@@ -747,7 +879,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.6"
+   "version": "3.10.14"
   }
  },
 "nbformat": 4,

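The rrf_ranking_alpha=0.7 argument in the final cell controls how the dense and sparse result lists are blended: per the inline comment, 0.7 of the weight goes to the dense ranking and 0.3 to the sparse one. The fusion itself happens server-side in Vertex AI; the sketch below only conveys the idea of an alpha-weighted Reciprocal Rank Fusion. The constant k = 60 and the exact formula are assumptions for illustration, not the documented Vertex AI implementation.

def weighted_rrf(dense_ids, sparse_ids, alpha=0.7, k=60):
    # Alpha-weighted Reciprocal Rank Fusion over two ranked id lists.
    # Each document scores alpha / (k + rank) from the dense list plus
    # (1 - alpha) / (k + rank) from the sparse list; higher is better.
    scores = {}
    for rank, doc_id in enumerate(dense_ids):
        scores[doc_id] = scores.get(doc_id, 0.0) + alpha / (k + rank + 1)
    for rank, doc_id in enumerate(sparse_ids):
        scores[doc_id] = scores.get(doc_id, 0.0) + (1 - alpha) / (k + rank + 1)
    return sorted(scores, key=scores.get, reverse=True)

# Dense retrieval favors i_1, sparse (keyword) retrieval favors i_2;
# with alpha=0.7 the dense preference wins: ['i_1', 'i_2', 'i_5', 'i_3']
print(weighted_rrf(["i_1", "i_2", "i_5"], ["i_2", "i_1", "i_3"]))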