Merge pull request #361 from junefish/merge-namespaces

zackproser · web-flow · commit c9a2cc955a46 · 2024-06-18T11:14:44.000-04:00
Create colab notebook for merging two namespaces
diff --git a/learn/experimental/merge-namespaces/merge-namespaces.ipynb b/learn/experimental/merge-namespaces/merge-namespaces.ipynb
@@ -0,0 +1,140 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/pinecone-io/examples/blob/master/learn/experimental/merge-namespaces/merge-namespaces.ipynb) [![Open nbviewer](https://raw.githubusercontent.com/pinecone-io/examples/master/assets/nbviewer-shield.svg)](https://nbviewer.org/github/pinecone-io/examples/blob/master/learn/experimental/merge-namespaces/merge-namespaces.ipynb)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Merging Namespaces in a Pinecone Index"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This notebook queries vectors out of two namespaces `ns1` and `ns2` and upserts them to a new namespace named `merged`.\n",
+    "\n",
+    "Please note this code is **experimental** and not guaranteed to work by Pinecone. Test thoroughly before using in production."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%pip install -qU pinecone-notebooks pinecone-client[grpc]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pinecone.grpc import PineconeGRPC as Pinecone\n",
+    "from pinecone import ServerlessSpec\n",
+    "import os\n",
+    "\n",
+    "# The generated API key is available in the PINECONE_API_KEY environment variable\n",
+    "api_key = os.environ.get('PINECONE_API_KEY')\n",
+    "\n",
+    "# Use the API key to initialize the Pinecone client\n",
+    "pc = Pinecone(api_key=api_key)\n",
+    "\n",
+    "# Connect to your index\n",
+    "index_name = \"namespace-test\" # replace with the correct index name\n",
+    "index = pc.Index(index_name)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import random\n",
+    "\n",
+    "dimension = index.describe_index_stats()['dimension']\n",
+    "# Function to fetch all vectors from a namespace\n",
+    "def fetch_all_vectors(namespace):\n",
+    "    count = index.describe_index_stats()['namespaces'][namespace]['vector_count']\n",
+    "    random_vecs = [random.random() for _ in range(dimension)]\n",
+    "    response = index.query(\n",
+    "        namespace=namespace,\n",
+    "        vector=random_vecs,\n",
+    "        top_k=count,\n",
+    "        include_values=True\n",
+    "    )\n",
+    "    return response['matches']\n",
+    "\n",
+    "# Fetch vectors from ns1 and ns2\n",
+    "vectors_ns1 = fetch_all_vectors(\"ns1\")\n",
+    "vectors_ns2 = fetch_all_vectors(\"ns2\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Convert fetched vectors to the required upsert format\n",
+    "def format_vectors_for_upsert(fetched_vectors):\n",
+    "    return [{\"id\": match['id'], \"values\": match['values']} for match in fetched_vectors]\n",
+    "\n",
+    "formatted_vectors_ns1 = format_vectors_for_upsert(vectors_ns1)\n",
+    "formatted_vectors_ns2 = format_vectors_for_upsert(vectors_ns2)\n",
+    "\n",
+    "print(f\"Preparing to upsert {len(formatted_vectors_ns1)} vectors from ns1 and \\\n",
+    "{len(formatted_vectors_ns2)} vectors from ns2\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Note that any vectors with overlapping IDs between `ns1` and `ns2` will be overwritten by the `ns2` upsert."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from itertools import islice\n",
+    "\n",
+    "# Upsert vectors in batches of 100\n",
+    "def chunks(data, size=100):\n",
+    "    it = iter(data)\n",
+    "    for chunk in iter(lambda: tuple(islice(it, size)), ()):\n",
+    "        yield chunk\n",
+    "\n",
+    "# Upsert vectors into the merged namespace\n",
+    "target_namespace = 'merged'\n",
+    "for batch in chunks(formatted_vectors_ns1):\n",
+    "    index.upsert(vectors=batch, namespace=target_namespace)\n",
+    "\n",
+    "for batch in chunks(formatted_vectors_ns2):\n",
+    "    index.upsert(vectors=batch, namespace=target_namespace)\n",
+    "\n",
+    "print(f\"Upserted {len(formatted_vectors_ns1)} vectors from ns1 and \\\n",
+    "{len(formatted_vectors_ns2)} vectors from ns2 into {target_namespace}\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}