|
| 1 | +{ |
| 2 | + "cells": [ |
| 3 | + { |
| 4 | + "cell_type": "markdown", |
| 5 | + "metadata": {}, |
| 6 | + "source": [ |
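| 7 | + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/pinecone-io/examples/blob/master/learn/experimental/merge-namespaces/merge-namespaces.ipynb) [![Open nbviewer](https://raw.githubusercontent.com/pinecone-io/examples/master/assets/nbviewer-shield.svg)](https://nbviewer.org/github/pinecone-io/examples/blob/master/learn/experimental/merge-namespaces/merge-namespaces.ipynb)" |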
| 8 | + ] |
| 9 | + }, |
| 10 | + { |
| 11 | + "cell_type": "markdown", |
| 12 | + "metadata": {}, |
| 13 | + "source": [ |
| 14 | + "# Merging Namespaces in a Pinecone Index" |
| 15 | + ] |
| 16 | + }, |
| 17 | + { |
| 18 | + "cell_type": "markdown", |
| 19 | + "metadata": {}, |
| 20 | + "source": [ |
| 21 | + "This notebook queries vectors out of two namespaces `ns1` and `ns2` and upserts them to a new namespace named `merged`.\n", |
| 22 | + "\n", |
| 23 | + "Please note that this code is **experimental**: Pinecone does not guarantee that it works. Test it thoroughly before using it in production." |
| 24 | + ] |
| 25 | + }, |
| 26 | + { |
| 27 | + "cell_type": "code", |
| 28 | + "execution_count": null, |
| 29 | + "metadata": {}, |
| 30 | + "outputs": [], |
| 31 | + "source": [ |
| 32 | + "# Quote the extras spec so shells that glob on [...] (e.g. zsh) pass it to pip unchanged\n", |
| 33 | + "%pip install -qU pinecone-notebooks \"pinecone-client[grpc]\"" |
| 33 | + ] |
| 34 | + }, |
| 35 | + { |
| 36 | + "cell_type": "code", |
| 37 | + "execution_count": null, |
| 38 | + "metadata": {}, |
| 39 | + "outputs": [], |
| 40 | + "source": [ |
| 41 | + "from pinecone.grpc import PineconeGRPC as Pinecone\n", |
| 42 | + "from pinecone import ServerlessSpec\n", |
| 43 | + "import os\n", |
| 44 | + "\n", |
| 45 | + "# Read the API key from the PINECONE_API_KEY environment variable (None when it is unset)\n", |
| 46 | + "api_key = os.getenv(\"PINECONE_API_KEY\")\n", |
| 47 | + "\n", |
| 48 | + "# Create the Pinecone client with that key\n", |
| 49 | + "pc = Pinecone(api_key=api_key)\n", |
| 50 | + "\n", |
| 51 | + "# Open a handle to the target index -- set index_name to your own index\n", |
| 52 | + "index_name = \"namespace-test\"\n", |
| 53 | + "index = pc.Index(index_name)" |
| 54 | + ] |
| 55 | + }, |
| 56 | + { |
| 57 | + "cell_type": "code", |
| 58 | + "execution_count": null, |
| 59 | + "metadata": {}, |
| 60 | + "outputs": [], |
| 61 | + "source": [ |
| 62 | + "import random\n", |
| 63 | + "\n", |
| 64 | + "dimension = index.describe_index_stats()['dimension']\n", |
| 65 | + "\n", |
| 66 | + "\n", |
| 67 | + "def fetch_all_vectors(namespace):\n", |
| 68 | + "    \"\"\"Return every match stored in `namespace`, with vector values included.\n", |
| 69 | + "\n", |
| 70 | + "    Runs a single query against a random vector with `top_k` set to the\n", |
| 71 | + "    namespace's vector count, so the ranking does not matter: the query is\n", |
| 72 | + "    only a way to get all vectors back. Only `include_values` is set on the\n", |
| 73 | + "    query; metadata is not requested.\n", |
| 74 | + "\n", |
| 75 | + "    Args:\n", |
| 76 | + "        namespace: name of the namespace to read.\n", |
| 77 | + "\n", |
| 78 | + "    Returns:\n", |
| 79 | + "        The `matches` list of the query response, or an empty list when the\n", |
| 80 | + "        namespace reports a vector count of 0.\n", |
| 81 | + "\n", |
| 82 | + "    Raises:\n", |
| 83 | + "        KeyError: if the index stats have no entry for `namespace`.\n", |
| 84 | + "    \"\"\"\n", |
| 85 | + "    try:\n", |
| 86 | + "        count = index.describe_index_stats()['namespaces'][namespace]['vector_count']\n", |
| 87 | + "    except KeyError:\n", |
| 88 | + "        # Same exception type as before, but with a message that names the problem\n", |
| 89 | + "        raise KeyError(f'namespace {namespace!r} is not listed in the index stats') from None\n", |
| 90 | + "    if count == 0:\n", |
| 91 | + "        # Nothing to fetch; this also avoids sending a query with top_k=0\n", |
| 92 | + "        return []\n", |
| 93 | + "    # NOTE(review): the query API caps top_k, so this only suits namespaces\n", |
| 94 | + "    # smaller than that cap -- confirm the limit for your index before relying on it\n", |
| 95 | + "    random_vecs = [random.random() for _ in range(dimension)]\n", |
| 96 | + "    response = index.query(\n", |
| 97 | + "        namespace=namespace,\n", |
| 98 | + "        vector=random_vecs,\n", |
| 99 | + "        top_k=count,\n", |
| 100 | + "        include_values=True\n", |
| 101 | + "    )\n", |
| 102 | + "    return response['matches']\n", |
| 76 | + "\n", |
| 77 | + "# Pull the contents of both source namespaces in one statement;\n", |
| 78 | + "# each result is the list of matches returned by fetch_all_vectors\n", |
| 79 | + "vectors_ns1, vectors_ns2 = map(fetch_all_vectors, (\"ns1\", \"ns2\"))" |
| 80 | + ] |
| 81 | + }, |
| 82 | + { |
| 83 | + "cell_type": "code", |
| 84 | + "execution_count": null, |
| 85 | + "metadata": {}, |
| 86 | + "outputs": [], |
| 87 | + "source": [ |
| 88 | + "def format_vectors_for_upsert(fetched_vectors):\n", |
| 89 | + "    \"\"\"Turn fetched matches into the list of dict records passed to upsert.\n", |
| 90 | + "\n", |
| 91 | + "    Each record keeps only the match's 'id' and 'values'; any other field\n", |
| 92 | + "    on the match is not copied.\n", |
| 93 | + "    \"\"\"\n", |
| 94 | + "    records = []\n", |
| 95 | + "    for item in fetched_vectors:\n", |
| 96 | + "        records.append({\"id\": item['id'], \"values\": item['values']})\n", |
| 97 | + "    return records\n", |
| 91 | + "\n", |
| 92 | + "# Build the upsert payloads for both namespaces\n", |
| 93 | + "formatted_vectors_ns1, formatted_vectors_ns2 = map(format_vectors_for_upsert, (vectors_ns1, vectors_ns2))\n", |
| 94 | + "\n", |
| 95 | + "print(f\"Preparing to upsert {len(formatted_vectors_ns1)} vectors from ns1 and \\\n", |
| 96 | + "{len(formatted_vectors_ns2)} vectors from ns2\")" |
| 97 | + ] |
| 98 | + }, |
| 99 | + { |
| 100 | + "cell_type": "markdown", |
| 101 | + "metadata": {}, |
| 102 | + "source": [ |
| 103 | + "Note that if the same vector ID exists in both `ns1` and `ns2`, the `ns1` vector is overwritten by the `ns2` vector, because `ns2` is upserted last." |
| 104 | + ] |
| 105 | + }, |
| 106 | + { |
| 107 | + "cell_type": "code", |
| 108 | + "execution_count": null, |
| 109 | + "metadata": {}, |
| 110 | + "outputs": [], |
| 111 | + "source": [ |
| 112 | + "from itertools import islice\n", |
| 113 | + "\n", |
| 114 | + "def chunks(data, size=100):\n", |
| 115 | + "    \"\"\"Yield consecutive tuples of at most `size` items taken from `data`.\n", |
| 116 | + "\n", |
| 117 | + "    The default of 100 is the batch size used for the upserts below.\n", |
| 118 | + "    Iteration stops at the first empty tuple, i.e. once `data` is exhausted.\n", |
| 119 | + "    \"\"\"\n", |
| 120 | + "    remaining = iter(data)\n", |
| 121 | + "    while True:\n", |
| 122 | + "        piece = tuple(islice(remaining, size))\n", |
| 123 | + "        if not piece:\n", |
| 124 | + "            return\n", |
| 125 | + "        yield piece\n", |
| 119 | + "\n", |
| 120 | + "# Write both payloads into the destination namespace, ns1 first and ns2 second,\n", |
| 121 | + "# so an id present in both ends up with the ns2 values\n", |
| 122 | + "target_namespace = 'merged'\n", |
| 123 | + "\n", |
| 124 | + "for namespace_vectors in (formatted_vectors_ns1, formatted_vectors_ns2):\n", |
| 125 | + "    for batch in chunks(namespace_vectors):\n", |
| 126 | + "        index.upsert(vectors=batch, namespace=target_namespace)\n", |
| 127 | + "\n", |
| 128 | + "print(f\"Upserted {len(formatted_vectors_ns1)} vectors from ns1 and \\\n", |
| 129 | + "{len(formatted_vectors_ns2)} vectors from ns2 into {target_namespace}\")" |
| 130 | + ] |
| 131 | + } |
| 132 | + ], |
| 133 | + "metadata": { |
| 134 | + "language_info": { |
| 135 | + "name": "python" |
| 136 | + } |
| 137 | + }, |
| 138 | + "nbformat": 4, |
| 139 | + "nbformat_minor": 2 |
| 140 | +} |
0 commit comments