diff --git a/python-recipes/RAG/01_redisvl.ipynb b/python-recipes/RAG/01_redisvl.ipynb
index cca9ae64..fef526d3 100644
--- a/python-recipes/RAG/01_redisvl.ipynb
+++ b/python-recipes/RAG/01_redisvl.ipynb
@@ -1,1684 +1,2113 @@
{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "R2-i8jBl9GRH"
- },
- "source": [
- "\n",
- "\n",
- "# RAG from scratch with the Redis Vector Library\n",
- "\n",
- "\n",
- "In this recipe we will cover the basic of the Redis Vector Library and build a basic RAG app from scratch.\n",
- "\n",
- "## Let's Begin!\n",
- "
\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "rT9HzsnQ1uiz"
- },
- "source": [
- "## Environment Setup\n",
- "\n",
- "### Pull Github Materials\n",
- "Because you are likely running this notebook in **Google Colab**, we need to first\n",
- "pull the necessary dataset and materials directly from GitHub.\n",
- "\n",
- "**If you are running this notebook locally**, FYI you may not need to perform this\n",
- "step at all."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "AJJ2UW6M1ui0",
- "outputId": "0f5773b7-a292-4ee6-f4bd-20dc40ca2aba"
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Cloning into 'temp_repo'...\n",
- "remote: Enumerating objects: 384, done.\u001b[K\n",
- "remote: Counting objects: 100% (247/247), done.\u001b[K\n",
- "remote: Compressing objects: 100% (159/159), done.\u001b[K\n",
- "remote: Total 384 (delta 135), reused 153 (delta 74), pack-reused 137 (from 1)\u001b[K\n",
- "Receiving objects: 100% (384/384), 64.50 MiB | 8.97 MiB/s, done.\n",
- "Resolving deltas: 100% (159/159), done.\n"
- ]
- }
- ],
- "source": [
- "# NBVAL_SKIP\n",
- "!git clone https://github.com/redis-developer/redis-ai-resources.git temp_repo\n",
- "!mv temp_repo/python-recipes/RAG/resources .\n",
- "!rm -rf temp_repo"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "Z67mf6T91ui2"
- },
- "source": [
- "### Install Python Dependencies"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "DgxBQFXQ1ui2",
- "outputId": "c3c399d6-e294-4a3a-a0a3-82d818509991"
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/261.4 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[90m╺\u001b[0m \u001b[32m256.0/261.4 kB\u001b[0m \u001b[31m21.8 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m261.4/261.4 kB\u001b[0m \u001b[31m4.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m96.1/96.1 kB\u001b[0m \u001b[31m6.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.4/2.4 MB\u001b[0m \u001b[31m55.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m298.0/298.0 kB\u001b[0m \u001b[31m18.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.0/1.0 MB\u001b[0m \u001b[31m53.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m408.7/408.7 kB\u001b[0m \u001b[31m27.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m \u001b[31m3.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m7.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m49.5/49.5 kB\u001b[0m \u001b[31m4.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
- "\u001b[?25h"
- ]
- }
- ],
- "source": [
- "%pip install -q redis \"redisvl>=0.4.1\" langchain-community pypdf sentence-transformers langchain openai pandas"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "peC8ThuVJkD7"
- },
- "source": [
- "### Install Redis Stack\n",
- "\n",
- "Later in this tutorial, Redis will be used to store, index, and query vector\n",
- "embeddings created from PDF document chunks. **We need to make sure we have a Redis\n",
- "instance available.**"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "zMKHJ7oWJkD8"
- },
- "source": [
- "#### For Colab\n",
- "Use the shell script below to download, extract, and install [Redis Stack](https://redis.io/docs/getting-started/install-stack/) directly from the Redis package archive."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "c0d5lfNxJkD8",
- "outputId": "f96e72fa-b9f3-476f-bc9e-328bd30d1344"
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "deb [signed-by=/usr/share/keyrings/redis-archive-keyring.gpg] https://packages.redis.io/deb jammy main\n",
- "Starting redis-stack-server, database path /var/lib/redis-stack\n"
- ]
- }
- ],
- "source": [
- "# NBVAL_SKIP\n",
- "%%sh\n",
- "curl -fsSL https://packages.redis.io/gpg | sudo gpg --dearmor -o /usr/share/keyrings/redis-archive-keyring.gpg\n",
- "echo \"deb [signed-by=/usr/share/keyrings/redis-archive-keyring.gpg] https://packages.redis.io/deb $(lsb_release -cs) main\" | sudo tee /etc/apt/sources.list.d/redis.list\n",
- "sudo apt-get update > /dev/null 2>&1\n",
- "sudo apt-get install redis-stack-server > /dev/null 2>&1\n",
- "redis-stack-server --daemonize yes"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "2arb8Ic0JkD8"
- },
- "source": [
- "#### For Alternative Environments\n",
- "There are many ways to get the necessary redis-stack instance running\n",
- "1. On cloud, deploy a [FREE instance of Redis in the cloud](https://redis.com/try-free/). Or, if you have your\n",
- "own version of Redis Enterprise running, that works too!\n",
- "2. Per OS, [see the docs](https://redis.io/docs/latest/operate/oss_and_stack/install/install-stack/)\n",
- "3. With docker: `docker run -d --name redis-stack-server -p 6379:6379 redis/redis-stack-server:latest`"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "DhP1w0R9JkD8"
- },
- "source": [
- "### Define the Redis Connection URL\n",
- "\n",
- "By default this notebook connects to the local instance of Redis Stack. **If you have your own Redis Enterprise instance** - replace REDIS_PASSWORD, REDIS_HOST and REDIS_PORT values with your own."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {
- "id": "ggh5TzhkJkD9"
- },
- "outputs": [],
- "source": [
- "import os\n",
- "\n",
- "# Replace values below with your own if using Redis Cloud instance\n",
- "REDIS_HOST = os.getenv(\"REDIS_HOST\", \"localhost\") # ex: \"redis-18374.c253.us-central1-1.gce.cloud.redislabs.com\"\n",
- "REDIS_PORT = os.getenv(\"REDIS_PORT\", \"6379\") # ex: 18374\n",
- "REDIS_PASSWORD = os.getenv(\"REDIS_PASSWORD\", \"\") # ex: \"1TNxTEdYRDgIDKM2gDfasupCADXXXX\"\n",
- "\n",
- "# If SSL is enabled on the endpoint, use rediss:// as the URL prefix\n",
- "REDIS_URL = f\"redis://:{REDIS_PASSWORD}@{REDIS_HOST}:{REDIS_PORT}\""
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "b3ErDmsIJkD9"
- },
- "source": [
- "## Simplified Vector Search with RedisVL"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "KrtWWU4I1ui3"
- },
- "source": [
- "### Dataset Preparation (PDF Documents)\n",
- "\n",
- "To best demonstrate Redis as a vector database layer, we will load a single\n",
- "financial (10k filings) doc and preprocess it using some helpers from LangChain:\n",
- "\n",
- "- `PyPDFLoader` is not the only document loader type that LangChain provides. Docs: https://python.langchain.com/docs/integrations/document_loaders/pypdfloader/\n",
- "- `RecursiveCharacterTextSplitter` is what we use to create smaller chunks of text from the doc. Docs: https://python.langchain.com/docs/modules/data_connection/document_transformers/text_splitters/recursive_text_splitter"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "uijl2qFH1ui3",
- "outputId": "a99b3fcb-7cfd-4dbd-f258-57779cfcae3c"
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Listing available documents ... ['resources/nke-10k-2023.pdf', 'resources/amzn-10k-2023.pdf', 'resources/jnj-10k-2023.pdf', 'resources/aapl-10k-2023.pdf', 'resources/testset_15.csv', 'resources/retrieval_basic_rag_test.csv', 'resources/2022-chevy-colorado-ebrochure.pdf', 'resources/nvd-10k-2023.pdf', 'resources/testset.csv', 'resources/msft-10k-2023.pdf', 'resources/propositions.json', 'resources/generation_basic_rag_test.csv']\n"
- ]
- }
- ],
- "source": [
- "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
- "from langchain_community.document_loaders import PyPDFLoader\n",
- "\n",
- "# Load list of pdfs from a folder\n",
- "data_path = \"resources/\"\n",
- "docs = [os.path.join(data_path, file) for file in os.listdir(data_path)]\n",
- "\n",
- "print(\"Listing available documents ...\", docs)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "anya8hVnT6K_",
- "outputId": "a8430acc-2e6d-45fd-fc8b-601fbbd8289b"
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Done preprocessing. Created 211 chunks of the original pdf resources/nke-10k-2023.pdf\n"
- ]
- }
- ],
- "source": [
- "# pick out the Nike doc for this exercise\n",
- "doc = [doc for doc in docs if \"nke\" in doc][0]\n",
- "\n",
- "# set up the file loader/extractor and text splitter to create chunks\n",
- "text_splitter = RecursiveCharacterTextSplitter(\n",
- " chunk_size=2500, chunk_overlap=0\n",
- ")\n",
- "loader = PyPDFLoader(doc, headers = None)\n",
- "\n",
- "# extract, load, and make chunks\n",
- "chunks = loader.load_and_split(text_splitter)\n",
- "\n",
- "print(\"Done preprocessing. Created\", len(chunks), \"chunks of the original pdf\", doc)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "fDN4XopTJkD9"
- },
- "source": [
- "### Text embedding generation with RedisVL\n",
- "RedisVL has built-in extensions and utilities to aid the GenAI development process. In the following snipit we utilize the HFTextVectorizer redisvl in tandem with the **all-MiniLM-L6-v2** class to generate vector embeddings for the chunks created above. These embeddings capture the \"meaning\" of the text so that we can retrieve the relevant chunks later when a user's query is semantically related."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 661,
- "referenced_widgets": [
- "cbd44245af844dca8e568691cc1c15c5",
- "3109d0d320274ad0bb941608ee3df5e3",
- "6c902ce903bb4e25a127ec277e2b2c45",
- "954b76e059024b15be48fb5064ab2fb7",
- "160c4567015f4b1bba43dc7e1e4712fb",
- "712fcb54fabc430c9567240a2ddd4a76",
- "f96ce89375924097ab9f4cd130fd7b41",
- "58c687581a8d4d3a828686cd066a32b3",
- "df2305a9a6634dffbc08567f62047b27",
- "218e8977786b42e1b825a14d44164d82",
- "8bc8cb91c6274c08a72c91c91dddf4ef",
- "abee8aeb772f48dab4661dca40277788",
- "300b9716084a4a24bf479ae7200b87d1",
- "ff76433f165146f0b39d2488a33b318e",
- "98fe1e1e066541ec942a05ec416fa53f",
- "be9c6f9905fd440884261e09367fe659",
- "9d7bd9a50eea407eb60c41c1534f295d",
- "968f389c21cf469daee8284a7b14c251",
- "39f7677d9d8a4bdf8f4eb4756fae3ed2",
- "959248b437054a43a0393c71a603b35f",
- "6b3711002db148f790eab617f7f40eb4",
- "5a3363012166483d90abb10b476772bf",
- "92e02308d4d94725b73cc324d8cd9906",
- "6fe679c08e2b46dd8657160d974912e0",
- "61fc922ce98c4fefbebe7bb6a8ee9317",
- "2cc139350de742989b6e24d70e490a54",
- "995465a251f64f7a9c1e5541a7f28d4d",
- "56b8c445444b4d39b2c9fb199586ff93",
- "5f2ad751dab24f6aaae736c01e582c14",
- "54331fe70c934a7894903d5ca7a960ce",
- "6270fcf4772f40d59a6f6842060f36a4",
- "14e24b722ecf47a49ebe42e8c3492c1e",
- "b5e36e428e3541fd8a237d0f28a023e1",
- "6aa3f285fd8a4a84882b7bece1b639ac",
- "d20425f4a0594c319bc51ee60d773f79",
- "a046d9ff7e1d4577ab28315d681ac36b",
- "c9468d94408a4d36a20eae07624a6a09",
- "902551f09b44499b8c8dd88bbdf50a4a",
- "5477b553050e42c0b8ed7c2c8c17c025",
- "fcbac845d7c24db6a85e82f190e69a75",
- "82f4af2b827c4d98a762c2e7ebd03d6e",
- "146de95acc214f60b854553ab983b7ae",
- "a356517795234ab6abb3ffd71b05f296",
- "1757bba5dca64bf3b7d359cd2537e9c5",
- "59d890877f8b4f7aa436fa4b82e4cf8d",
- "9a0acbad43204038b8ca4edeeb0e0d61",
- "38518362236e470898cdbfb48ee0d381",
- "9aac56d1808d490797bbb175c5afb226",
- "2f848e63b87847d1a299c04052d567d6",
- "52395bed9f6d455897d8d489e7dcb0d3",
- "4e2332a6f482448597a9d4988fec7cf6",
- "ac55276fbd5a4404ba065a19849119c5",
- "fae66f22c38247ad85078f6ad2530ced",
- "a3fcad6db08c4f07adf4ee817afce77a",
- "557fb6c9f787412a8bff6f4798087bb7",
- "a4c7c73d90cf44acb43740b223be8101",
- "010e7ce97cfb43f195d1dd1811584ea2",
- "484f1fc0b5844726b3ac203440ddbdc8",
- "9368d437c3534a33b0010ea77be8a5e2",
- "50c576ca5f914c65aeb5b7c03f4b0fa2",
- "80bcb933a16c40788a3ad354e545acfe",
- "2bfc17a97664452787740dc202eae370",
- "600f4d36b66d40ecb8353db981d0f1f4",
- "1cb7ce33be9345e992769fb7cdeb0e75",
- "f1204ffea0da4058a3973e6d79a8d36c",
- "b91aa35f8bfb4cb29724a0cf864a3158",
- "b225fd0da4c24d97a502a2df731d1037",
- "9ed0c298163645a8a10f7704354b3d2c",
- "3a2d93764f7645258777f75d2a33b214",
- "4d21de5d79b74e7d9dc5ccfb36827358",
- "927cb59be15747418fba1a56d7e22e21",
- "4a5e1f7a57d446e980090aae0325b990",
- "33175a3341134f7ebba6232440e9a770",
- "d503a8e5ea4f4bc089c4ae3e95ce1af4",
- "73ffa18b349849fdb7264b748b4189e9",
- "316f2f8a79ad4b0aa140f149383b2eff",
- "1c9b5e2acf0141898ab2a0639a79d209",
- "dd6707fe0bae4aab842dac25bf31880d",
- "4682a7ebe86a4a60ab6b793718435302",
- "1617b257e66c409db6c4ca0d0944a933",
- "63825f6200a944bd8c66602a64eee67c",
- "6cad7dfb6dd4441fb569c5533ef044e8",
- "1a76918edd75460e8d572e59d3aa5413",
- "1b3112662eb2481087fb3af6e79a4480",
- "23127b47d99d406c9a53520a3697972b",
- "1cb27bb3b5354879b7f1a73a24df923d",
- "77f646bb598d471cacdf772d9799a8df",
- "66782c677c2040d0ae19e7c6da6186ce",
- "c24f6df83a0b46ecbad2be4583d3bb1b",
- "9101630e52a04193804e02341e38830a",
- "9c9441eac4fe46078709fbf9c84c4a4e",
- "e9ecac569557483d89b848e31b1a4f85",
- "a641f0330b134a48844212dd72dafa57",
- "9e2c06d967be46ecbb56e0e0268c9a65",
- "da39e3fbf61941dc9fc05d00fb44a468",
- "a516325f85594525aac760a5c0d1a0d2",
- "55529d65863a4a5fb25dca02f0e885e2",
- "532e6cc744b54e12a677f33af75318f0",
- "c9c3f643f9b0472ab9dce2649139bb6a",
- "26d0829f64b248ada2b0f46b746cd8b1",
- "448556b65d2f419ca6cd395ce6d11f3f",
- "c0cf7a81656c4fd98d2418fd6336c6ae",
- "5c88eed231d14f2da8961a4ac7837417",
- "b4ca94c7f8534b4e857c57a619a7f116",
- "c18a7f2b29e54916ba81510b2bb21902",
- "067c697db37d43d8b6fa3b155a794f00",
- "006473c1d4a247208c17d3258909adb0",
- "8375e9fcaa4a46d895dc074cfed92149",
- "56cb8feab6c047ca8afb2acfda4d35d1",
- "29ce854a35e94a47af82522cc9f8a92b",
- "8e394c924a00479ba046afb5eeacc5f3",
- "86148800470449979a8baeb58b5f5c88",
- "386648192f9e403680aa57d1444e4465",
- "c12d9b3dfbe045a3bfba0ecd790af191",
- "0dbce80382dc41429050a896f3203c4e",
- "90e4273246e44f7c95db4456a00755a3",
- "d57525fd237d4c519e52c76ee7208a30",
- "6db6a832f6b44c3eb82f93fd60fda7fb",
- "dfcbee09be344b2f8b55ef1c9ddfbd76",
- "0428e3d1575c4ac6b6dfca617d144b7d",
- "dc42c19d950943a88630242dd188c1a7",
- "3fb33de4563749d7827c735380453b58",
- "3d8d6ea4a4ef4493b8033bcc62476375",
- "e7693807a9154e7482b4611be6421a0d",
- "150b6eaa9bd64dce908775d230740038",
- "4b59623304314a35b030ff805e5bf699",
- "1bf348fa5757429790b9272f037fc93a",
- "470138741a50479bb930f00a060cc61e",
- "589f8fbac4e0492e81e35cc6424a75bc",
- "2d92057e09554dcdbe405aafc0f602db",
- "6eb2d7bb05f442519211928645384c3a",
- "d2206237f06a4419a7304a199dff2e8a",
- "40f12f8bb6a04034b8c7a95d984469f2",
- "98e4143c2bbb42cea2566686eff2fa6a",
- "981b3a05c8ae42d29ffb81156ebc1a7d",
- "b8513aac81224b139347dfe5011f1563",
- "09c487bb35b6439aaa298665873ee84b",
- "da636d6c421f49f48ef43db194faae5e",
- "958bab205e204f87bce793f79869a28b",
- "8e93910fca484d93ab2eddea9540d307",
- "0a6226f65d354c55b3370c6e87dcc246",
- "685026baa834438aa8060a9e681c3263",
- "fe189eed0a834221bd8adb0bdc44b4c8"
- ]
- },
- "id": "N3iQ2aLEJkD9",
- "outputId": "b0f0d2c1-41dc-4932-990b-53d2912af19e"
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "True"
- ]
- },
- "execution_count": 7,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "import warnings\n",
- "import pandas as pd\n",
- "from tqdm.auto import tqdm\n",
- "from redisvl.utils.vectorize import HFTextVectorizer\n",
- "\n",
- "warnings.filterwarnings(\"ignore\")\n",
- "\n",
- "hf = HFTextVectorizer(\"sentence-transformers/all-MiniLM-L6-v2\")\n",
- "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n",
- "\n",
- "# Embed each chunk content\n",
- "embeddings = hf.embed_many([chunk.page_content for chunk in chunks])\n",
- "\n",
- "# Check to make sure we've created enough embeddings, 1 per document chunk\n",
- "len(embeddings) == len(chunks)"
- ]
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "R2-i8jBl9GRH"
+ },
+ "source": [
+ "\n",
+ "\n",
+ "# RAG from scratch with the Redis Vector Library\n",
+ "\n",
+ "\n",
+    "In this recipe we will cover the basics of the Redis Vector Library and build a basic RAG app from scratch.\n",
+ "\n",
+ "## Let's Begin!\n",
+ "
\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "rT9HzsnQ1uiz"
+ },
+ "source": [
+ "## Environment Setup\n",
+ "\n",
+ "### Pull Github Materials\n",
+ "Because you are likely running this notebook in **Google Colab**, we need to first\n",
+ "pull the necessary dataset and materials directly from GitHub.\n",
+ "\n",
+ "**If you are running this notebook locally**, FYI you may not need to perform this\n",
+ "step at all."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
},
+ "id": "AJJ2UW6M1ui0",
+ "outputId": "0f5773b7-a292-4ee6-f4bd-20dc40ca2aba",
+ "ExecuteTime": {
+ "end_time": "2025-04-24T04:41:18.607703Z",
+ "start_time": "2025-04-24T04:41:11.664107Z"
+ }
+ },
+ "source": [
+ "# NBVAL_SKIP\n",
+ "!git clone https://github.com/redis-developer/redis-ai-resources.git temp_repo\n",
+ "!mv temp_repo/python-recipes/RAG/resources .\n",
+ "!rm -rf temp_repo"
+ ],
+ "outputs": [
{
- "cell_type": "markdown",
- "metadata": {
- "id": "5baI0xDQ1ui-"
- },
- "source": [
- "### Define a schema and create an index\n",
- "\n",
- "Below we connect to Redis and create an index that contains a text field, tag field, and vector field."
- ]
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Cloning into 'temp_repo'...\r\n",
+ "remote: Enumerating objects: 679, done.\u001B[K\r\n",
+ "remote: Counting objects: 100% (330/330), done.\u001B[Kjects: 82% (271/330)\u001B[K\r\n",
+ "remote: Compressing objects: 100% (214/214), done.\u001B[K\r\n",
+ "remote: Total 679 (delta 227), reused 148 (delta 115), pack-reused 349 (from 2)\u001B[K\r\n",
+ "Receiving objects: 100% (679/679), 57.80 MiB | 11.09 MiB/s, done.\r\n",
+ "Resolving deltas: 100% (295/295), done.\r\n",
+ "mv: rename temp_repo/python-recipes/RAG/resources to ./resources: Directory not empty\r\n"
+ ]
+ }
+ ],
+ "execution_count": 8
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "Z67mf6T91ui2"
+ },
+ "source": [
+ "### Install Python Dependencies"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
},
+ "id": "DgxBQFXQ1ui2",
+ "outputId": "c3c399d6-e294-4a3a-a0a3-82d818509991",
+ "ExecuteTime": {
+ "end_time": "2025-04-24T04:41:20.572419Z",
+ "start_time": "2025-04-24T04:41:18.616143Z"
+ }
+ },
+ "source": [
+ "%pip install -q redis \"redisvl>=0.4.1\" langchain-community pypdf sentence-transformers langchain openai pandas"
+ ],
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {
- "id": "zB1EW_9n1ui-"
- },
- "outputs": [],
- "source": [
- "from redisvl.index import SearchIndex\n",
- "\n",
- "\n",
- "index_name = \"redisvl\"\n",
- "\n",
- "\n",
- "schema = {\n",
- " \"index\": {\n",
- " \"name\": index_name,\n",
- " \"prefix\": \"chunk\"\n",
- " },\n",
- " \"fields\": [\n",
- " {\n",
- " \"name\": \"chunk_id\",\n",
- " \"type\": \"tag\",\n",
- " \"attrs\": {\n",
- " \"sortable\": True\n",
- " }\n",
- " },\n",
- " {\n",
- " \"name\": \"content\",\n",
- " \"type\": \"text\"\n",
- " },\n",
- " {\n",
- " \"name\": \"text_embedding\",\n",
- " \"type\": \"vector\",\n",
- " \"attrs\": {\n",
- " \"dims\": 384,\n",
- " \"distance_metric\": \"cosine\",\n",
- " \"algorithm\": \"hnsw\",\n",
- " \"datatype\": \"float32\"\n",
- " }\n",
- " }\n",
- " ]\n",
- "}"
- ]
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\r\n",
+ "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m A new release of pip is available: \u001B[0m\u001B[31;49m24.0\u001B[0m\u001B[39;49m -> \u001B[0m\u001B[32;49m25.0.1\u001B[0m\r\n",
+ "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m To update, run: \u001B[0m\u001B[32;49mpip install --upgrade pip\u001B[0m\r\n",
+ "Note: you may need to restart the kernel to use updated packages.\n"
+ ]
+ }
+ ],
+ "execution_count": 9
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "peC8ThuVJkD7"
+ },
+ "source": [
+ "### Install Redis Stack\n",
+ "\n",
+ "Later in this tutorial, Redis will be used to store, index, and query vector\n",
+ "embeddings created from PDF document chunks. **We need to make sure we have a Redis\n",
+ "instance available.**"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "zMKHJ7oWJkD8"
+ },
+ "source": [
+ "#### For Colab\n",
+ "Use the shell script below to download, extract, and install [Redis Stack](https://redis.io/docs/getting-started/install-stack/) directly from the Redis package archive."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
},
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "LKuQku2CJkD9"
- },
- "outputs": [],
- "source": [
- "# create an index from schema and the client\n",
- "index = SearchIndex.from_dict(schema, redis_url=REDIS_URL)\n",
- "index.create(overwrite=True, drop=True)"
- ]
+ "id": "c0d5lfNxJkD8",
+ "outputId": "f96e72fa-b9f3-476f-bc9e-328bd30d1344"
+ },
+ "source": [
+ "# NBVAL_SKIP\n",
+ "%%sh\n",
+ "curl -fsSL https://packages.redis.io/gpg | sudo gpg --dearmor -o /usr/share/keyrings/redis-archive-keyring.gpg\n",
+ "echo \"deb [signed-by=/usr/share/keyrings/redis-archive-keyring.gpg] https://packages.redis.io/deb $(lsb_release -cs) main\" | sudo tee /etc/apt/sources.list.d/redis.list\n",
+ "sudo apt-get update > /dev/null 2>&1\n",
+ "sudo apt-get install redis-stack-server > /dev/null 2>&1\n",
+ "redis-stack-server --daemonize yes"
+ ],
+ "outputs": [],
+ "execution_count": null
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "2arb8Ic0JkD8"
+ },
+ "source": [
+ "#### For Alternative Environments\n",
+    "There are many ways to get the necessary redis-stack instance running:\n",
+ "1. On cloud, deploy a [FREE instance of Redis in the cloud](https://redis.com/try-free/). Or, if you have your\n",
+ "own version of Redis Enterprise running, that works too!\n",
+ "2. Per OS, [see the docs](https://redis.io/docs/latest/operate/oss_and_stack/install/install-stack/)\n",
+ "3. With docker: `docker run -d --name redis-stack-server -p 6379:6379 redis/redis-stack-server:latest`"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "DhP1w0R9JkD8"
+ },
+ "source": [
+ "### Define the Redis Connection URL\n",
+ "\n",
+ "By default this notebook connects to the local instance of Redis Stack. **If you have your own Redis Enterprise instance** - replace REDIS_PASSWORD, REDIS_HOST and REDIS_PORT values with your own."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "ggh5TzhkJkD9",
+ "ExecuteTime": {
+ "end_time": "2025-04-24T16:46:45.583246Z",
+ "start_time": "2025-04-24T16:46:45.581177Z"
+ }
+ },
+ "source": [
+ "import os\n",
+ "\n",
+ "# Replace values below with your own if using Redis Cloud instance\n",
+ "REDIS_HOST = os.getenv(\"REDIS_HOST\", \"localhost\") # ex: \"redis-18374.c253.us-central1-1.gce.cloud.redislabs.com\"\n",
+ "REDIS_PORT = os.getenv(\"REDIS_PORT\", \"6379\") # ex: 18374\n",
+ "REDIS_PASSWORD = os.getenv(\"REDIS_PASSWORD\", \"\") # ex: \"1TNxTEdYRDgIDKM2gDfasupCADXXXX\"\n",
+ "\n",
+ "# If SSL is enabled on the endpoint, use rediss:// as the URL prefix\n",
+ "REDIS_URL = f\"redis://:{REDIS_PASSWORD}@{REDIS_HOST}:{REDIS_PORT}\""
+ ],
+ "outputs": [],
+ "execution_count": 3
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "b3ErDmsIJkD9"
+ },
+ "source": [
+ "## Simplified Vector Search with RedisVL"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "KrtWWU4I1ui3"
+ },
+ "source": [
+ "### Dataset Preparation (PDF Documents)\n",
+ "\n",
+ "To best demonstrate Redis as a vector database layer, we will load a single\n",
+ "financial (10k filings) doc and preprocess it using some helpers from LangChain:\n",
+ "\n",
+ "- `PyPDFLoader` is not the only document loader type that LangChain provides. Docs: https://python.langchain.com/docs/integrations/document_loaders/pypdfloader/\n",
+ "- `RecursiveCharacterTextSplitter` is what we use to create smaller chunks of text from the doc. Docs: https://python.langchain.com/docs/how_to/recursive_text_splitter/"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
},
+ "id": "uijl2qFH1ui3",
+ "outputId": "a99b3fcb-7cfd-4dbd-f258-57779cfcae3c",
+ "ExecuteTime": {
+ "end_time": "2025-04-24T16:46:46.043726Z",
+ "start_time": "2025-04-24T16:46:45.600472Z"
+ }
+ },
+ "source": [
+ "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
+ "from langchain_community.document_loaders import PyPDFLoader\n",
+ "\n",
+ "# Load list of pdfs from a folder\n",
+ "data_path = \"resources/\"\n",
+ "docs = [os.path.join(data_path, file) for file in os.listdir(data_path)]\n",
+ "\n",
+ "print(\"Listing available documents ...\", docs)"
+ ],
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "L6GOqmeN1ui_",
- "outputId": "91a199e3-d087-4b15-9544-d59efa6033c5"
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\u001b[32m14:47:36\u001b[0m \u001b[34m[RedisVL]\u001b[0m \u001b[1;30mINFO\u001b[0m Indices:\n",
- "\u001b[32m14:47:36\u001b[0m \u001b[34m[RedisVL]\u001b[0m \u001b[1;30mINFO\u001b[0m 1. redisvl\n",
- "\u001b[32m14:47:36\u001b[0m \u001b[34m[RedisVL]\u001b[0m \u001b[1;30mINFO\u001b[0m 2. movies\n"
- ]
- }
- ],
- "source": [
- "# use the RedisVL CLI tool to list all indices\n",
- "!rvl index listall"
- ]
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Listing available documents ... ['resources/nke-10k-2023.pdf', 'resources/amzn-10k-2023.pdf', 'resources/jnj-10k-2023.pdf', 'resources/aapl-10k-2023.pdf', 'resources/testset_15.csv', 'resources/retrieval_basic_rag_test.csv', 'resources/2022-chevy-colorado-ebrochure.pdf', 'resources/nvd-10k-2023.pdf', 'resources/testset.csv', 'resources/msft-10k-2023.pdf', 'resources/propositions.json', 'resources/generation_basic_rag_test.csv']\n"
+ ]
+ }
+ ],
+ "execution_count": 4
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
},
+ "id": "anya8hVnT6K_",
+ "outputId": "a8430acc-2e6d-45fd-fc8b-601fbbd8289b",
+ "ExecuteTime": {
+ "end_time": "2025-04-24T16:46:50.509810Z",
+ "start_time": "2025-04-24T16:46:46.104219Z"
+ }
+ },
+ "source": [
+ "# pick out the Nike doc for this exercise\n",
+ "doc = [doc for doc in docs if \"nke\" in doc][0]\n",
+ "\n",
+ "# set up the file loader/extractor and text splitter to create chunks\n",
+ "text_splitter = RecursiveCharacterTextSplitter(\n",
+ " chunk_size=2500, chunk_overlap=0\n",
+ ")\n",
+ "loader = PyPDFLoader(doc, headers = None)\n",
+ "\n",
+ "# extract, load, and make chunks\n",
+ "chunks = loader.load_and_split(text_splitter)\n",
+ "\n",
+ "print(\"Done preprocessing. Created\", len(chunks), \"chunks of the original pdf\", doc)"
+ ],
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "C70C-UWj1ujA",
- "outputId": "1fb7a2d6-ae6d-4536-b4b7-702620efd128"
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "\n",
- "Index Information:\n",
- "╭──────────────┬────────────────┬────────────┬─────────────────┬────────────╮\n",
- "│ Index Name │ Storage Type │ Prefixes │ Index Options │ Indexing │\n",
- "├──────────────┼────────────────┼────────────┼─────────────────┼────────────┤\n",
- "│ redisvl │ HASH │ ['chunk'] │ [] │ 0 │\n",
- "╰──────────────┴────────────────┴────────────┴─────────────────┴────────────╯\n",
- "Index Fields:\n",
- "╭────────────────┬────────────────┬────────┬────────────────┬────────────────┬────────────────┬────────────────┬────────────────┬────────────────┬─────────────────┬────────────────┬────────────────┬────────────────┬─────────────────┬────────────────╮\n",
- "│ Name │ Attribute │ Type │ Field Option │ Option Value │ Field Option │ Option Value │ Field Option │ Option Value │ Field Option │ Option Value │ Field Option │ Option Value │ Field Option │ Option Value │\n",
- "├────────────────┼────────────────┼────────┼────────────────┼────────────────┼────────────────┼────────────────┼────────────────┼────────────────┼─────────────────┼────────────────┼────────────────┼────────────────┼─────────────────┼────────────────┤\n",
- "│ chunk_id │ chunk_id │ TAG │ SEPARATOR │ , │ │ │ │ │ │ │ │ │ │ │\n",
- "│ content │ content │ TEXT │ WEIGHT │ 1 │ │ │ │ │ │ │ │ │ │ │\n",
- "│ text_embedding │ text_embedding │ VECTOR │ algorithm │ HNSW │ data_type │ FLOAT32 │ dim │ 384 │ distance_metric │ COSINE │ M │ 16 │ ef_construction │ 200 │\n",
- "╰────────────────┴────────────────┴────────┴────────────────┴────────────────┴────────────────┴────────────────┴────────────────┴────────────────┴─────────────────┴────────────────┴────────────────┴────────────────┴─────────────────┴────────────────╯\n"
- ]
- }
- ],
- "source": [
- "# get info about the index\n",
- "!rvl index info -i redisvl"
- ]
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Done preprocessing. Created 211 chunks of the original pdf resources/nke-10k-2023.pdf\n"
+ ]
+ }
+ ],
+ "execution_count": 5
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "fDN4XopTJkD9"
+ },
+ "source": [
+ "### Text embedding generation with RedisVL\n",
+    "RedisVL has built-in extensions and utilities to aid the GenAI development process. In the following snippet we utilize redisvl's HFTextVectorizer class in tandem with the **all-MiniLM-L6-v2** model to generate vector embeddings for the chunks created above. These embeddings capture the \"meaning\" of the text so that we can retrieve the relevant chunks later when a user's query is semantically related."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 661,
+ "referenced_widgets": [
+ "cbd44245af844dca8e568691cc1c15c5",
+ "3109d0d320274ad0bb941608ee3df5e3",
+ "6c902ce903bb4e25a127ec277e2b2c45",
+ "954b76e059024b15be48fb5064ab2fb7",
+ "160c4567015f4b1bba43dc7e1e4712fb",
+ "712fcb54fabc430c9567240a2ddd4a76",
+ "f96ce89375924097ab9f4cd130fd7b41",
+ "58c687581a8d4d3a828686cd066a32b3",
+ "df2305a9a6634dffbc08567f62047b27",
+ "218e8977786b42e1b825a14d44164d82",
+ "8bc8cb91c6274c08a72c91c91dddf4ef",
+ "abee8aeb772f48dab4661dca40277788",
+ "300b9716084a4a24bf479ae7200b87d1",
+ "ff76433f165146f0b39d2488a33b318e",
+ "98fe1e1e066541ec942a05ec416fa53f",
+ "be9c6f9905fd440884261e09367fe659",
+ "9d7bd9a50eea407eb60c41c1534f295d",
+ "968f389c21cf469daee8284a7b14c251",
+ "39f7677d9d8a4bdf8f4eb4756fae3ed2",
+ "959248b437054a43a0393c71a603b35f",
+ "6b3711002db148f790eab617f7f40eb4",
+ "5a3363012166483d90abb10b476772bf",
+ "92e02308d4d94725b73cc324d8cd9906",
+ "6fe679c08e2b46dd8657160d974912e0",
+ "61fc922ce98c4fefbebe7bb6a8ee9317",
+ "2cc139350de742989b6e24d70e490a54",
+ "995465a251f64f7a9c1e5541a7f28d4d",
+ "56b8c445444b4d39b2c9fb199586ff93",
+ "5f2ad751dab24f6aaae736c01e582c14",
+ "54331fe70c934a7894903d5ca7a960ce",
+ "6270fcf4772f40d59a6f6842060f36a4",
+ "14e24b722ecf47a49ebe42e8c3492c1e",
+ "b5e36e428e3541fd8a237d0f28a023e1",
+ "6aa3f285fd8a4a84882b7bece1b639ac",
+ "d20425f4a0594c319bc51ee60d773f79",
+ "a046d9ff7e1d4577ab28315d681ac36b",
+ "c9468d94408a4d36a20eae07624a6a09",
+ "902551f09b44499b8c8dd88bbdf50a4a",
+ "5477b553050e42c0b8ed7c2c8c17c025",
+ "fcbac845d7c24db6a85e82f190e69a75",
+ "82f4af2b827c4d98a762c2e7ebd03d6e",
+ "146de95acc214f60b854553ab983b7ae",
+ "a356517795234ab6abb3ffd71b05f296",
+ "1757bba5dca64bf3b7d359cd2537e9c5",
+ "59d890877f8b4f7aa436fa4b82e4cf8d",
+ "9a0acbad43204038b8ca4edeeb0e0d61",
+ "38518362236e470898cdbfb48ee0d381",
+ "9aac56d1808d490797bbb175c5afb226",
+ "2f848e63b87847d1a299c04052d567d6",
+ "52395bed9f6d455897d8d489e7dcb0d3",
+ "4e2332a6f482448597a9d4988fec7cf6",
+ "ac55276fbd5a4404ba065a19849119c5",
+ "fae66f22c38247ad85078f6ad2530ced",
+ "a3fcad6db08c4f07adf4ee817afce77a",
+ "557fb6c9f787412a8bff6f4798087bb7",
+ "a4c7c73d90cf44acb43740b223be8101",
+ "010e7ce97cfb43f195d1dd1811584ea2",
+ "484f1fc0b5844726b3ac203440ddbdc8",
+ "9368d437c3534a33b0010ea77be8a5e2",
+ "50c576ca5f914c65aeb5b7c03f4b0fa2",
+ "80bcb933a16c40788a3ad354e545acfe",
+ "2bfc17a97664452787740dc202eae370",
+ "600f4d36b66d40ecb8353db981d0f1f4",
+ "1cb7ce33be9345e992769fb7cdeb0e75",
+ "f1204ffea0da4058a3973e6d79a8d36c",
+ "b91aa35f8bfb4cb29724a0cf864a3158",
+ "b225fd0da4c24d97a502a2df731d1037",
+ "9ed0c298163645a8a10f7704354b3d2c",
+ "3a2d93764f7645258777f75d2a33b214",
+ "4d21de5d79b74e7d9dc5ccfb36827358",
+ "927cb59be15747418fba1a56d7e22e21",
+ "4a5e1f7a57d446e980090aae0325b990",
+ "33175a3341134f7ebba6232440e9a770",
+ "d503a8e5ea4f4bc089c4ae3e95ce1af4",
+ "73ffa18b349849fdb7264b748b4189e9",
+ "316f2f8a79ad4b0aa140f149383b2eff",
+ "1c9b5e2acf0141898ab2a0639a79d209",
+ "dd6707fe0bae4aab842dac25bf31880d",
+ "4682a7ebe86a4a60ab6b793718435302",
+ "1617b257e66c409db6c4ca0d0944a933",
+ "63825f6200a944bd8c66602a64eee67c",
+ "6cad7dfb6dd4441fb569c5533ef044e8",
+ "1a76918edd75460e8d572e59d3aa5413",
+ "1b3112662eb2481087fb3af6e79a4480",
+ "23127b47d99d406c9a53520a3697972b",
+ "1cb27bb3b5354879b7f1a73a24df923d",
+ "77f646bb598d471cacdf772d9799a8df",
+ "66782c677c2040d0ae19e7c6da6186ce",
+ "c24f6df83a0b46ecbad2be4583d3bb1b",
+ "9101630e52a04193804e02341e38830a",
+ "9c9441eac4fe46078709fbf9c84c4a4e",
+ "e9ecac569557483d89b848e31b1a4f85",
+ "a641f0330b134a48844212dd72dafa57",
+ "9e2c06d967be46ecbb56e0e0268c9a65",
+ "da39e3fbf61941dc9fc05d00fb44a468",
+ "a516325f85594525aac760a5c0d1a0d2",
+ "55529d65863a4a5fb25dca02f0e885e2",
+ "532e6cc744b54e12a677f33af75318f0",
+ "c9c3f643f9b0472ab9dce2649139bb6a",
+ "26d0829f64b248ada2b0f46b746cd8b1",
+ "448556b65d2f419ca6cd395ce6d11f3f",
+ "c0cf7a81656c4fd98d2418fd6336c6ae",
+ "5c88eed231d14f2da8961a4ac7837417",
+ "b4ca94c7f8534b4e857c57a619a7f116",
+ "c18a7f2b29e54916ba81510b2bb21902",
+ "067c697db37d43d8b6fa3b155a794f00",
+ "006473c1d4a247208c17d3258909adb0",
+ "8375e9fcaa4a46d895dc074cfed92149",
+ "56cb8feab6c047ca8afb2acfda4d35d1",
+ "29ce854a35e94a47af82522cc9f8a92b",
+ "8e394c924a00479ba046afb5eeacc5f3",
+ "86148800470449979a8baeb58b5f5c88",
+ "386648192f9e403680aa57d1444e4465",
+ "c12d9b3dfbe045a3bfba0ecd790af191",
+ "0dbce80382dc41429050a896f3203c4e",
+ "90e4273246e44f7c95db4456a00755a3",
+ "d57525fd237d4c519e52c76ee7208a30",
+ "6db6a832f6b44c3eb82f93fd60fda7fb",
+ "dfcbee09be344b2f8b55ef1c9ddfbd76",
+ "0428e3d1575c4ac6b6dfca617d144b7d",
+ "dc42c19d950943a88630242dd188c1a7",
+ "3fb33de4563749d7827c735380453b58",
+ "3d8d6ea4a4ef4493b8033bcc62476375",
+ "e7693807a9154e7482b4611be6421a0d",
+ "150b6eaa9bd64dce908775d230740038",
+ "4b59623304314a35b030ff805e5bf699",
+ "1bf348fa5757429790b9272f037fc93a",
+ "470138741a50479bb930f00a060cc61e",
+ "589f8fbac4e0492e81e35cc6424a75bc",
+ "2d92057e09554dcdbe405aafc0f602db",
+ "6eb2d7bb05f442519211928645384c3a",
+ "d2206237f06a4419a7304a199dff2e8a",
+ "40f12f8bb6a04034b8c7a95d984469f2",
+ "98e4143c2bbb42cea2566686eff2fa6a",
+ "981b3a05c8ae42d29ffb81156ebc1a7d",
+ "b8513aac81224b139347dfe5011f1563",
+ "09c487bb35b6439aaa298665873ee84b",
+ "da636d6c421f49f48ef43db194faae5e",
+ "958bab205e204f87bce793f79869a28b",
+ "8e93910fca484d93ab2eddea9540d307",
+ "0a6226f65d354c55b3370c6e87dcc246",
+ "685026baa834438aa8060a9e681c3263",
+ "fe189eed0a834221bd8adb0bdc44b4c8"
+ ]
},
+ "id": "N3iQ2aLEJkD9",
+ "outputId": "b0f0d2c1-41dc-4932-990b-53d2912af19e",
+ "ExecuteTime": {
+ "end_time": "2025-04-24T16:46:55.588165Z",
+ "start_time": "2025-04-24T16:46:50.528240Z"
+ }
+ },
+ "source": [
+ "import warnings\n",
+ "import pandas as pd\n",
+ "from redisvl.utils.vectorize import HFTextVectorizer, BaseVectorizer\n",
+ "\n",
+ "warnings.filterwarnings(\"ignore\")\n",
+ "\n",
+ "hf = HFTextVectorizer(\"sentence-transformers/all-MiniLM-L6-v2\")\n",
+ "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n",
+ "\n",
+ "# Embed each chunk content\n",
+ "embeddings = hf.embed_many([chunk.page_content for chunk in chunks])\n",
+ "\n",
+ "# Check to make sure we've created enough embeddings, 1 per document chunk\n",
+ "len(embeddings) == len(chunks)"
+ ],
+ "outputs": [
{
- "cell_type": "markdown",
- "metadata": {
- "id": "Qrj-jeGmBRTL"
- },
- "source": [
- "### Process and load dataset\n",
- "Below we use the RedisVL index to simply load the list of document chunks to Redis db."
+ "data": {
+ "text/plain": [
+ "True"
]
- },
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "execution_count": 6
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "5baI0xDQ1ui-"
+ },
+ "source": [
+ "### Define a schema and create an index\n",
+ "\n",
+ "Below we connect to Redis and create an index that contains a text field, tag field, and vector field."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "zB1EW_9n1ui-",
+ "ExecuteTime": {
+ "end_time": "2025-04-24T16:46:55.611260Z",
+ "start_time": "2025-04-24T16:46:55.598846Z"
+ }
+ },
+ "source": [
+ "from redisvl.index import SearchIndex\n",
+ "\n",
+ "\n",
+ "index_name = \"redisvl\"\n",
+ "\n",
+ "\n",
+ "schema = {\n",
+ " \"index\": {\n",
+ " \"name\": index_name,\n",
+ " \"prefix\": \"chunk\"\n",
+ " },\n",
+ " \"fields\": [\n",
+ " {\n",
+ " \"name\": \"chunk_id\",\n",
+ " \"type\": \"tag\",\n",
+ " \"attrs\": {\n",
+ " \"sortable\": True\n",
+ " }\n",
+ " },\n",
+ " {\n",
+ " \"name\": \"content\",\n",
+ " \"type\": \"text\"\n",
+ " },\n",
+ " {\n",
+ " \"name\": \"text_embedding\",\n",
+ " \"type\": \"vector\",\n",
+ " \"attrs\": {\n",
+ " \"dims\": 384,\n",
+ " \"distance_metric\": \"cosine\",\n",
+ " \"algorithm\": \"hnsw\",\n",
+ " \"datatype\": \"float32\"\n",
+ " }\n",
+ " }\n",
+ " ]\n",
+ "}"
+ ],
+ "outputs": [],
+ "execution_count": 7
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "LKuQku2CJkD9",
+ "ExecuteTime": {
+ "end_time": "2025-04-24T16:46:55.630056Z",
+ "start_time": "2025-04-24T16:46:55.620207Z"
+ }
+ },
+ "source": [
+ "# create an index from schema and the client\n",
+ "index = SearchIndex.from_dict(schema, redis_url=REDIS_URL)\n",
+ "index.create(overwrite=True, drop=True)"
+ ],
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {
- "id": "Zsg09Keg1ujA"
- },
- "outputs": [],
- "source": [
- "# load expects an iterable of dictionaries\n",
- "from redisvl.redis.utils import array_to_buffer\n",
- "\n",
- "data = [\n",
- " {\n",
- " 'chunk_id': i,\n",
- " 'content': chunk.page_content,\n",
- " # For HASH -- must convert embeddings to bytes\n",
- " 'text_embedding': array_to_buffer(embeddings[i], dtype='float32')\n",
- " } for i, chunk in enumerate(chunks)\n",
- "]\n",
- "\n",
- "# RedisVL handles batching automatically\n",
- "keys = index.load(data, id_field=\"chunk_id\")"
- ]
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "09:46:55 redisvl.index.index INFO Index already exists, overwriting.\n"
+ ]
+ }
+ ],
+ "execution_count": 8
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
},
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "-ZsFB-6Z1ujB"
- },
- "source": [
- "### Query the database\n",
- "Now we can use the RedisVL index to perform similarity search operations with Redis"
- ]
+ "id": "L6GOqmeN1ui_",
+ "outputId": "91a199e3-d087-4b15-9544-d59efa6033c5"
+ },
+ "source": [
+ "# use the RedisVL CLI tool to list all indices\n",
+ "!rvl index listall"
+ ],
+ "outputs": [],
+ "execution_count": null
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
},
+ "id": "C70C-UWj1ujA",
+ "outputId": "1fb7a2d6-ae6d-4536-b4b7-702620efd128",
+ "ExecuteTime": {
+ "end_time": "2025-04-24T16:46:56.828176Z",
+ "start_time": "2025-04-24T16:46:56.283831Z"
+ }
+ },
+ "source": [
+ "# get info about the index\n",
+ "!rvl index info -i redisvl"
+ ],
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 85,
- "referenced_widgets": [
- "c75d5ab2049146e580efab9da9bbcdb0",
- "9ce1fb951e79468baa9d1aebfa4c4fae",
- "e96d1546380146078c18ec78363f7dac",
- "a3c36bb0d3b74c8ea56bf03521465b81",
- "9f306cfd66dc441aba923d4e051911fc",
- "9e3289444cb142c29ad7d569be2e25b8",
- "c20443e17308425596679c0544dab528",
- "f0bdd8f4d7b84bd5a1c209c591ce8787",
- "126743b52b254e54aa4f65bcb9e65aea",
- "debae380e6d24fb8ae712a6dd2226152",
- "aacb6f8ca39846d89e1e4e96656e3a36"
- ]
- },
- "id": "BkFv-_iC1ujB",
- "outputId": "c398d356-6bb7-43a9-ca95-cb7f167d1f38"
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "'*=>[KNN 3 @text_embedding $vector AS vector_distance] RETURN 3 chunk_id content vector_distance SORTBY vector_distance ASC DIALECT 2 LIMIT 0 3'"
- ]
- },
- "execution_count": 11,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "from redisvl.query import VectorQuery\n",
- "\n",
- "query = \"Nike profit margins and company performance\"\n",
- "\n",
- "query_embedding = hf.embed(query)\n",
- "\n",
- "vector_query = VectorQuery(\n",
- " vector=query_embedding,\n",
- " vector_field_name=\"text_embedding\",\n",
- " num_results=3,\n",
- " return_fields=[\"chunk_id\", \"content\"],\n",
- " return_score=True\n",
- ")\n",
- "\n",
- "# show the raw redis query\n",
- "str(vector_query)"
- ]
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\r\n",
+ "\r\n",
+ "Index Information:\r\n",
+ "╭──────────────┬────────────────┬────────────┬─────────────────┬────────────╮\r\n",
+ "│ Index Name │ Storage Type │ Prefixes │ Index Options │ Indexing │\r\n",
+ "├──────────────┼────────────────┼────────────┼─────────────────┼────────────┤\r\n",
+ "│ redisvl │ HASH │ ['chunk'] │ [] │ 0 │\r\n",
+ "╰──────────────┴────────────────┴────────────┴─────────────────┴────────────╯\r\n",
+ "Index Fields:\r\n",
+ "╭────────────────┬────────────────┬────────┬────────────────┬────────────────┬────────────────┬────────────────┬────────────────┬────────────────┬─────────────────┬────────────────┬────────────────┬────────────────┬─────────────────┬────────────────╮\r\n",
+ "│ Name │ Attribute │ Type │ Field Option │ Option Value │ Field Option │ Option Value │ Field Option │ Option Value │ Field Option │ Option Value │ Field Option │ Option Value │ Field Option │ Option Value │\r\n",
+ "├────────────────┼────────────────┼────────┼────────────────┼────────────────┼────────────────┼────────────────┼────────────────┼────────────────┼─────────────────┼────────────────┼────────────────┼────────────────┼─────────────────┼────────────────┤\r\n",
+ "│ chunk_id │ chunk_id │ TAG │ SEPARATOR │ , │ │ │ │ │ │ │ │ │ │ │\r\n",
+ "│ content │ content │ TEXT │ WEIGHT │ 1 │ │ │ │ │ │ │ │ │ │ │\r\n",
+ "│ text_embedding │ text_embedding │ VECTOR │ algorithm │ HNSW │ data_type │ FLOAT32 │ dim │ 384 │ distance_metric │ COSINE │ M │ 16 │ ef_construction │ 200 │\r\n",
+ "╰────────────────┴────────────────┴────────┴────────────────┴────────────────┴────────────────┴────────────────┴────────────────┴────────────────┴─────────────────┴────────────────┴────────────────┴────────────────┴─────────────────┴────────────────╯\r\n"
+ ]
+ }
+ ],
+ "execution_count": 10
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "Qrj-jeGmBRTL"
+ },
+ "source": [
+ "### Process and load dataset\n",
+ "Below we use the RedisVL index to simply load the list of document chunks into the Redis db."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "Zsg09Keg1ujA",
+ "ExecuteTime": {
+ "end_time": "2025-04-24T16:46:56.895623Z",
+ "start_time": "2025-04-24T16:46:56.836700Z"
+ }
+ },
+ "source": [
+ "# load expects an iterable of dictionaries\n",
+ "from redisvl.redis.utils import array_to_buffer\n",
+ "\n",
+ "data = [\n",
+ " {\n",
+ " 'chunk_id': i,\n",
+ " 'content': chunk.page_content,\n",
+ " # For HASH -- must convert embeddings to bytes\n",
+ " 'text_embedding': array_to_buffer(embeddings[i], dtype='float32')\n",
+ " } for i, chunk in enumerate(chunks)\n",
+ "]\n",
+ "\n",
+ "# RedisVL handles batching automatically\n",
+ "keys = index.load(data, id_field=\"chunk_id\")"
+ ],
+ "outputs": [],
+ "execution_count": 11
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "-ZsFB-6Z1ujB"
+ },
+ "source": [
+ "### Query the database\n",
+ "Now we can use the RedisVL index to perform similarity search operations with Redis."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 85,
+ "referenced_widgets": [
+ "c75d5ab2049146e580efab9da9bbcdb0",
+ "9ce1fb951e79468baa9d1aebfa4c4fae",
+ "e96d1546380146078c18ec78363f7dac",
+ "a3c36bb0d3b74c8ea56bf03521465b81",
+ "9f306cfd66dc441aba923d4e051911fc",
+ "9e3289444cb142c29ad7d569be2e25b8",
+ "c20443e17308425596679c0544dab528",
+ "f0bdd8f4d7b84bd5a1c209c591ce8787",
+ "126743b52b254e54aa4f65bcb9e65aea",
+ "debae380e6d24fb8ae712a6dd2226152",
+ "aacb6f8ca39846d89e1e4e96656e3a36"
+ ]
},
+ "id": "BkFv-_iC1ujB",
+ "outputId": "c398d356-6bb7-43a9-ca95-cb7f167d1f38",
+ "ExecuteTime": {
+ "end_time": "2025-04-24T16:46:56.991529Z",
+ "start_time": "2025-04-24T16:46:56.903370Z"
+ }
+ },
+ "source": [
+ "from redisvl.query import VectorQuery\n",
+ "\n",
+ "query = \"Nike profit margins and company performance\"\n",
+ "\n",
+ "query_embedding = hf.embed(query)\n",
+ "\n",
+ "vector_query = VectorQuery(\n",
+ " vector=query_embedding,\n",
+ " vector_field_name=\"text_embedding\",\n",
+ " num_results=3,\n",
+ " return_fields=[\"chunk_id\", \"content\"],\n",
+ " return_score=True\n",
+ ")\n",
+ "\n",
+ "# show the raw redis query\n",
+ "str(vector_query)"
+ ],
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 143
- },
- "id": "5reL5qTW1ujC",
- "outputId": "dd58f191-54f5-4226-c4e1-70207d58f2dc"
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " id | \n",
- " vector_distance | \n",
- " chunk_id | \n",
- " content | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " chunk:87 | \n",
- " 0.334264576435 | \n",
- " 87 | \n",
- " Asia Pacific & Latin America 1,932 1,896 2 % 1... | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " chunk:79 | \n",
- " 0.350993335247 | \n",
- " 79 | \n",
- " Table of Contents\\nCONSOLIDA TED OPERA TING RE... | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " chunk:86 | \n",
- " 0.371814072132 | \n",
- " 86 | \n",
- " Table of Contents\\nOPERA TING SEGMENTS\\nAs dis... | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " id vector_distance chunk_id \\\n",
- "0 chunk:87 0.334264576435 87 \n",
- "1 chunk:79 0.350993335247 79 \n",
- "2 chunk:86 0.371814072132 86 \n",
- "\n",
- " content \n",
- "0 Asia Pacific & Latin America 1,932 1,896 2 % 1... \n",
- "1 Table of Contents\\nCONSOLIDA TED OPERA TING RE... \n",
- "2 Table of Contents\\nOPERA TING SEGMENTS\\nAs dis... "
- ]
- },
- "execution_count": 12,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# execute the query with RedisVL\n",
- "result=index.query(vector_query)\n",
- "\n",
- "# view the results\n",
- "pd.DataFrame(result)"
+ "data": {
+ "text/plain": [
+ "'*=>[KNN 3 @text_embedding $vector AS vector_distance] RETURN 3 chunk_id content vector_distance SORTBY vector_distance ASC DIALECT 2 LIMIT 0 3'"
]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "execution_count": 12
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 143
},
+ "id": "5reL5qTW1ujC",
+ "outputId": "dd58f191-54f5-4226-c4e1-70207d58f2dc",
+ "ExecuteTime": {
+ "end_time": "2025-04-24T16:46:57.008139Z",
+ "start_time": "2025-04-24T16:46:56.999381Z"
+ }
+ },
+ "source": [
+ "# execute the query with RedisVL\n",
+ "result=index.query(vector_query)\n",
+ "\n",
+ "# view the results\n",
+ "pd.DataFrame(result)"
+ ],
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 13,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "rZrcd6n7T6LE",
- "outputId": "fad67a63-76bd-43b9-f62b-b1842ba47605"
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "87 0.334264576435\n",
- "79 0.350993335247\n",
- "86 0.371814072132\n"
- ]
- }
+ "data": {
+ "text/plain": [
+ " id vector_distance chunk_id \\\n",
+ "0 chunk:88 0.337694525719 88 \n",
+ "1 chunk:80 0.34205275774 80 \n",
+ "2 chunk:87 0.357761025429 87 \n",
+ "\n",
+ " content \n",
+ "0 Asia Pacific & Latin America 1,932 1,896 2 % 1... \n",
+ "1 Table of Contents\\nCONSOLIDATED OPERATING RESU... \n",
+ "2 Table of Contents\\nOPERATING SEGMENTS\\nAs disc... "
],
- "source": [
- "# paginate through results\n",
- "for result in index.paginate(vector_query, page_size=1):\n",
- " print(result[0][\"chunk_id\"], result[0][\"vector_distance\"], flush=True)"
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " vector_distance | \n",
+ " chunk_id | \n",
+ " content | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " chunk:88 | \n",
+ " 0.337694525719 | \n",
+ " 88 | \n",
+ " Asia Pacific & Latin America 1,932 1,896 2 % 1... | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " chunk:80 | \n",
+ " 0.34205275774 | \n",
+ " 80 | \n",
+ " Table of Contents\\nCONSOLIDATED OPERATING RESU... | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " chunk:87 | \n",
+ " 0.357761025429 | \n",
+ " 87 | \n",
+ " Table of Contents\\nOPERATING SEGMENTS\\nAs disc... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "execution_count": 13
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
},
+ "id": "rZrcd6n7T6LE",
+ "outputId": "fad67a63-76bd-43b9-f62b-b1842ba47605",
+ "ExecuteTime": {
+ "end_time": "2025-04-24T16:46:57.075644Z",
+ "start_time": "2025-04-24T16:46:57.067304Z"
+ }
+ },
+ "source": [
+ "# paginate through results\n",
+ "for result in index.paginate(vector_query, page_size=1):\n",
+ " print(result[0][\"chunk_id\"], result[0][\"vector_distance\"], flush=True)"
+ ],
+ "outputs": [
{
- "cell_type": "markdown",
- "metadata": {
- "id": "0Ap6WqPLT6LE"
- },
- "source": [
- "### Sort by alternative fields"
- ]
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "88 0.337694525719\n",
+ "80 0.34205275774\n",
+ "87 0.357761025429\n"
+ ]
+ }
+ ],
+ "execution_count": 14
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "0Ap6WqPLT6LE"
+ },
+ "source": [
+ "### Sort by alternative fields"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 175
},
+ "id": "daLVm6OkLn9T",
+ "outputId": "d77dfc4c-d451-4bf5-91c3-2155232570b9",
+ "ExecuteTime": {
+ "end_time": "2025-04-24T16:46:57.172397Z",
+ "start_time": "2025-04-24T16:46:57.167834Z"
+ }
+ },
+ "source": [
+ "# Sort by chunk_id field after vector search limits to topK\n",
+ "vector_query = VectorQuery(\n",
+ " vector=query_embedding,\n",
+ " vector_field_name=\"text_embedding\",\n",
+ " num_results=4,\n",
+ " return_fields=[\"chunk_id\"],\n",
+ " return_score=True\n",
+ ")\n",
+ "\n",
+ "# Decompose vector_query into the core query and the params\n",
+ "query = vector_query.query\n",
+ "params = vector_query.params\n",
+ "\n",
+ "# Pass query and params direct to index.search()\n",
+ "result = index.search(\n",
+ " query.sort_by(\"chunk_id\", asc=True),\n",
+ " params\n",
+ ")\n",
+ "\n",
+ "pd.DataFrame([doc.__dict__ for doc in result.docs])\n"
+ ],
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 14,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 175
- },
- "id": "daLVm6OkLn9T",
- "outputId": "d77dfc4c-d451-4bf5-91c3-2155232570b9"
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " id | \n",
- " payload | \n",
- " vector_distance | \n",
- " chunk_id | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " chunk:79 | \n",
- " None | \n",
- " 0.350993335247 | \n",
- " 79 | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " chunk:82 | \n",
- " None | \n",
- " 0.378765702248 | \n",
- " 82 | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " chunk:86 | \n",
- " None | \n",
- " 0.371814072132 | \n",
- " 86 | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " chunk:87 | \n",
- " None | \n",
- " 0.334264576435 | \n",
- " 87 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " id payload vector_distance chunk_id\n",
- "0 chunk:79 None 0.350993335247 79\n",
- "1 chunk:82 None 0.378765702248 82\n",
- "2 chunk:86 None 0.371814072132 86\n",
- "3 chunk:87 None 0.334264576435 87"
- ]
- },
- "execution_count": 14,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/plain": [
+ " id payload vector_distance chunk_id\n",
+ "0 chunk:80 None 0.34205275774 80\n",
+ "1 chunk:83 None 0.378765881062 83\n",
+ "2 chunk:87 None 0.357761025429 87\n",
+ "3 chunk:88 None 0.337694525719 88"
],
- "source": [
- "# Sort by chunk_id field after vector search limits to topK\n",
- "vector_query = VectorQuery(\n",
- " vector=query_embedding,\n",
- " vector_field_name=\"text_embedding\",\n",
- " num_results=4,\n",
- " return_fields=[\"chunk_id\"],\n",
- " return_score=True\n",
- ")\n",
- "\n",
- "# Decompose vector_query into the core query and the params\n",
- "query = vector_query.query\n",
- "params = vector_query.params\n",
- "\n",
- "# Pass query and params direct to index.search()\n",
- "result = index.search(\n",
- " query.sort_by(\"chunk_id\", asc=True),\n",
- " params\n",
- ")\n",
- "\n",
- "pd.DataFrame([doc.__dict__ for doc in result.docs])\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "81PoXomtT6LF"
- },
- "source": [
- "### Add filters to vector queries"
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " payload | \n",
+ " vector_distance | \n",
+ " chunk_id | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " chunk:80 | \n",
+ " None | \n",
+ " 0.34205275774 | \n",
+ " 80 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " chunk:83 | \n",
+ " None | \n",
+ " 0.378765881062 | \n",
+ " 83 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " chunk:87 | \n",
+ " None | \n",
+ " 0.357761025429 | \n",
+ " 87 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " chunk:88 | \n",
+ " None | \n",
+ " 0.337694525719 | \n",
+ " 88 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "execution_count": 15
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "81PoXomtT6LF"
+ },
+ "source": [
+ "### Add filters to vector queries"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 175
},
+ "id": "a11G3xXJ1ujC",
+ "outputId": "d968add5-704d-4e22-d3bd-97c1d1103a75",
+ "ExecuteTime": {
+ "end_time": "2025-04-24T16:46:57.279677Z",
+ "start_time": "2025-04-24T16:46:57.274997Z"
+ }
+ },
+ "source": [
+ "from redisvl.query.filter import Text\n",
+ "\n",
+ "vector_query = VectorQuery(\n",
+ " vector=query_embedding,\n",
+ " vector_field_name=\"text_embedding\",\n",
+ " num_results=4,\n",
+ " return_fields=[\"content\"],\n",
+ " return_score=True\n",
+ ")\n",
+ "\n",
+ "# Set a text filter\n",
+ "text_filter = Text(\"content\") % \"profit\"\n",
+ "\n",
+ "vector_query.set_filter(text_filter)\n",
+ "\n",
+ "result=index.query(vector_query)\n",
+ "pd.DataFrame(result)"
+ ],
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 15,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 175
- },
- "id": "a11G3xXJ1ujC",
- "outputId": "d968add5-704d-4e22-d3bd-97c1d1103a75"
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " id | \n",
- " vector_distance | \n",
- " content | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " chunk:82 | \n",
- " 0.378765702248 | \n",
- " Table of Contents\\nGROSS MARGIN\\nFISCAL 2023 C... | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " chunk:128 | \n",
- " 0.418757259846 | \n",
- " Table of Contents\\nNIKE, INC.\\nCONSOLIDATED ST... | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " chunk:72 | \n",
- " 0.466709017754 | \n",
- " Table of Contents\\nITEM 7. MANAGEM ENT'S DISCU... | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " chunk:62 | \n",
- " 0.493393957615 | \n",
- " existing businesses, such as our NIKE Direct o... | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " id vector_distance \\\n",
- "0 chunk:82 0.378765702248 \n",
- "1 chunk:128 0.418757259846 \n",
- "2 chunk:72 0.466709017754 \n",
- "3 chunk:62 0.493393957615 \n",
- "\n",
- " content \n",
- "0 Table of Contents\\nGROSS MARGIN\\nFISCAL 2023 C... \n",
- "1 Table of Contents\\nNIKE, INC.\\nCONSOLIDATED ST... \n",
- "2 Table of Contents\\nITEM 7. MANAGEM ENT'S DISCU... \n",
- "3 existing businesses, such as our NIKE Direct o... "
- ]
- },
- "execution_count": 15,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/plain": [
+ " id vector_distance \\\n",
+ "0 chunk:83 0.378765881062 \n",
+ "1 chunk:129 0.418757200241 \n",
+ "2 chunk:73 0.465415120125 \n",
+ "3 chunk:63 0.49339401722 \n",
+ "\n",
+ " content \n",
+ "0 Table of Contents\\nGROSS MARGIN\\nFISCAL 2023 C... \n",
+ "1 Table of Contents\\nNIKE, INC.\\nCONSOLIDATED ST... \n",
+ "2 Table of Contents\\nITEM 7. MANAGEMENT'S DISCUS... \n",
+ "3 existing businesses, such as our NIKE Direct o... "
],
- "source": [
- "from redisvl.query.filter import Text\n",
- "\n",
- "vector_query = VectorQuery(\n",
- " vector=query_embedding,\n",
- " vector_field_name=\"text_embedding\",\n",
- " num_results=4,\n",
- " return_fields=[\"content\"],\n",
- " return_score=True\n",
- ")\n",
- "\n",
- "# Set a text filter\n",
- "text_filter = Text(\"content\") % \"profit\"\n",
- "\n",
- "vector_query.set_filter(text_filter)\n",
- "\n",
- "result=index.query(vector_query)\n",
- "pd.DataFrame(result)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "5XvVv8zAT6LF"
- },
- "source": [
- "### Range queries in RedisVL"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 16,
- "metadata": {
- "id": "bCffoZRx1ujD"
- },
- "outputs": [],
- "source": [
- "from redisvl.query import RangeQuery\n",
- "\n",
- "range_query = RangeQuery(\n",
- " vector=query_embedding,\n",
- " vector_field_name=\"text_embedding\",\n",
- " num_results=4,\n",
- " return_fields=[\"content\"],\n",
- " return_score=True,\n",
- " distance_threshold=0.8 # find all items with a semantic distance of less than 0.8\n",
- ")"
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " vector_distance | \n",
+ " content | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " chunk:83 | \n",
+ " 0.378765881062 | \n",
+ " Table of Contents\\nGROSS MARGIN\\nFISCAL 2023 C... | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " chunk:129 | \n",
+ " 0.418757200241 | \n",
+ " Table of Contents\\nNIKE, INC.\\nCONSOLIDATED ST... | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " chunk:73 | \n",
+ " 0.465415120125 | \n",
+ " Table of Contents\\nITEM 7. MANAGEMENT'S DISCUS... | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " chunk:63 | \n",
+ " 0.49339401722 | \n",
+ " existing businesses, such as our NIKE Direct o... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "execution_count": 16
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "5XvVv8zAT6LF"
+ },
+ "source": [
+ "### Range queries in RedisVL"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "bCffoZRx1ujD",
+ "ExecuteTime": {
+ "end_time": "2025-04-24T16:46:57.391116Z",
+ "start_time": "2025-04-24T16:46:57.389349Z"
+ }
+ },
+ "source": [
+ "from redisvl.query import RangeQuery\n",
+ "\n",
+ "range_query = RangeQuery(\n",
+ " vector=query_embedding,\n",
+ " vector_field_name=\"text_embedding\",\n",
+ " num_results=4,\n",
+ " return_fields=[\"content\"],\n",
+ " return_score=True,\n",
+ " distance_threshold=0.8 # find all items with a semantic distance of less than 0.8\n",
+ ")"
+ ],
+ "outputs": [],
+ "execution_count": 17
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 175
},
+ "id": "0gHmam1Q1ujD",
+ "outputId": "ac80a6ed-4eb8-44d3-881d-87c9271aa10e",
+ "ExecuteTime": {
+ "end_time": "2025-04-24T16:46:57.499232Z",
+ "start_time": "2025-04-24T16:46:57.494328Z"
+ }
+ },
+ "source": [
+ "result=index.query(range_query)\n",
+ "pd.DataFrame(result)"
+ ],
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 17,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 175
- },
- "id": "0gHmam1Q1ujD",
- "outputId": "ac80a6ed-4eb8-44d3-881d-87c9271aa10e"
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " id | \n",
- " vector_distance | \n",
- " content | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " chunk:87 | \n",
- " 0.334264576435 | \n",
- " Asia Pacific & Latin America 1,932 1,896 2 % 1... | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " chunk:79 | \n",
- " 0.350993335247 | \n",
- " Table of Contents\\nCONSOLIDA TED OPERA TING RE... | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " chunk:86 | \n",
- " 0.371814072132 | \n",
- " Table of Contents\\nOPERA TING SEGMENTS\\nAs dis... | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " chunk:82 | \n",
- " 0.378765702248 | \n",
- " Table of Contents\\nGROSS MARGIN\\nFISCAL 2023 C... | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " id vector_distance content\n",
- "0 chunk:87 0.334264576435 Asia Pacific & Latin America 1,932 1,896 2 % 1...\n",
- "1 chunk:79 0.350993335247 Table of Contents\\nCONSOLIDA TED OPERA TING RE...\n",
- "2 chunk:86 0.371814072132 Table of Contents\\nOPERA TING SEGMENTS\\nAs dis...\n",
- "3 chunk:82 0.378765702248 Table of Contents\\nGROSS MARGIN\\nFISCAL 2023 C..."
- ]
- },
- "execution_count": 17,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/plain": [
+ " id vector_distance content\n",
+ "0 chunk:88 0.337694525719 Asia Pacific & Latin America 1,932 1,896 2 % 1...\n",
+ "1 chunk:80 0.34205275774 Table of Contents\\nCONSOLIDATED OPERATING RESU...\n",
+ "2 chunk:87 0.357761025429 Table of Contents\\nOPERATING SEGMENTS\\nAs disc...\n",
+ "3 chunk:83 0.378765881062 Table of Contents\\nGROSS MARGIN\\nFISCAL 2023 C..."
],
- "source": [
- "result=index.query(range_query)\n",
- "pd.DataFrame(result)"
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " vector_distance | \n",
+ " content | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " chunk:88 | \n",
+ " 0.337694525719 | \n",
+ " Asia Pacific & Latin America 1,932 1,896 2 % 1... | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " chunk:80 | \n",
+ " 0.34205275774 | \n",
+ " Table of Contents\\nCONSOLIDATED OPERATING RESU... | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " chunk:87 | \n",
+ " 0.357761025429 | \n",
+ " Table of Contents\\nOPERATING SEGMENTS\\nAs disc... | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " chunk:83 | \n",
+ " 0.378765881062 | \n",
+ " Table of Contents\\nGROSS MARGIN\\nFISCAL 2023 C... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "execution_count": 18
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 175
},
+ "id": "YZg4U21r1ujD",
+ "outputId": "d3db5ac3-6ae9-42c4-aaee-874cecafe3ad",
+ "ExecuteTime": {
+ "end_time": "2025-04-24T16:46:57.667013Z",
+ "start_time": "2025-04-24T16:46:57.662153Z"
+ }
+ },
+ "source": [
+ "# Add filter to range query\n",
+ "range_query.set_filter(text_filter)\n",
+ "\n",
+ "index.query(range_query)\n",
+ "pd.DataFrame(result)"
+ ],
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 18,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 175
- },
- "id": "YZg4U21r1ujD",
- "outputId": "d3db5ac3-6ae9-42c4-aaee-874cecafe3ad"
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " id | \n",
- " vector_distance | \n",
- " content | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " chunk:87 | \n",
- " 0.334264576435 | \n",
- " Asia Pacific & Latin America 1,932 1,896 2 % 1... | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " chunk:79 | \n",
- " 0.350993335247 | \n",
- " Table of Contents\\nCONSOLIDA TED OPERA TING RE... | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " chunk:86 | \n",
- " 0.371814072132 | \n",
- " Table of Contents\\nOPERA TING SEGMENTS\\nAs dis... | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " chunk:82 | \n",
- " 0.378765702248 | \n",
- " Table of Contents\\nGROSS MARGIN\\nFISCAL 2023 C... | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " id vector_distance content\n",
- "0 chunk:87 0.334264576435 Asia Pacific & Latin America 1,932 1,896 2 % 1...\n",
- "1 chunk:79 0.350993335247 Table of Contents\\nCONSOLIDA TED OPERA TING RE...\n",
- "2 chunk:86 0.371814072132 Table of Contents\\nOPERA TING SEGMENTS\\nAs dis...\n",
- "3 chunk:82 0.378765702248 Table of Contents\\nGROSS MARGIN\\nFISCAL 2023 C..."
- ]
- },
- "execution_count": 18,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/plain": [
+ " id vector_distance content\n",
+ "0 chunk:88 0.337694525719 Asia Pacific & Latin America 1,932 1,896 2 % 1...\n",
+ "1 chunk:80 0.34205275774 Table of Contents\\nCONSOLIDATED OPERATING RESU...\n",
+ "2 chunk:87 0.357761025429 Table of Contents\\nOPERATING SEGMENTS\\nAs disc...\n",
+ "3 chunk:83 0.378765881062 Table of Contents\\nGROSS MARGIN\\nFISCAL 2023 C..."
],
- "source": [
- "# Add filter to range query\n",
- "range_query.set_filter(text_filter)\n",
- "\n",
- "index.query(range_query)\n",
- "pd.DataFrame(result)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "zYYPTQN7T6LG"
- },
- "source": [
- "## Building a basic RAG Pipeline from Scratch\n",
- "We're going to build a basic RAG pipeline from scratch incorporating the following components:\n",
- "\n",
- "- Standard semantic search\n",
- "- Integration with OpenAI for LLM\n",
- "- Chat completion"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "rCWlVR2OT6LG"
- },
- "source": [
- "### Setup RedisVL AsyncSearchIndex"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "_esLGYzbT6LG",
- "outputId": "d3314a08-8746-4239-dcb2-e7e41b51c640"
- },
- "outputs": [],
- "source": [
- "from redisvl.index import AsyncSearchIndex\n",
- "\n",
- "async_index = AsyncSearchIndex.from_dict(schema, redis_url=REDIS_URL)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "peK4C5xGJkED"
- },
- "source": [
- "### Setup OpenAI API"
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " vector_distance | \n",
+ " content | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " chunk:88 | \n",
+ " 0.337694525719 | \n",
+ " Asia Pacific & Latin America 1,932 1,896 2 % 1... | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " chunk:80 | \n",
+ " 0.34205275774 | \n",
+ " Table of Contents\\nCONSOLIDATED OPERATING RESU... | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " chunk:87 | \n",
+ " 0.357761025429 | \n",
+ " Table of Contents\\nOPERATING SEGMENTS\\nAs disc... | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " chunk:83 | \n",
+ " 0.378765881062 | \n",
+ " Table of Contents\\nGROSS MARGIN\\nFISCAL 2023 C... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "execution_count": 19
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "zYYPTQN7T6LG"
+ },
+ "source": [
+ "## Building a basic RAG Pipeline from Scratch\n",
+ "We're going to build a basic RAG pipeline from scratch incorporating the following components:\n",
+ "\n",
+ "- Standard semantic search\n",
+ "- Integration with OpenAI for LLM\n",
+ "- Chat completion"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "rCWlVR2OT6LG"
+ },
+ "source": [
+ "### Setup RedisVL AsyncSearchIndex"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
},
- {
- "cell_type": "code",
- "execution_count": 20,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "EgdTvz6zJkED",
- "outputId": "d2ab0e8e-2ecf-458d-881d-6e4658953a71"
- },
- "outputs": [],
- "source": [
- "import openai\n",
- "import os\n",
- "import getpass\n",
- "\n",
- "\n",
- "CHAT_MODEL = \"gpt-3.5-turbo-0125\"\n",
- "\n",
- "if \"OPENAI_API_KEY\" not in os.environ:\n",
- " os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"OPENAI_API_KEY :\")\n"
- ]
+ "id": "_esLGYzbT6LG",
+ "outputId": "d3314a08-8746-4239-dcb2-e7e41b51c640",
+ "ExecuteTime": {
+ "end_time": "2025-04-24T16:46:57.734454Z",
+ "start_time": "2025-04-24T16:46:57.732810Z"
+ }
+ },
+ "source": [
+ "from redisvl.index import AsyncSearchIndex\n",
+ "\n",
+ "async_index = AsyncSearchIndex.from_dict(schema, redis_url=REDIS_URL)"
+ ],
+ "outputs": [],
+ "execution_count": 20
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "peK4C5xGJkED"
+ },
+ "source": [
+ "### Setup OpenAI API"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
},
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "w8Af-zneT6LH"
- },
- "source": [
- "### Baseline Retrieval Augmented Generation\n",
- "The code below answers a user's questions following this basic flow:\n",
- "\n",
- "1. Generate a query_vector from the user's chat question to have an apples to apples comparison against the vector database.\n",
- "2. Retrieve the most semantically relevant chunks to the user's query from the database.\n",
- "3. Pass the user query and retrieved context to the `promptify` function to generate the final prompt to be sent to the LLM along with the system prompt and necessary hyperparameters.\n",
- "4. Return the LLMs response to the user."
- ]
+ "id": "EgdTvz6zJkED",
+ "outputId": "d2ab0e8e-2ecf-458d-881d-6e4658953a71",
+ "ExecuteTime": {
+ "end_time": "2025-04-24T16:47:12.289527Z",
+ "start_time": "2025-04-24T16:46:57.837857Z"
+ }
+ },
+ "source": [
+ "import openai\n",
+ "import os\n",
+ "import getpass\n",
+ "\n",
+ "\n",
+ "CHAT_MODEL = \"gpt-3.5-turbo-0125\"\n",
+ "\n",
+ "if \"OPENAI_API_KEY\" not in os.environ:\n",
+ " os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"OPENAI_API_KEY :\")\n"
+ ],
+ "outputs": [],
+ "execution_count": 21
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "w8Af-zneT6LH"
+ },
+ "source": [
+ "### Baseline Retrieval Augmented Generation\n",
+ "The code below answers a user's questions following this basic flow:\n",
+ "\n",
+ "1. Generate a query_vector from the user's chat question to have an apples to apples comparison against the vector database.\n",
+ "2. Retrieve the most semantically relevant chunks to the user's query from the database.\n",
+ "3. Pass the user query and retrieved context to the `promptify` function to generate the final prompt to be sent to the LLM along with the system prompt and necessary hyperparameters.\n",
+ "4. Return the LLMs response to the user."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "1V1Tio4-ZjmA",
+ "ExecuteTime": {
+ "end_time": "2025-04-24T16:47:12.308509Z",
+ "start_time": "2025-04-24T16:47:12.303243Z"
+ }
+ },
+ "source": [
+ "\n",
+ "async def answer_question(index: AsyncSearchIndex, query: str):\n",
+ " \"\"\"Answer the user's question\"\"\"\n",
+ "\n",
+ " SYSTEM_PROMPT = \"\"\"You are a helpful financial analyst assistant that has access\n",
+ " to public financial 10k documents in order to answer users questions about company\n",
+ " performance, ethics, characteristics, and core information.\n",
+ " \"\"\"\n",
+ "\n",
+ " query_vector = hf.embed(query)\n",
+ " # Fetch context from Redis using vector search\n",
+ " context = await retrieve_context(index, query_vector)\n",
+ " # Generate contextualized prompt and feed to OpenAI\n",
+ " response = await openai.AsyncClient().chat.completions.create(\n",
+ " model=CHAT_MODEL,\n",
+ " messages=[\n",
+ " {\"role\": \"system\", \"content\": SYSTEM_PROMPT},\n",
+ " {\"role\": \"user\", \"content\": promptify(query, context)}\n",
+ " ],\n",
+ " temperature=0.1,\n",
+ " seed=42\n",
+ " )\n",
+ " # Response provided by LLM\n",
+ " return response.choices[0].message.content\n",
+ "\n",
+ "\n",
+ "async def retrieve_context(async_index: AsyncSearchIndex, query_vector) -> str:\n",
+ " \"\"\"Fetch the relevant context from Redis using vector search\"\"\"\n",
+ " results = await async_index.query(\n",
+ " VectorQuery(\n",
+ " vector=query_vector,\n",
+ " vector_field_name=\"text_embedding\",\n",
+ " return_fields=[\"content\"],\n",
+ " num_results=3\n",
+ " )\n",
+ " )\n",
+ " content = \"\\n\".join([result[\"content\"] for result in results])\n",
+ " return content\n",
+ "\n",
+ "\n",
+ "def promptify(query: str, context: str) -> str:\n",
+ " return f'''Use the provided context below derived from public financial\n",
+ " documents to answer the user's question. If you can't answer the user's\n",
+ " question, based on the context; do not guess. If there is no context at all,\n",
+ " respond with \"I don't know\".\n",
+ "\n",
+ " User question:\n",
+ "\n",
+ " {query}\n",
+ "\n",
+ " Helpful context:\n",
+ "\n",
+ " {context}\n",
+ "\n",
+ " Answer:\n",
+ " '''"
+ ],
+ "outputs": [],
+ "execution_count": 22
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "kgVM_g01T6LP"
+ },
+ "source": [
+ "### Let's test it out..."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "pn-PoACdbihY",
+ "ExecuteTime": {
+ "end_time": "2025-04-24T16:47:12.339354Z",
+ "start_time": "2025-04-24T16:47:12.337769Z"
+ }
+ },
+ "source": [
+ "# Generate a list of questions\n",
+ "questions = [\n",
+ " \"What is the trend in the company's revenue and profit over the past few years?\",\n",
+ " \"What are the company's primary revenue sources?\",\n",
+ " \"How much debt does the company have, and what are its capital expenditure plans?\",\n",
+ " \"What does the company say about its environmental, social, and governance (ESG) practices?\",\n",
+ " \"What is the company's strategy for growth?\"\n",
+ "]"
+ ],
+ "outputs": [],
+ "execution_count": 23
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 264,
+ "referenced_widgets": [
+ "22178a562935411f88cad67659ebb7c4",
+ "18c7d5708c124911b214199fedb2b642",
+ "905bc767c24447dc96998d2c5f935776",
+ "3ad99e40e63d4443a80b2b579b32e972",
+ "648ff789b7e640978d79bb73afb8b935",
+ "d653f934619843e28c86c1548dfc6b58",
+ "9845ed85170a4ca1ac53e2e662ec9aa3",
+ "c23e1195ff58417cba20de29285b4f8d",
+ "13c9571c73de48388ffa93f602091320",
+ "52d9d383c841431198b7a53f14da59f1",
+ "ef2b758d4fc241d4becf2ff611954b7e",
+ "77c3e16292de4c0da1efe12946d59602",
+ "f699af42ec874895beb31960b5a7db38",
+ "df531bd2864648d3a3cd081f4395ea53",
+ "eaea17a6fc4e4ae08e8cdb1b894a75ee",
+ "e7653f4691f84722ac67ce2d2eea0c8c",
+ "0296317b893f4d61ba8dcd45fb02260e",
+ "d11dbe6f1f454b239104da75adde3ff4",
+ "53e352c2ac614b58a76b7ea01971b51c",
+ "6d6d0b5efd2149ada10a82e450d79a17",
+ "14433f774cab4e70a984afee44780630",
+ "d720cffbcc444daabf7105d7f46bb738",
+ "083963c0130a4e0f9f8b1123495d2c94",
+ "37f2fb1531d843ca9af8c418b156df0f",
+ "8a9447ddaef84d18b69597c77d13cdab",
+ "4be0f4750d7744bda6bdf9e09efc6e83",
+ "6f77af81f9d7483eb2d9764083a28936",
+ "a77bb82fc74643c5961ad0683719bcc7",
+ "592ad30fe72141e099335a37f2b5d65f",
+ "08a93f48e2ae40dd83c76c02dde1a581",
+ "d865aa9825cc46248db4591bd7eb8202",
+ "c06a936e3f0f4e1d98b886d7b587eb89",
+ "d193499ece3b4e81a4deda0c843d980d",
+ "3ca7831ca79940c9bb1a34b8ef8f763c",
+ "db0773b8f5864b68a2ce8357a09d8012",
+ "06ef9cbf630b445cabe4ad026642f568",
+ "6901df439dbf4b2180d24ad62e9db4f4",
+ "2db40294cdc8476bae1eebb1c85d86fa",
+ "c2a875b112014ea1a88e28fb1d887ccf",
+ "4474549702694f8e87639d19d50498fd",
+ "92480b75b5ac45e2bf7e55ce5c89daaf",
+ "ffd337d71aaf4e1c92c5b53987aa7c72",
+ "21e53784d9154c0f9e0755dd7db64b01",
+ "394450e19075459ba59f53d4f11e21c2",
+ "9d386da534e24c7fa7f26f2c7f6a2d17",
+ "fcda6a6a2e8b4df0b5540e707ad486eb",
+ "37e0240a1d0c4503afd28b0072168c15",
+ "eb4f7add5c074781b7e9d104969c3564",
+ "ffab83c3d271402197ecc4b51225411b",
+ "c7b5d06f461c4ce9a089851c75647544",
+ "c7c362eaa7ea4174b1dd64377445a4b3",
+ "38dd0aae016e4bc48026d0ee30fb807a",
+ "b0de69c2826d4a0ba34b7d7cbce4ff6e",
+ "1b2721602abf42e1bb4d29fb3605644f",
+ "fe546bd8269d48eba90fb932784eea43"
+ ]
},
- {
- "cell_type": "code",
- "execution_count": 21,
- "metadata": {
- "id": "1V1Tio4-ZjmA"
- },
- "outputs": [],
- "source": [
- "\n",
- "async def answer_question(index: AsyncSearchIndex, query: str):\n",
- " \"\"\"Answer the user's question\"\"\"\n",
- "\n",
- " SYSTEM_PROMPT = \"\"\"You are a helpful financial analyst assistant that has access\n",
- " to public financial 10k documents in order to answer users questions about company\n",
- " performance, ethics, characteristics, and core information.\n",
- " \"\"\"\n",
- "\n",
- " query_vector = hf.embed(query)\n",
- " # Fetch context from Redis using vector search\n",
- " context = await retrieve_context(index, query_vector)\n",
- " # Generate contextualized prompt and feed to OpenAI\n",
- " response = await openai.AsyncClient().chat.completions.create(\n",
- " model=CHAT_MODEL,\n",
- " messages=[\n",
- " {\"role\": \"system\", \"content\": SYSTEM_PROMPT},\n",
- " {\"role\": \"user\", \"content\": promptify(query, context)}\n",
- " ],\n",
- " temperature=0.1,\n",
- " seed=42\n",
- " )\n",
- " # Response provided by LLM\n",
- " return response.choices[0].message.content\n",
- "\n",
- "\n",
- "async def retrieve_context(async_index: AsyncSearchIndex, query_vector) -> str:\n",
- " \"\"\"Fetch the relevant context from Redis using vector search\"\"\"\n",
- " results = await async_index.query(\n",
- " VectorQuery(\n",
- " vector=query_vector,\n",
- " vector_field_name=\"text_embedding\",\n",
- " return_fields=[\"content\"],\n",
- " num_results=3\n",
- " )\n",
- " )\n",
- " content = \"\\n\".join([result[\"content\"] for result in results])\n",
- " return content\n",
- "\n",
- "\n",
- "def promptify(query: str, context: str) -> str:\n",
- " return f'''Use the provided context below derived from public financial\n",
- " documents to answer the user's question. If you can't answer the user's\n",
- " question, based on the context; do not guess. If there is no context at all,\n",
- " respond with \"I don't know\".\n",
- "\n",
- " User question:\n",
- "\n",
- " {query}\n",
- "\n",
- " Helpful context:\n",
- "\n",
- " {context}\n",
- "\n",
- " Answer:\n",
- " '''"
- ]
+ "id": "9M_iU6_hbv0J",
+ "outputId": "b9fc43d9-883a-4795-8a37-8a2f4c545892",
+ "ExecuteTime": {
+ "end_time": "2025-04-24T16:47:20.587275Z",
+ "start_time": "2025-04-24T16:47:12.352722Z"
+ }
+ },
+ "source": [
+ "import asyncio\n",
+ "\n",
+ "results = await asyncio.gather(*[\n",
+ " answer_question(async_index, question) for question in questions\n",
+ "])"
+ ],
+ "outputs": [],
+ "execution_count": 24
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "CpQ59SRgJkED"
+ },
+ "source": [
+ "### Let's view the results"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
},
+ "id": "7SZM_xg3b9Gb",
+ "outputId": "758ae31a-2291-4191-aa57-ee941d3319cb",
+ "ExecuteTime": {
+ "end_time": "2025-04-24T16:47:20.604843Z",
+ "start_time": "2025-04-24T16:47:20.602566Z"
+ }
+ },
+ "source": [
+ "for i, r in enumerate(results):\n",
+ " print(f\"Question: {questions[i]}\")\n",
+ " print(f\"Answer: \\n {r}\", \"\\n-----------\\n\")"
+ ],
+ "outputs": [
{
- "cell_type": "markdown",
- "metadata": {
- "id": "kgVM_g01T6LP"
- },
- "source": [
- "### Let's test it out..."
- ]
- },
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Question: What is the trend in the company's revenue and profit over the past few years?\n",
+ "Answer: \n",
+ " The trend in the company's revenue and profit over the past few years is as follows:\n",
+ "\n",
+ "- Revenue:\n",
+ " - Fiscal Year 2023: Total revenue for Nike, Inc. was $51,217 million, showing a 10% increase from the previous year.\n",
+ " - Fiscal Year 2022: Total revenue for Nike, Inc. was $46,710 million, showing a 10% increase from the year before.\n",
+ " - Fiscal Year 2021: Total revenue for Nike, Inc. was $44,538 million.\n",
+ "\n",
+ "- Profit (EBIT):\n",
+ " - Fiscal Year 2023: EBIT for Nike, Inc. was not provided in the context.\n",
+ " - Fiscal Year 2022: EBIT for Nike, Inc. was not provided in the context.\n",
+ " - Fiscal Year 2021: EBIT for Nike, Inc. was not provided in the context.\n",
+ "\n",
+ "Based on the revenue figures provided, there has been a consistent increase in revenue for Nike, Inc. over the past few years. However, without the EBIT figures, we cannot determine the trend in profit over the same period. \n",
+ "-----------\n",
+ "\n",
+ "Question: What are the company's primary revenue sources?\n",
+ "Answer: \n",
+ " The company's primary revenue sources are as follows:\n",
+ "\n",
+ "1. Footwear\n",
+ "2. Apparel\n",
+ "3. Equipment\n",
+ "4. Other (including licensing and miscellaneous revenues)\n",
+ "\n",
+ "These revenues are further broken down by sales to wholesale customers, sales through direct to consumer channels, and other sources. \n",
+ "-----------\n",
+ "\n",
+ "Question: How much debt does the company have, and what are its capital expenditure plans?\n",
+ "Answer: \n",
+ " The company has a total long-term debt of $8,927 million as of May 31, 2023. The capital expenditure plans are not explicitly mentioned in the provided context. \n",
+ "-----------\n",
+ "\n",
+ "Question: What does the company say about its environmental, social, and governance (ESG) practices?\n",
+ "Answer: \n",
+ " The company acknowledges the increased focus on sustainability matters, responsible sourcing, deforestation, energy and water usage, and packaging recyclability. They mention that complying with legislative and regulatory initiatives related to climate change may increase costs and complexity. The company has announced sustainability-related goals and targets, but there are risks and uncertainties associated with achieving them. They highlight that failure to meet these goals or respond to new legal requirements could result in adverse publicity and impact their business and reputation. \n",
+ "-----------\n",
+ "\n",
+ "Question: What is the company's strategy for growth?\n",
+ "Answer: \n",
+ " Based on the provided financial data, it appears that the company's strategy for growth includes focusing on expanding its revenues across different geographic regions and product lines. The company has shown consistent growth in revenues over the years, with increases in all major segments such as North America, Europe, Middle East & Africa, Greater China, and Asia Pacific & Latin America. Additionally, the company has been investing in property, plant, and equipment to support its growth, as evidenced by the increasing additions to these assets over the years. Furthermore, the company's strategy includes a mix of sales to wholesale customers and direct-to-consumer sales channels to drive revenue growth. \n",
+ "-----------\n",
+ "\n"
+ ]
+ }
+ ],
+ "execution_count": 25
+ },
+ {
+ "metadata": {},
+ "cell_type": "markdown",
+ "source": "### Improve performance and cut costs with LLM caching"
+ },
+ {
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2025-04-24T16:47:20.654925Z",
+ "start_time": "2025-04-24T16:47:20.639324Z"
+ }
+ },
+ "cell_type": "code",
+ "source": [
+ "from redisvl.extensions.llmcache import SemanticCache\n",
+ "\n",
+ "llmcache = SemanticCache(\n",
+ " name=\"llmcache\",\n",
+ " vectorizer=hf,\n",
+ " redis_url=REDIS_URL,\n",
+ " ttl=120,\n",
+ " distance_threshold=0.2,\n",
+ " overwrite=True,\n",
+ ")"
+ ],
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 22,
- "metadata": {
- "id": "pn-PoACdbihY"
- },
- "outputs": [],
- "source": [
- "# Generate a list of questions\n",
- "questions = [\n",
- " \"What is the trend in the company's revenue and profit over the past few years?\",\n",
- " \"What are the company's primary revenue sources?\",\n",
- " \"How much debt does the company have, and what are its capital expenditure plans?\",\n",
- " \"What does the company say about its environmental, social, and governance (ESG) practices?\",\n",
- " \"What is the company's strategy for growth?\"\n",
- "]"
- ]
- },
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "09:47:20 redisvl.index.index INFO Index already exists, overwriting.\n"
+ ]
+ }
+ ],
+ "execution_count": 26
+ },
+ {
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2025-04-24T16:47:20.666720Z",
+ "start_time": "2025-04-24T16:47:20.664080Z"
+ }
+ },
+ "cell_type": "code",
+ "source": [
+ "from functools import wraps\n",
+ "\n",
+ "# Create an LLM caching decorator\n",
+ "def cache(func):\n",
+ " @wraps(func)\n",
+ " async def wrapper(index, query_text, *args, **kwargs):\n",
+ " query_vector = llmcache._vectorizer.embed(query_text)\n",
+ "\n",
+ " # Check the cache with the vector\n",
+ " if result := llmcache.check(vector=query_vector):\n",
+ " print(\"Cache hit!\")\n",
+ " return result[0]['response']\n",
+ "\n",
+ " response = await func(index, query_text, query_vector=query_vector)\n",
+ " llmcache.store(query_text, response, query_vector)\n",
+ " return response\n",
+ " return wrapper\n",
+ "\n",
+ "\n",
+ "@cache\n",
+ "async def answer_question(index: AsyncSearchIndex, query: str, **kwargs):\n",
+ " \"\"\"Answer the user's question\"\"\"\n",
+ "\n",
+ " SYSTEM_PROMPT = \"\"\"You are a helpful financial analyst assistant that has access\n",
+ " to public financial 10k documents in order to answer users questions about company\n",
+ " performance, ethics, characteristics, and core information.\n",
+ " \"\"\"\n",
+ "\n",
+ " context = await retrieve_context(index, kwargs[\"query_vector\"])\n",
+ " response = await openai.AsyncClient().chat.completions.create(\n",
+ " model=CHAT_MODEL,\n",
+ " messages=[\n",
+ " {\"role\": \"system\", \"content\": SYSTEM_PROMPT},\n",
+ " {\"role\": \"user\", \"content\": promptify(query, context)}\n",
+ " ],\n",
+ " temperature=0.1,\n",
+ " seed=42\n",
+ " )\n",
+ " # Response provided by GPT-3.5\n",
+ " return response.choices[0].message.content"
+ ],
+ "outputs": [],
+ "execution_count": 27
+ },
+ {
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2025-04-24T16:47:21.577338Z",
+ "start_time": "2025-04-24T16:47:20.691181Z"
+ }
+ },
+ "cell_type": "code",
+ "source": [
+ "# NBVAL_SKIP\n",
+ "query = \"What was Nike's revenue last year compared to this year??\"\n",
+ "\n",
+ "await answer_question(async_index, query)"
+ ],
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 23,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 264,
- "referenced_widgets": [
- "22178a562935411f88cad67659ebb7c4",
- "18c7d5708c124911b214199fedb2b642",
- "905bc767c24447dc96998d2c5f935776",
- "3ad99e40e63d4443a80b2b579b32e972",
- "648ff789b7e640978d79bb73afb8b935",
- "d653f934619843e28c86c1548dfc6b58",
- "9845ed85170a4ca1ac53e2e662ec9aa3",
- "c23e1195ff58417cba20de29285b4f8d",
- "13c9571c73de48388ffa93f602091320",
- "52d9d383c841431198b7a53f14da59f1",
- "ef2b758d4fc241d4becf2ff611954b7e",
- "77c3e16292de4c0da1efe12946d59602",
- "f699af42ec874895beb31960b5a7db38",
- "df531bd2864648d3a3cd081f4395ea53",
- "eaea17a6fc4e4ae08e8cdb1b894a75ee",
- "e7653f4691f84722ac67ce2d2eea0c8c",
- "0296317b893f4d61ba8dcd45fb02260e",
- "d11dbe6f1f454b239104da75adde3ff4",
- "53e352c2ac614b58a76b7ea01971b51c",
- "6d6d0b5efd2149ada10a82e450d79a17",
- "14433f774cab4e70a984afee44780630",
- "d720cffbcc444daabf7105d7f46bb738",
- "083963c0130a4e0f9f8b1123495d2c94",
- "37f2fb1531d843ca9af8c418b156df0f",
- "8a9447ddaef84d18b69597c77d13cdab",
- "4be0f4750d7744bda6bdf9e09efc6e83",
- "6f77af81f9d7483eb2d9764083a28936",
- "a77bb82fc74643c5961ad0683719bcc7",
- "592ad30fe72141e099335a37f2b5d65f",
- "08a93f48e2ae40dd83c76c02dde1a581",
- "d865aa9825cc46248db4591bd7eb8202",
- "c06a936e3f0f4e1d98b886d7b587eb89",
- "d193499ece3b4e81a4deda0c843d980d",
- "3ca7831ca79940c9bb1a34b8ef8f763c",
- "db0773b8f5864b68a2ce8357a09d8012",
- "06ef9cbf630b445cabe4ad026642f568",
- "6901df439dbf4b2180d24ad62e9db4f4",
- "2db40294cdc8476bae1eebb1c85d86fa",
- "c2a875b112014ea1a88e28fb1d887ccf",
- "4474549702694f8e87639d19d50498fd",
- "92480b75b5ac45e2bf7e55ce5c89daaf",
- "ffd337d71aaf4e1c92c5b53987aa7c72",
- "21e53784d9154c0f9e0755dd7db64b01",
- "394450e19075459ba59f53d4f11e21c2",
- "9d386da534e24c7fa7f26f2c7f6a2d17",
- "fcda6a6a2e8b4df0b5540e707ad486eb",
- "37e0240a1d0c4503afd28b0072168c15",
- "eb4f7add5c074781b7e9d104969c3564",
- "ffab83c3d271402197ecc4b51225411b",
- "c7b5d06f461c4ce9a089851c75647544",
- "c7c362eaa7ea4174b1dd64377445a4b3",
- "38dd0aae016e4bc48026d0ee30fb807a",
- "b0de69c2826d4a0ba34b7d7cbce4ff6e",
- "1b2721602abf42e1bb4d29fb3605644f",
- "fe546bd8269d48eba90fb932784eea43"
- ]
- },
- "id": "9M_iU6_hbv0J",
- "outputId": "b9fc43d9-883a-4795-8a37-8a2f4c545892"
- },
- "outputs": [],
- "source": [
- "import asyncio\n",
- "\n",
- "results = await asyncio.gather(*[\n",
- " answer_question(async_index, question) for question in questions\n",
- "])"
+ "data": {
+ "text/plain": [
+ "\"Nike's total revenues were $51.2 billion in fiscal year 2023, compared to $46.7 billion in fiscal year 2022. This represents a 10% increase in revenue from the previous year.\""
]
- },
+ },
+ "execution_count": 28,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "execution_count": 28
+ },
+ {
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2025-04-24T16:47:21.606033Z",
+ "start_time": "2025-04-24T16:47:21.590864Z"
+ }
+ },
+ "cell_type": "code",
+ "source": [
+ "# NBVAL_SKIP\n",
+ "query = \"What was Nike's total revenue in the last year compared to now??\"\n",
+ "\n",
+ "await answer_question(async_index, query)\n",
+ "\n",
+ "# notice no HTTP request to OpenAI since this question is \"close enough\" to the last one"
+ ],
+ "outputs": [
{
- "cell_type": "markdown",
- "metadata": {
- "id": "CpQ59SRgJkED"
- },
- "source": [
- "### Let's view the results"
- ]
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Cache hit!\n"
+ ]
},
{
- "cell_type": "code",
- "execution_count": 24,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "7SZM_xg3b9Gb",
- "outputId": "758ae31a-2291-4191-aa57-ee941d3319cb"
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Question: What is the trend in the company's revenue and profit over the past few years?\n",
- "Answer: \n",
- " The trend in the company's revenue and profit over the past few years shows a consistent increase. \n",
- "\n",
- "- Total revenues for NIKE, Inc. have increased from $44,538 million in fiscal year 2021 to $46,710 million in fiscal year 2022, and further to $51,217 million in fiscal year 2023.\n",
- "- Total Earnings Before Interest and Taxes (EBIT) have also shown growth, with EBIT increasing in most segments from fiscal year 2022 to fiscal year 2023.\n",
- "\n",
- "Overall, based on the provided financial data, the company has experienced a positive trend in both revenue and profit over the past few years. \n",
- "-----------\n",
- "\n",
- "Question: What are the company's primary revenue sources?\n",
- "Answer: \n",
- " The company's primary revenue sources are as follows:\n",
- "\n",
- "1. Footwear\n",
- "2. Apparel\n",
- "3. Equipment\n",
- "\n",
- "These revenue sources are further broken down by geographical segments and distribution channels. \n",
- "-----------\n",
- "\n",
- "Question: How much debt does the company have, and what are its capital expenditure plans?\n",
- "Answer: \n",
- " The company has a total long-term debt of $8,927 million as of May 31, 2023. The capital expenditure plans are not explicitly mentioned in the provided context. \n",
- "-----------\n",
- "\n",
- "Question: What does the company say about its environmental, social, and governance (ESG) practices?\n",
- "Answer: \n",
- " The company acknowledges the increased focus on sustainability matters, responsible sourcing, deforestation, energy and water usage, and the recyclability of packaging and materials. They mention that complying with legislative and regulatory initiatives related to climate change may lead to increased costs and complexity. The company has announced sustainability-related goals and targets, but there are risks and uncertainties associated with achieving them. They highlight that failure to meet these goals or respond effectively to new legal requirements could result in adverse publicity and impact their business and reputation. \n",
- "-----------\n",
- "\n",
- "Question: What is the company's strategy for growth?\n",
- "Answer: \n",
- " Based on the provided financial information, it appears that NIKE's strategy for growth includes focusing on expanding its operations in different geographic regions. The company's revenue breakdown shows growth in various segments such as North America, Europe, Middle East & Africa, and Asia Pacific & Latin America. Additionally, NIKE seems to be investing in property, plant, and equipment across different regions, indicating a commitment to expanding its physical infrastructure to support growth. Furthermore, the company's emphasis on Earnings Before Interest and Taxes (EBIT) as a key performance measure suggests a focus on improving profitability and operational efficiency to drive growth. \n",
- "-----------\n",
- "\n"
- ]
- }
- ],
- "source": [
- "for i, r in enumerate(results):\n",
- " print(f\"Question: {questions[i]}\")\n",
- " print(f\"Answer: \\n {r}\", \"\\n-----------\\n\")"
+ "data": {
+ "text/plain": [
+ "\"Nike's total revenues were $51.2 billion in fiscal year 2023, compared to $46.7 billion in fiscal year 2022. This represents a 10% increase in revenue from the previous year.\""
]
- },
+ },
+ "execution_count": 29,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "execution_count": 29
+ },
+ {
+ "metadata": {},
+ "cell_type": "markdown",
+ "source": [
+ "### Improve personalization by including chat session history\n",
+ "\n",
+ "To preserve state in the conversation, offload conversation history to a database that can handle high transaction throughput for writes/reads to limit system latency.\n"
+ ]
+ },
+ {
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2025-04-24T16:47:21.642412Z",
+ "start_time": "2025-04-24T16:47:21.634222Z"
+ }
+ },
+ "cell_type": "code",
+ "source": [
+ "from functools import wraps\n",
+ "from redisvl.extensions.session_manager import StandardSessionManager\n",
+ "\n",
+ "\n",
+ "class ChatBot:\n",
+ " def __init__(self, index: AsyncSearchIndex, vectorizer: BaseVectorizer, user: str):\n",
+ " self.index = index\n",
+ " self.vectorizer = vectorizer\n",
+ " self.session_manager = StandardSessionManager(\n",
+ " name=f\"chat_session_{user}\",\n",
+ " session_tag=user,\n",
+ " redis_url=REDIS_URL,\n",
+ " )\n",
+ "\n",
+ " @staticmethod\n",
+ " def promptify(query: str, context: str) -> str:\n",
+ " return f'''Use the provided context below derived from public financial\n",
+ " documents to answer the user's question. If you can't answer the user's\n",
+ " question, based on the context; do not guess. If there is no context at all,\n",
+ " respond with \"I don't know\".\n",
+ "\n",
+ " User question:\n",
+ "\n",
+ " {query}\n",
+ "\n",
+ " Helpful context:\n",
+ "\n",
+ " {context}\n",
+ "\n",
+ " Answer:\n",
+ " '''\n",
+ "\n",
+ " async def retrieve_context(self, query_vector) -> str:\n",
+ " \"\"\"Fetch the relevant context from Redis using vector search\"\"\"\n",
+ " results = await self.index.query(\n",
+ " VectorQuery(\n",
+ " vector=query_vector,\n",
+ " vector_field_name=\"text_embedding\",\n",
+ " return_fields=[\"content\"],\n",
+ " num_results=3\n",
+ " )\n",
+ " )\n",
+ " content = \"\\n\".join([result[\"content\"] for result in results])\n",
+ " return content\n",
+ "\n",
+ " async def clear_history(self):\n",
+ " \"\"\"Clear session chat\"\"\"\n",
+ " self.session_manager.clear()\n",
+ "\n",
+ " async def answer_question(self, query: str):\n",
+ " \"\"\"Answer the user's question with historical context and caching baked-in\"\"\"\n",
+ "\n",
+ " SYSTEM_PROMPT = \"\"\"You are a helpful financial analyst assistant that has access\n",
+ " to public financial 10k documents in order to answer users questions about company\n",
+ " performance, ethics, characteristics, and core information.\n",
+ " \"\"\"\n",
+ "\n",
+ " # Create query vector\n",
+ " query_vector = self.vectorizer.embed(query)\n",
+ "\n",
+ " # Check the cache with the vector\n",
+ " if result := llmcache.check(vector=query_vector):\n",
+ " answer = result[0]['response']\n",
+ " else:\n",
+ " context = await self.retrieve_context(query_vector)\n",
+ " session = self.session_manager.messages\n",
+ " messages = (\n",
+ " [{\"role\": \"system\", \"content\": SYSTEM_PROMPT}] +\n",
+ " session +\n",
+ " [{\"role\": \"user\", \"content\": self.promptify(query, context)}]\n",
+ " )\n",
+ " # Response provided by GPT-3.5\n",
+ " response = await openai.AsyncClient().chat.completions.create(\n",
+ " model=CHAT_MODEL,\n",
+ " messages=messages,\n",
+ " temperature=0.1,\n",
+ " seed=42\n",
+ " )\n",
+ " answer = response.choices[0].message.content\n",
+ " llmcache.store(query, answer, query_vector)\n",
+ "\n",
+ " # Add message history\n",
+ " self.session_manager.add_messages([\n",
+ " {\"role\": \"user\", \"content\": query},\n",
+ " {\"role\": \"assistant\", \"content\": answer}\n",
+ " ])\n",
+ "\n",
+ " return answer"
+ ],
+ "outputs": [],
+ "execution_count": 30
+ },
+ {
+ "metadata": {},
+ "cell_type": "markdown",
+ "source": "## Test the entire RAG workflow"
+ },
+ {
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2025-04-24T16:47:21.669248Z",
+ "start_time": "2025-04-24T16:47:21.663308Z"
+ }
+ },
+ "cell_type": "code",
+ "source": [
+ "# Setup Session\n",
+ "chat = ChatBot(async_index, vectorizer=hf, user=\"Andrew\")\n",
+ "await chat.clear_history()"
+ ],
+ "outputs": [],
+ "execution_count": 31
+ },
+ {
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2025-04-24T16:47:27.496044Z",
+ "start_time": "2025-04-24T16:47:21.702428Z"
+ }
+ },
+ "cell_type": "code",
+ "source": [
+ "# Run a simple chat\n",
+ "stopterms = [\"exit\", \"quit\", \"end\", \"cancel\"]\n",
+ "\n",
+ "# Simple Chat\n",
+ "# NBVAL_SKIP\n",
+ "while True:\n",
+ " user_query = input()\n",
+ " if user_query.lower() in stopterms or not user_query:\n",
+ " break\n",
+ " answer = await chat.answer_question(user_query)\n",
+ " print(answer, flush=True)"
+ ],
+ "outputs": [
{
- "cell_type": "markdown",
- "metadata": {
- "id": "D_eiWikCJkED"
- },
- "source": [
- "# You now have a working RAG pipeline!\n",
- "\n",
- "As you can see, it is easy to get started with RAG and we were able to get decent chat results from this simple setup. To go beyond the basic example though see the [advanced_rag](./04_advanced_redisvl.ipynb) notebook.\n",
- "\n",
- "This notebook covers:\n",
- "\n",
- "- **Improving accuracy** with dense content representations and query rewriting/expansion\n",
- "- **Improving performance and optimizing cost** with semantic caching\n",
- "- **Improving personalization** with chat session memory.\n"
- ]
- },
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Hi! How can I assist you today?\n"
+ ]
+ }
+ ],
+ "execution_count": 32
+ },
+ {
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2025-04-24T16:47:27.527276Z",
+ "start_time": "2025-04-24T16:47:27.522755Z"
+ }
+ },
+ "cell_type": "code",
+ "source": [
+ "# NBVAL_SKIP\n",
+ "chat.session_manager.messages"
+ ],
+ "outputs": [
{
- "cell_type": "markdown",
- "metadata": {
- "id": "Wscs4Mvo1ujD"
- },
- "source": [
- "## Cleanup\n",
- "\n",
- "Clean up the database."
+ "data": {
+ "text/plain": [
+ "[{'role': 'user', 'content': 'hi'},\n",
+ " {'role': 'assistant', 'content': 'Hi! How can I assist you today?'}]"
]
- },
+ },
+ "execution_count": 33,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "execution_count": 33
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "D_eiWikCJkED"
+ },
+ "source": [
+ "# You now have a working RAG pipeline!\n",
+ "\n",
+ "As you can see, it is easy to get started with RAG and we were able to get decent chat results from this simple setup. To go beyond the basic example, though, see the [advanced_rag](./04_advanced_redisvl.ipynb) notebook.\n",
+ "\n",
+ "This notebook covers:\n",
+ "\n",
+ "- **Improving accuracy** with dense content representations and query rewriting/expansion\n",
+ "- **Improving performance and optimizing cost** with semantic caching\n",
+ "- **Improving personalization** with chat session memory.\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "Wscs4Mvo1ujD"
+ },
+ "source": [
+ "## Cleanup\n",
+ "\n",
+ "Clean up the database."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "On6yNuQn1ujD",
+ "ExecuteTime": {
+ "end_time": "2025-04-24T16:47:34.042787Z",
+ "start_time": "2025-04-24T16:47:34.036106Z"
+ }
+ },
+ "source": [
+ "await async_index.client.flushall()"
+ ],
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 25,
- "metadata": {
- "id": "On6yNuQn1ujD"
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "True"
- ]
- },
- "execution_count": 25,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "await async_index.client.flushall()"
+ "data": {
+ "text/plain": [
+ "True"
]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "accelerator": "GPU",
- "colab": {
- "gpuType": "T4",
- "provenance": []
- },
- "kernelspec": {
- "display_name": "Python 3",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.11.9"
+ },
+ "execution_count": 37,
+ "metadata": {},
+ "output_type": "execute_result"
}
+ ],
+ "execution_count": 37
+ }
+ ],
+ "metadata": {
+ "accelerator": "GPU",
+ "colab": {
+ "gpuType": "T4",
+ "provenance": []
+ },
+ "kernelspec": {
+ "display_name": "env",
+ "language": "python",
+ "name": "python3"
},
- "nbformat": 4,
- "nbformat_minor": 0
-}
\ No newline at end of file
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.11"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/python-recipes/vector-search/01_redisvl.ipynb b/python-recipes/vector-search/01_redisvl.ipynb
index 28e893d4..10a8fb87 100644
--- a/python-recipes/vector-search/01_redisvl.ipynb
+++ b/python-recipes/vector-search/01_redisvl.ipynb
@@ -8,7 +8,8 @@
},
"source": [
"\n",
- "# Vector Search with Redisvl\n",
+ "# Vector Search with RedisVL\n",
+ "\n",
"## Let's Begin!\n",
"
\n"
]
@@ -22,9 +23,9 @@
"source": [
"## Prepare data\n",
"\n",
- "In this examples we will load a list of movie objects with the following attributes: `title`, `rating`, `description`, and `genre`.\n",
+ "In this example we will load a list of movies with the following attributes: `title`, `rating`, `description`, and `genre`.\n",
"\n",
- "For the vector part of our vector search we will embed the description so that user's can search for movies that best match what they're looking for.\n",
+ "We will embed the movie description so that users can search for movies that best match the kind of movie that they're looking for.\n",
"\n",
"**If you are running this notebook locally**, FYI you may not need to perform this step at all."
]
@@ -38,7 +39,7 @@
"base_uri": "https://localhost:8080/"
},
"id": "b966a9b5",
- "outputId": "61565924-8e01-4411-fac7-82346bb10e87"
+ "outputId": "8fb1aed9-94a3-47b2-af50-4eac9b08d7f1"
},
"outputs": [
{
@@ -46,12 +47,12 @@
"output_type": "stream",
"text": [
"Cloning into 'temp_repo'...\n",
- "remote: Enumerating objects: 384, done.\u001b[K\n",
- "remote: Counting objects: 100% (247/247), done.\u001b[K\n",
- "remote: Compressing objects: 100% (159/159), done.\u001b[K\n",
- "remote: Total 384 (delta 135), reused 153 (delta 74), pack-reused 137 (from 1)\u001b[K\n",
- "Receiving objects: 100% (384/384), 64.50 MiB | 15.56 MiB/s, done.\n",
- "Resolving deltas: 100% (159/159), done.\n"
+ "remote: Enumerating objects: 669, done.\u001b[K\n",
+ "remote: Counting objects: 100% (320/320), done.\u001b[K\n",
+ "remote: Compressing objects: 100% (207/207), done.\u001b[K\n",
+ "remote: Total 669 (delta 219), reused 141 (delta 112), pack-reused 349 (from 2)\u001b[K\n",
+ "Receiving objects: 100% (669/669), 57.77 MiB | 20.61 MiB/s, done.\n",
+ "Resolving deltas: 100% (287/287), done.\n"
]
}
],
@@ -74,30 +75,14 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": null,
"id": "c620286e",
"metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "c620286e",
- "outputId": "d69d35a0-29b2-4a9c-aa13-acf27d85a414"
+ "id": "c620286e"
},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/261.4 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m261.4/261.4 kB\u001b[0m \u001b[31m7.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
- "\u001b[?25h\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/96.1 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m96.1/96.1 kB\u001b[0m \u001b[31m6.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m \u001b[31m2.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m5.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
- "\u001b[?25h"
- ]
- }
- ],
+ "outputs": [],
"source": [
- "%pip install -q redis \"redisvl>=0.4.1\" numpy sentence-transformers pandas"
+ "%pip install -q \"redisvl==0.5.2\" sentence-transformers pandas nltk"
]
},
{
@@ -119,25 +104,12 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": null,
"id": "2cb85a99",
"metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "2cb85a99",
- "outputId": "70660a1f-9d1c-408b-f7a5-5981054fabc3"
+ "id": "2cb85a99"
},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "deb [signed-by=/usr/share/keyrings/redis-archive-keyring.gpg] https://packages.redis.io/deb jammy main\n",
- "Starting redis-stack-server, database path /var/lib/redis-stack\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"# NBVAL_SKIP\n",
"%%sh\n",
@@ -177,7 +149,7 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 1,
"id": "aefda1d1",
"metadata": {
"id": "aefda1d1"
@@ -185,6 +157,9 @@
"outputs": [],
"source": [
"import os\n",
+ "import warnings\n",
+ "\n",
+ "warnings.filterwarnings('ignore')\n",
"\n",
"# Replace values below with your own if using Redis Cloud instance\n",
"REDIS_HOST = os.getenv(\"REDIS_HOST\", \"localhost\") # ex: \"redis-18374.c253.us-central1-1.gce.cloud.redislabs.com\"\n",
@@ -207,322 +182,102 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 48,
"id": "370c1fcc",
"metadata": {
- "id": "370c1fcc"
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "370c1fcc",
+ "outputId": "2b5297c6-83b7-468f-b2ac-c47acf13ba2e"
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "True"
+ ]
+ },
+ "execution_count": 48,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"from redis import Redis\n",
"\n",
- "client = Redis.from_url(REDIS_URL)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "jCXiuk9ZTN_K",
- "metadata": {
- "id": "jCXiuk9ZTN_K"
- },
- "source": [
- "### Load Data"
+ "client = Redis.from_url(REDIS_URL)\n",
+ "client.ping()"
]
},
{
"cell_type": "code",
- "execution_count": 5,
- "id": "8d561462",
+ "execution_count": 4,
+ "id": "H4w8c3Bevzq4",
"metadata": {
"colab": {
- "base_uri": "https://localhost:8080/",
- "height": 206
+ "base_uri": "https://localhost:8080/"
},
- "id": "8d561462",
- "outputId": "04daf079-cd07-4369-b6ac-5c192b75163c"
+ "id": "H4w8c3Bevzq4",
+ "outputId": "a4d3b9a4-adda-436e-9aef-b4b0120720ab"
},
"outputs": [
{
"data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " title | \n",
- " genre | \n",
- " rating | \n",
- " description | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " Explosive Pursuit | \n",
- " action | \n",
- " 7 | \n",
- " A daring cop chases a notorious criminal acros... | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " Skyfall | \n",
- " action | \n",
- " 8 | \n",
- " James Bond returns to track down a dangerous n... | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " Fast & Furious 9 | \n",
- " action | \n",
- " 6 | \n",
- " Dom and his crew face off against a high-tech ... | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " Black Widow | \n",
- " action | \n",
- " 7 | \n",
- " Natasha Romanoff confronts her dark past and f... | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " John Wick | \n",
- " action | \n",
- " 8 | \n",
- " A retired hitman seeks vengeance against those... | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
"text/plain": [
- " title genre rating \\\n",
- "0 Explosive Pursuit action 7 \n",
- "1 Skyfall action 8 \n",
- "2 Fast & Furious 9 action 6 \n",
- "3 Black Widow action 7 \n",
- "4 John Wick action 8 \n",
- "\n",
- " description \n",
- "0 A daring cop chases a notorious criminal acros... \n",
- "1 James Bond returns to track down a dangerous n... \n",
- "2 Dom and his crew face off against a high-tech ... \n",
- "3 Natasha Romanoff confronts her dark past and f... \n",
- "4 A retired hitman seeks vengeance against those... "
+ "True"
]
},
- "execution_count": 5,
+ "execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "import pandas as pd\n",
- "import numpy as np\n",
- "import json\n",
- "\n",
- "df = pd.read_json(\"resources/movies.json\")\n",
- "df.head()\n"
+ "#client.flushall()"
]
},
{
- "cell_type": "code",
- "execution_count": 3,
- "id": "bfiTJovpQX90",
+ "cell_type": "markdown",
+ "id": "jCXiuk9ZTN_K",
"metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 595,
- "referenced_widgets": [
- "c207869558b94b008ba87bfd5d8b8436",
- "3ca57c1527a3433f8bfe2fa16fe34eb9",
- "3d76cb90a08a4c1f89758662d0174968",
- "d1b5068839cc4898a8c0267fa3d8a998",
- "12622ec52f4d46b195e50f912c7766a4",
- "3769c090432b4c648dcff7977b13ddd3",
- "f6fae3370a964cb78748f21db6e82d6a",
- "eab1013bb0814e47af3a5eb9eb615d1b",
- "2aa1ee46d71043f592799d84fcb79f8a",
- "3eb907c5cc88489883ef49ba45cc0e74",
- "720b9e53ce8c45f19092520ce81cff58",
- "f6d3a567ea4847be842e9d3ef10a4be9",
- "b59ee86804e54bcdac7bf6ae89e37cd1",
- "994141230a5a4635b8eca86c532e6745",
- "09565c9c1fb54d9181d43b1e3a2de97c",
- "182c97d1d4bd4ce39f11583d8be6f91d",
- "52cd4455eddd41e9b25d67ba664f253c",
- "0849a563f98e406fa50a3da03fd720d8",
- "3e07c28c8ca24ab09b602ef999e343fa",
- "7854bf2f856a4fc4896589ff9b9fadf9",
- "14be9bc7749c4b80a878649a36b3c182",
- "89aace18dec547c6a72e9c94dfc699c4",
- "8cf7515bcfe74908bc49e43b6e1a785d",
- "9170c69e1ca84b1d9de472a2c4c16ec0",
- "41232544b8ed4891a19d3ef64c6784f3",
- "ff8d7b71d06446ccbf5a63267190e3c6",
- "be8e6cf774604445a70f8f73217cd9ff",
- "88734d53c101470689ee413391326b4e",
- "f1ad2342eeb740c59d7bd29575a443ea",
- "bf01cc1cbf5a4ea3ae29f5894008f47c",
- "ede7e3629ac54bae973d3b786ad4b1cc",
- "aff5bf955a0242968996d236a7a3a14f",
- "8c9e8dd222db4983b1807aeaea0f13c9",
- "5e393ba73ae04a42b8b134422357a07e",
- "a58e0587e65c45b2bf94e8b5fed2c425",
- "bca910554bd644d3ac391ed523347b5b",
- "2c51ca0706b34c789942eb0c45bb8db4",
- "f11230bf2c284086b718161edda43789",
- "47df84f72e5f4275b29b02441666bb27",
- "52803b2a17934066a68992b4e1cd15f3",
- "7af1a52f51394a7aabbf07590ac7b006",
- "5e17e4e52dbd4147a30db81b0ee88ed7",
- "fd6f6fa57ffa42df895ab20572b627f6",
- "07b42039c9db49dea8a5cd5d52fb2a11",
- "48c102eeab6c41648ef11da7a01b6d2c",
- "4d6c743bb3404b5f974cd8d5db5e6fe3",
- "66c2aa45255d45da832ad4379685c36a",
- "0d2a9e52601e4984b674a4540ba370a0",
- "89943f3298844e6f847d548c53cd274f",
- "48b2e4809a6a491394e6a2b3ba4a3542",
- "1ddee34844724d528d8b981c3a2b3826",
- "19dcaa26e43a40b4a93a7395939a32f7",
- "61537a7b9e9f4664b40250ae937efc1c",
- "97127ea1dd0e4399b3ca971cb0d5aabf",
- "7df695ae6ab94df6ad8f63ac19aeb2a2",
- "21ec1238d59f445eaf9158db639197b6",
- "e2998bf92df44a8086baffa2e46cc8dd",
- "4d92bf9691c24ee98aaadce02a28b90e",
- "934964e932c54d83afce011d83e17586",
- "95e35e5d88ab42dcb304880814a0402f",
- "7ad579b5e8784f89827a3feb3a2e84c5",
- "dbd292f09dad4dd58fec8e800819042f",
- "69fe6e65e5e64e17a5bc638b91df00ac",
- "f7a3592ecc894abda6615095e06e6187",
- "1d61dcfedd8c413c86be0eb8d86342ba",
- "764798711665490b8d0144b4063535cc",
- "ba490f372a344542a090605ee9f32709",
- "fa9528ae65054a4aa29c7b8fec6168d2",
- "c3ab6754c3b64719b9f3cb9bc4c84a35",
- "2f77936dad444d9cbeda5b88529259f4",
- "8ee1c814a7914deaa8e565459a9e2e81",
- "06bf2fa2babb4258957e258900431608",
- "4cad0cc057464c4f83db83578c638a90",
- "cb6a2cd5a1014284af160e25241553a0",
- "fde1a0198e074b289ce3107e3275797e",
- "7767e3828d3b4f7bb43f02dfd4457b6d",
- "f39ed58a258c4d9bba15d815853b5eac",
- "7f26d9a697164560b41a4bad4e9d79c4",
- "8f3541e50cf44f32bf5102a4059f447d",
- "16ef64b20d5f4bffafbf30a494a42483",
- "b57f9b7b82ef4c43b2214d6a2b37b8d5",
- "7785eb78520348e5825c6f8a18002db5",
- "0ceb53edb3ed4589ba863a7e9bcf4a19",
- "7962ef186edd4ce58c2b681282bdf934",
- "4aac9b601df9402a9f986128a046618a",
- "1284c382b9684641b8eb088b9b09c475",
- "a63f45fdb1a84dd6991bd73a7077683b",
- "e2286bf1de154344a5271c1d4b9df4fc",
- "4f98a16386b34805a03bbc5faad31670",
- "0bf70e1726794c8aa1904229588f5238",
- "2aa06186d6d143ebba8c2c88e2fd57e6",
- "e422b19c671649cabcb4bcfc3033520f",
- "957da1a1be8c4b399a2ba831097e3d0f",
- "87fcadef861745bba208a7eedb2c6873",
- "8237e1e9afde439dba15f1bbcadec1e0",
- "b5b6693718dd45e083e19a5e74a977d8",
- "df0fb90997154cb1abef80ed9b4d6d71",
- "1a23894a56c34028813b3a02d93d50a9",
- "277ed164e19a4b59b88504cc002b3357",
- "7bde36f5d0994a559a4ec662c5e6749e",
- "b9c36a7ce429457d9bf4b57dd80e0f1c",
- "82e6460e421a4418b5a9dc3111f51ca8",
- "9796a30107e84cb3a6fc145068e448b1",
- "d7eab1938929460b91b8bab394cda312",
- "f951b9cdea4b49ceb8e41f427440d644",
- "b8dba133b3504b9ca849554af1be6883",
- "91d09bd1e5ab4ef9871cd5730ae9bb47",
- "0ed92763980a4a629c6753f86a8722ae",
- "ebe3eda6a5664b62833451d3357c63e7",
- "798584c2955744b2a5efc548e140fe72",
- "e7c31aaa918d46f09964cd6757df9f14",
- "14189e3afb0f4adab8c64182626fe376",
- "90cc3b37fab446b3958e29ea3f7fe830",
- "b8650e4803004e8c92f3f0a4b13d2de6",
- "88202fa95c024539b6bf25f2aeb410e4",
- "8f3adf792c184c5d8929124f889ddf7f",
- "b26228621b89415a8b62f8046a22a2eb",
- "c54612edcaee4d44aac54ba43b62d307",
- "75d2511e2ccc4fdf9f9a8ff3b4da4b5c",
- "e91f37d2cc0746c9ac6c7146d0922c60",
- "6f13021672f54937bdd7736d0de9e3d0",
- "b31cd4f334134bc5bbdcb7350958f297",
- "3bcceb8dbbde430abce3f8105dcc889d",
- "70bf8f37ba964e378fd92008e050b621",
- "747a1362caf44815983d694119266e2f",
- "6249cff80cf544108a3fedcae1ad5100",
- "683a2ce0c9914394aefb7c41ee9a3ad7",
- "fc6bfeb067bd4de7995f3e11289fff9e",
- "53b79d20a8c94f9d8c1700ac2c776257",
- "81cb9416c7884bf2a6cac1f528365006",
- "b45730819e204cee85b595b2c76dcf2a",
- "2159421296f04b5fbb0216843ff304cd"
- ]
- },
- "id": "bfiTJovpQX90",
- "outputId": "74602d14-31cb-48a1-84bd-5b31dc6c04e2"
+ "id": "jCXiuk9ZTN_K"
},
- "outputs": [],
"source": [
- "from redisvl.utils.vectorize import HFTextVectorizer\n",
- "\n",
- "hf = HFTextVectorizer(\"sentence-transformers/all-MiniLM-L6-v2\")\n",
- "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n"
+ "### Load Movies Dataset"
]
},
{
"cell_type": "code",
- "execution_count": 6,
- "id": "Vl3SehnxQvXo",
+ "execution_count": 49,
+ "id": "8d561462",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
- "height": 238,
- "referenced_widgets": [
- "19423ed277ec499fa9357086bbbb1c52",
- "f1479cb84d7b4be89b14d69b2907d0ea",
- "01f5386d347c42b1be6ad0ef5bb2b6c7",
- "62753da3cd114256880b6d117422035b",
- "3a6d421b0b344228ae9844f2224ae5ae",
- "cf582134ed0f48cdaf76f9eb90b00f8f",
- "bab5d8377cee48ababdf98a27767451d",
- "8935c81c9cd046c9a136deb65506af53",
- "dd7c4eceeb5546e4b4796fc7243b696e",
- "ddfe42cc2e1b4455a1ad41c46f6be855",
- "bc394ff0234e43bb853c0e7fff9daa39"
- ]
+ "height": 223
},
- "id": "Vl3SehnxQvXo",
- "outputId": "43ca3865-8b29-41c5-daca-8111a1d51b10"
+ "id": "8d561462",
+ "outputId": "75ae0f32-115f-427e-e426-9a018884e860"
},
"outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Loaded 20 movie entries\n"
+ ]
+ },
{
"data": {
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "summary": "{\n \"name\": \"df\",\n \"rows\": 20,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 20,\n \"samples\": [\n \"Explosive Pursuit\",\n \"Despicable Me\",\n \"The Incredibles\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"genre\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"comedy\",\n \"action\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rating\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 6,\n \"max\": 9,\n \"num_unique_values\": 4,\n \"samples\": [\n 8,\n 9\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"description\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 20,\n \"samples\": [\n \"A daring cop chases a notorious criminal across the city in a high-stakes game of cat and mouse.\",\n \"When a criminal mastermind uses a trio of orphan girls as pawns for a grand scheme, he finds their love is profoundly changing him for the better.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}",
+ "type": "dataframe",
+ "variable_name": "df"
+ },
"text/html": [
- "\n",
+ "\n",
+ "
\n",
+ "
\n",
"\n",
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ "
\n"
+ ],
+ "text/plain": [
+ " title genre rating \\\n",
+ "0 Explosive Pursuit action 7 \n",
+ "1 Skyfall action 8 \n",
+ "2 Fast & Furious 9 action 6 \n",
+ "3 Black Widow action 7 \n",
+ "4 John Wick action 8 \n",
+ "\n",
+ " description \n",
+ "0 A daring cop chases a notorious criminal acros... \n",
+ "1 James Bond returns to track down a dangerous n... \n",
+ "2 Dom and his crew face off against a high-tech ... \n",
+ "3 Natasha Romanoff confronts her dark past and f... \n",
+ "4 A retired hitman seeks vengeance against those... "
+ ]
+ },
+ "execution_count": 49,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import json\n",
+ "\n",
+ "df = pd.read_json(\"resources/movies.json\")\n",
+ "print(\"Loaded\", len(df), \"movie entries\")\n",
+ "\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "id": "bfiTJovpQX90",
+ "metadata": {
+ "id": "bfiTJovpQX90"
+ },
+ "outputs": [],
+ "source": [
+ "from redisvl.utils.vectorize import HFTextVectorizer\n",
+ "\n",
+ "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n",
+ "\n",
+ "\n",
+ "hf = HFTextVectorizer(\"sentence-transformers/all-MiniLM-L6-v2\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 51,
+ "id": "Vl3SehnxQvXo",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 206
+ },
+ "id": "Vl3SehnxQvXo",
+ "outputId": "6b9f5555-dee7-4fd6-8dae-628919cfdc74"
+ },
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "summary": "{\n \"name\": \"df\",\n \"rows\": 20,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 20,\n \"samples\": [\n \"Explosive Pursuit\",\n \"Despicable Me\",\n \"The Incredibles\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"genre\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"comedy\",\n \"action\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rating\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 6,\n \"max\": 9,\n \"num_unique_values\": 4,\n \"samples\": [\n 8,\n 9\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"description\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 20,\n \"samples\": [\n \"A daring cop chases a notorious criminal across the city in a high-stakes game of cat and mouse.\",\n \"When a criminal mastermind uses a trio of orphan girls as pawns for a grand scheme, he finds their love is profoundly changing him for the better.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"vector\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 20,\n \"samples\": [\n 
\"b'\\\\x9bf|=\\\\na\\\\n;\\\\xbf\\\\x91\\\\xb7;\\\\x19\\\\xcb~\\\\xbd\\\\xd9d\\\\xce\\\\xbb\\\\xda\\\\x16J=X\\\\xa7?=\\\\xd4v\\\\x95\\\\x17\\\\xbe\\\\x14\\\\x11\\\\x05\\\\xb94u\\\\xbf<\\\\xc3\\\\xe0b\\\\xba\\\\xd0\\\\xa6\\\\xa8\\\\xbd\\\\x84\\\\xdc\\\\xec\\\\xbcTc%=\\\\xfe\\\\xe6r\\\\xbb+OG=5(\\\\x85=s@\\\\xa2\\\\xbc.Z\\\\xd0\\\\xbd;%K\\\\xbd\\\\xa5\\\\xed\\\\x94\\\\xbcn\\\\xddH=\\\\xbb&F<\\\\xc8*\\\\xec<\\\\x8d\\\\xd8\\\\x8d\\\\xbd\\\\xc9Z\\\\x98<\\\\r\\\\xa3\\\\xa3=:g3\\\\xbd\\\\x1f\\\\xcd\\\\xbd\\\\xbd\\\\x11%\\\\xf7;\\\\r\\\\xf5z=\\\\x02\\\\xb5\\\\x8c=\\\\x91\\\\x0e\\\\xc6\\\\xbdlI\\\\x90\\\\xbd%\\\\x16\\\\xbd;}\\\\xe7\\\\x0c\\\\xbd!3\\\\xc9\\\\xbct\\\\xf8\\\\xbb\\\\xbc\\\\xd2&u\\\\xbbA\\\\x8f\\\\xca<\\\\xfe\\\\x7fJ=\\\\x0b\\\\xaf*=\\\\x8dOU\\\\xbd\\\\xcd\\\\xf0\\\\x95\\\\xbc\\\\x1d\\\\x02\\\\x19=1\\\\xf4K<\\\\xcf\\\\xc2\\\\t=H\\\\x83\\\\xac=\\\\x9e\\\\xd7\\\\xb8\\\\xbd\\\\xf4\\\\xb5\\\\x9c\\\\xbd9\\\\x85\\\\x18=\\\\x9cd&=93\\\\xf8<\\\\xf2\\\\xf7\\\\x88<5v\\\\xf2\\\\xbb$=[\\\\xbd\\\\xa3\\\\xac\\\\xee\\\\xbb7:A\\\\xbd\\\\xd9d\\\\x19\\\\xbd\\\\xb7c\\\\xf2\\\\xbb\\\\x84\\\\xb9x;\\\\xb0;O<\\\\xc11,\\\\xbc\\\\xe4\\\\xae\\\\xae=\\\\x9f\\\\x00-\\\\xbc\\\\x14\\\\x06\\\\xae\\\\xbdh\\\\xd6\\\\x1a=\\\\xc4\\\\xbf\\\\xcd=\\\\x19\\\\x150=\\\\xe8\\\\xf1\\\\x9d\\\\xbc\\\\xaaGK=\\\\xaf\\\\xb8 
=\\\\xb2\\\\xf1I\\\\xbdIe\\\\x9e\\\\xbb/\\\\x89\\\\xf7:\\\\x94\\\\xf8\\\\x1c=\\\\xa2\\\\xba\\\\xde<\\\\xa7o\\\\x16\\\\xbb\\\\t^p\\\\xbb\\\\xef\\\\xd5<<#\\\\xa6\\\\xa3\\\\xb8\\\\xc99s<\\\\xe83&<]\\\\x1c\\\\x18<\\\\x1c\\\\xd9-\\\\xbd\\\\xd3\\\\xe6\\\\x98<\\\\x0f\\\\xa1N=\\\\xa1/\\\\xa5=\\\\x1e\\\\xf3\\\\xddG\\\\xd6\\\\xbc\\\\x91\\\"S=\\\\xd7\\\\xd9^\\\\xbd\\\\xac\\\\xa3\\\\x91<\\\\xe5\\\\xd9\\\\x13<\\\\xbb\\\\xb2y\\\\xbbw\\\\x8d/\\\\xbd\\\\x99\\\\x06p\\\\xbd\\\\x83\\\\x1bF\\\\xbd\\\\xa2?\\\\x14\\\\xbe\\\\xc8\\\\x8f(\\\\xbd\\\\xe7O\\\\x89\\\\xbd\\\\x12\\\\xae\\\\xd4<\\\\xa6\\\\x12\\\\xc3=\\\\xb2\\\\x05O\\\\xbdZ\\\\x8ep\\\\xbc\\\\x1d\\\\xb5\\\\xac\\\\xbc\\\\xcc\\\\x9ee\\\\xbdf\\\\x8es;Ia\\\\xc1;\\\\xe5\\\\xfaB\\\\xbd\\\\x86\\\"\\\\xfe:\\\\x9c\\\\xe6\\\\xf4=\\\\xf6\\\\x15*<\\\\x81\\\\xf8\\\\x1b=\\\\x04\\\\xfcV\\\\xbd\\\\xd1\\\\xd1\\\\r==\\\\xee\\\\x06=\\\\x0cu\\\\xba\\\\xbd\\\\x10\\\\xa4\\\\xd6<\\\\xe3\\\\xeb\\\\xd9;\\\\xbe9/=\\\\xa9\\\\xc2\\\\x85=~\\\\x0b\\\"=\\\\xffi\\\\xef<7\\\\xe8c=\\\\xfb2\\\\x08\\\\xbe\\\\xe1\\\\x12;=YVW;P\\\\xa4b<\\\\xc8\\\\x9d\\\\xb7<\\\\x7fr;\\\\xbdhz\\\\x91\\\\xbcT\\\\x00<\\\\xbd\\\\x00\\\\x1a\\\\xa3<\\\\xca\\\\t\\\\xbb\\\\xa1\\\\xfb\\\\xe7\\\\xa5\\\\x9f\\\\x0c\\\\xbc\\\\x07Q\\\\x9a\\\\xbd\\\\xb3\\\\x08y\\\\xbd\\\\xdaAT;\\\\xddT\\\\xe2<\\\\xfe\\\\xff\\\\x1c\\\\xbd\\\\x8b\\\\xe4\\\\x9e=\\\\x8c-\\\\x0c;\\\\xc3\\\\x0f>;[8\\\\xea=>\\\\xb7\\\\xd5\\\\xbcN\\\\x8c\\\\xf9\\\\xbc\\\\xd7\\\\xc7\\\\xd2\\\\xbaa8\\\\t<\\\\t\\\\x8a\\\\x17\\\\xbdP\\\\x12A\\\\xbd\\\\x90\\\\x89\\\\x82\\\\xbbFy\\\\xc7=,\\\\xddy\\\\xbd\\\\xd2\\\\xf1\\\\x82<\\\\x1c\\\\xe0\\\\xb0<\\\\xdd\\\\x12\\\\xc8<\\\\xd5M\\\\xdf\\\\xbc\\\\x9f\\\\x16\\\\x9a=\\\\xa2W\\\\xb2<\\\\xcbab;\\\\x9di\\\\x96\\\\xbco\\\\x00W<\\\\'\\\\xb6\\\\xe4\\\\xbc\\\\x07 
\\\\xb8;^\\\\x0bI\\\\xbdQ\\\\xc0\\\\xbe\\\\xbc\\\\x92n\\\\x95\\\\xbc\\\\x9f\\\\x11\\\\x83=\\\\xd2\\\\xb0\\\\xf5\\\\xbc\\\\xc7g\\\\x8a9i=\\\\x04\\\\xcbT\\\\xbd\\\\xcbn\\\\xdf=\\\\xa4\\\\xe4\\\\xc3\\\\xbd\\\\xaa\\\\xdb\\\\xe6\\\\xbc&\\\\x1fI\\\\xbcC]\\\\x9b=\\\\r\\\\xd3y\\\\xbcb\\\\xc8\\\\xdb=\\\\xd9N\\\\xb8<\\\\xcf^%\\\\xbd\\\\xdeh\\\\xd9<\\\\xe5\\\\x88\\\\xd1\\\\xbd\\\\xce/\\\\t=\\\\xfc\\\\xa7A\\\\xbc\\\\xb6c1=\\\\xf5\\\\xc7R\\\\xbd\\\\xb9\\\\x00G\\\\xbdN\\\\xd1\\\\xbe=\\\\xf9X\\\\x17<\\\\xd4\\\\xea\\\\xd4\\\\xbc\\\\xe3\\\\x96\\\\x8c\\\\xbdx\\\\xd3\\\\xcd;\\\\xab/\\\\xdb<\\\\xd8L\\\\x80\n",
+ " \n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " title | \n",
+ " genre | \n",
+ " rating | \n",
+ " description | \n",
+ " vector | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " Explosive Pursuit | \n",
+ " action | \n",
+ " 7 | \n",
+ " A daring cop chases a notorious criminal acros... | \n",
+ " b'\\x9bf|=\\na\\n;\\xbf\\x91\\xb7;\\x19\\xcb~\\xbd\\xd9d... | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " Skyfall | \n",
+ " action | \n",
+ " 8 | \n",
+ " James Bond returns to track down a dangerous n... | \n",
+ " b'\\x9aD\\x9e\\xbd0\\x9b\\x89\\xbc\\xc3\\x16\\x95\\xbc\\x... | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " Fast & Furious 9 | \n",
+ " action | \n",
+ " 6 | \n",
+ " Dom and his crew face off against a high-tech ... | \n",
+ " b'*\\xa5\\xc7\\xbc\\xf6,\\xa2=?\\x19H\\xbcK\\xc6t\\xbd\\... | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " Black Widow | \n",
+ " action | \n",
+ " 7 | \n",
+ " Natasha Romanoff confronts her dark past and f... | \n",
+ " b'u\\xeb\\x85\\xbd\\x0e\\xcdo\\xbd&\\xe8\\xc2\\xbb6\\xcf... | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " John Wick | \n",
+ " action | \n",
+ " 8 | \n",
+ " A retired hitman seeks vengeance against those... | \n",
+ " b'\\xaf<x\\xbb\\xfb.\\xc5=B\\x86:;\\xce\\xd0\\x94<\\xf9... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ " \n",
+ " \n"
+ ],
+ "text/plain": [
+ " title genre rating \\\n",
+ "0 Explosive Pursuit action 7 \n",
+ "1 Skyfall action 8 \n",
+ "2 Fast & Furious 9 action 6 \n",
+ "3 Black Widow action 7 \n",
+ "4 John Wick action 8 \n",
+ "\n",
+ " description \\\n",
+ "0 A daring cop chases a notorious criminal acros... \n",
+ "1 James Bond returns to track down a dangerous n... \n",
+ "2 Dom and his crew face off against a high-tech ... \n",
+ "3 Natasha Romanoff confronts her dark past and f... \n",
+ "4 A retired hitman seeks vengeance against those... \n",
+ "\n",
+ " vector \n",
+ "0 b'\\x9bf|=\\na\\n;\\xbf\\x91\\xb7;\\x19\\xcb~\\xbd\\xd9d... \n",
+ "1 b'\\x9aD\\x9e\\xbd0\\x9b\\x89\\xbc\\xc3\\x16\\x95\\xbc\\x... \n",
+ "2 b'*\\xa5\\xc7\\xbc\\xf6,\\xa2=?\\x19H\\xbcK\\xc6t\\xbd\\... \n",
+ "3 b'u\\xeb\\x85\\xbd\\x0e\\xcdo\\xbd&\\xe8\\xc2\\xbb6\\xcf... \n",
+ "4 b'\\xaf\n",
+ " \n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " vector_distance | \n",
+ " title | \n",
+ " genre | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " movies:01JSHDN7Q4GG029M45HQY8Q5T2 | \n",
+ " 0.64973795414 | \n",
+ " Fast & Furious 9 | \n",
+ " action | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " movies:01JSHDN7Q40QYH6Q6TD7ES4TSG | \n",
+ " 0.763235092163 | \n",
+ " Mad Max: Fury Road | \n",
+ " action | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " movies:01JSHDN7Q4AS7C9VT582PWK14J | \n",
+ " 0.792449712753 | \n",
+ " The Lego Movie | \n",
+ " comedy | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ " \n",
+ " \n"
+ ],
+ "text/plain": [
+ " id vector_distance title \\\n",
+ "0 movies:01JSHDN7Q4GG029M45HQY8Q5T2 0.64973795414 Fast & Furious 9 \n",
+ "1 movies:01JSHDN7Q40QYH6Q6TD7ES4TSG 0.763235092163 Mad Max: Fury Road \n",
+ "2 movies:01JSHDN7Q4AS7C9VT582PWK14J 0.792449712753 The Lego Movie \n",
+ "\n",
+ " genre \n",
+ "0 action \n",
+ "1 action \n",
+ "2 comedy "
+ ]
+ },
+ "execution_count": 55,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from redisvl.query import VectorQuery\n",
+ "\n",
+ "user_query = \"High tech and action packed movie\"\n",
+ "\n",
+ "embedded_user_query = hf.embed(user_query)\n",
+ "\n",
+ "vec_query = VectorQuery(\n",
+ " vector=embedded_user_query,\n",
+ " vector_field_name=\"vector\",\n",
+ " num_results=3,\n",
+ " return_fields=[\"title\", \"genre\"],\n",
+ " return_score=True,\n",
+ ")\n",
+ "\n",
+ "result = index.query(vec_query)\n",
+ "pd.DataFrame(result)\n"
]
},
{
"cell_type": "markdown",
- "id": "24d3ea9c",
+ "id": "ef5e1997",
"metadata": {
- "id": "24d3ea9c"
+ "id": "ef5e1997"
},
"source": [
- "## Populate index"
+ "### Vector search with filters\n",
+ "\n",
+ "Redis allows you to combine vector search with filters on fields defined in the index, allowing you to create more specific searches."
]
},
{
- "cell_type": "code",
- "execution_count": 10,
- "id": "169ebb93",
+ "cell_type": "markdown",
+ "id": "kKCzyMUDDw10",
"metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "169ebb93",
- "outputId": "0e2b0eea-c058-4ee0-8116-197a452b53e0"
+ "id": "kKCzyMUDDw10"
},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "['rvl:c2f305b7836a40bdafb7223184e26b5b',\n",
- " 'rvl:59d2b0bd431148e9b2c5516b8417eb75',\n",
- " 'rvl:299b8027b07446fda39ca8bf37789776',\n",
- " 'rvl:6406f177887f401b9ab4d45956e324ae',\n",
- " 'rvl:d051b98b40f64279b9dad13f7d4666ce',\n",
- " 'rvl:55a20aec9cb74e33ae9e2d8952bcde25',\n",
- " 'rvl:a5ef870f9a5b41b38e950ea0fa7a9f83',\n",
- " 'rvl:7dd3e91a505a40b8bd1cf50613e4179c',\n",
- " 'rvl:4545e847159f441d843611f05df01983',\n",
- " 'rvl:646f249d9ff646e7ae5b757cd6caad0a',\n",
- " 'rvl:5cbf956aa543497c8e0a4e2a08e7fa53',\n",
- " 'rvl:e3d4efbae7bd49e49300d0c0e1e54426',\n",
- " 'rvl:2dbf334c02854aaab87c196f47cb2729',\n",
- " 'rvl:1dfb97323ddc47fdbc07aaa299658c99',\n",
- " 'rvl:ca2c31d6ce8740ca9e2ce711c0d9ef56',\n",
- " 'rvl:d9ba373d3a174ed1a08d096de012f56d',\n",
- " 'rvl:1e52ad75779046578e9b05499f78ec92',\n",
- " 'rvl:19415f0c3b2646b3abbf994762486e4f',\n",
- " 'rvl:ac862266170c4ee8942bf76791056295',\n",
- " 'rvl:9161346ce4b645dca6833386cc05ade4']"
- ]
- },
- "execution_count": 10,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
"source": [
- "index.load(df.to_dict(orient=\"records\"))"
+ "Search for the top 3 movies, restricted to the action genre:\n"
]
},
{
"cell_type": "code",
- "execution_count": 11,
- "id": "ae814790",
+ "execution_count": 56,
+ "id": "d499dcad",
"metadata": {
"colab": {
- "base_uri": "https://localhost:8080/"
+ "base_uri": "https://localhost:8080/",
+ "height": 143
},
- "id": "ae814790",
- "outputId": "d13768b9-a4b8-44a1-b26d-3f8e07f7afde"
+ "id": "d499dcad",
+ "outputId": "ab410048-da42-4b1e-a5fb-fbd6430ba437"
},
"outputs": [
{
"data": {
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "summary": "{\n \"name\": \"pd\",\n \"rows\": 3,\n \"fields\": [\n {\n \"column\": \"id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"movies:01JSHDN7Q4GG029M45HQY8Q5T2\",\n \"movies:01JSHDN7Q40QYH6Q6TD7ES4TSG\",\n \"movies:01JSHDN7Q4AHFG0J8D7Q8QS1BG\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"vector_distance\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"0.64973795414\",\n \"0.763235092163\",\n \"0.796153008938\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"Fast & Furious 9\",\n \"Mad Max: Fury Road\",\n \"Explosive Pursuit\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"genre\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"action\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}",
+ "type": "dataframe"
+ },
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " vector_distance | \n",
+ " title | \n",
+ " genre | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " movies:01JSHDN7Q4GG029M45HQY8Q5T2 | \n",
+ " 0.64973795414 | \n",
+ " Fast & Furious 9 | \n",
+ " action | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " movies:01JSHDN7Q40QYH6Q6TD7ES4TSG | \n",
+ " 0.763235092163 | \n",
+ " Mad Max: Fury Road | \n",
+ " action | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " movies:01JSHDN7Q4AHFG0J8D7Q8QS1BG | \n",
+ " 0.796153008938 | \n",
+ " Explosive Pursuit | \n",
+ " action | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
"text/plain": [
- "{'index_name': 'movies',\n",
- " 'index_options': [],\n",
- " 'index_definition': ['key_type',\n",
- " 'HASH',\n",
- " 'prefixes',\n",
- " ['rvl'],\n",
- " 'default_score',\n",
- " '1'],\n",
- " 'attributes': [['identifier',\n",
- " 'title',\n",
- " 'attribute',\n",
- " 'title',\n",
- " 'type',\n",
- " 'TEXT',\n",
- " 'WEIGHT',\n",
- " '1'],\n",
- " ['identifier',\n",
- " 'description',\n",
- " 'attribute',\n",
- " 'description',\n",
- " 'type',\n",
- " 'TEXT',\n",
- " 'WEIGHT',\n",
- " '1'],\n",
- " ['identifier',\n",
- " 'genre',\n",
- " 'attribute',\n",
- " 'genre',\n",
- " 'type',\n",
- " 'TAG',\n",
- " 'SEPARATOR',\n",
- " ',',\n",
- " 'SORTABLE'],\n",
- " ['identifier',\n",
- " 'rating',\n",
- " 'attribute',\n",
- " 'rating',\n",
- " 'type',\n",
- " 'NUMERIC',\n",
- " 'SORTABLE',\n",
- " 'UNF'],\n",
- " ['identifier',\n",
- " 'vector',\n",
- " 'attribute',\n",
- " 'vector',\n",
- " 'type',\n",
- " 'VECTOR',\n",
- " 'algorithm',\n",
- " 'HNSW',\n",
- " 'data_type',\n",
- " 'FLOAT32',\n",
- " 'dim',\n",
- " 384,\n",
- " 'distance_metric',\n",
- " 'COSINE',\n",
- " 'M',\n",
- " 16,\n",
- " 'ef_construction',\n",
- " 200]],\n",
- " 'num_docs': 20,\n",
- " 'max_doc_id': 20,\n",
- " 'num_terms': 432,\n",
- " 'num_records': 583,\n",
- " 'inverted_sz_mb': '0.043068885803222656',\n",
- " 'vector_index_sz_mb': '1.7178497314453125',\n",
- " 'total_inverted_index_blocks': 436,\n",
- " 'offset_vectors_sz_mb': '5.283355712890625e-4',\n",
- " 'doc_table_size_mb': '0.001983642578125',\n",
- " 'sortable_values_size_mb': '0.00102996826171875',\n",
- " 'key_table_size_mb': '7.74383544921875e-4',\n",
- " 'tag_overhead_sz_mb': '5.53131103515625e-5',\n",
- " 'text_overhead_sz_mb': '0.0144195556640625',\n",
- " 'total_index_memory_sz_mb': '0.06429290771484375',\n",
- " 'geoshapes_sz_mb': '0',\n",
- " 'records_per_doc_avg': '29.149999618530273',\n",
- " 'bytes_per_record_avg': '77.46311950683594',\n",
- " 'offsets_per_term_avg': '0.9502573013305664',\n",
- " 'offset_bits_per_record_avg': '8',\n",
- " 'hash_indexing_failures': 0,\n",
- " 'total_indexing_time': '1.5219999551773071',\n",
- " 'indexing': 0,\n",
- " 'percent_indexed': '1',\n",
- " 'number_of_uses': 1,\n",
- " 'cleaning': 0,\n",
- " 'gc_stats': ['bytes_collected',\n",
- " '0',\n",
- " 'total_ms_run',\n",
- " '0',\n",
- " 'total_cycles',\n",
- " '0',\n",
- " 'average_cycle_time_ms',\n",
- " 'nan',\n",
- " 'last_run_time_ms',\n",
- " '0',\n",
- " 'gc_numeric_trees_missed',\n",
- " '0',\n",
- " 'gc_blocks_denied',\n",
- " '0'],\n",
- " 'cursor_stats': ['global_idle',\n",
- " 0,\n",
- " 'global_total',\n",
- " 0,\n",
- " 'index_capacity',\n",
- " 128,\n",
- " 'index_total',\n",
- " 0],\n",
- " 'dialect_stats': ['dialect_1',\n",
- " 0,\n",
- " 'dialect_2',\n",
- " 0,\n",
- " 'dialect_3',\n",
- " 0,\n",
- " 'dialect_4',\n",
- " 0],\n",
- " 'Index Errors': ['indexing failures',\n",
- " 0,\n",
- " 'last indexing error',\n",
- " 'N/A',\n",
- " 'last indexing error key',\n",
- " 'N/A'],\n",
- " 'field statistics': [['identifier',\n",
- " 'title',\n",
- " 'attribute',\n",
- " 'title',\n",
- " 'Index Errors',\n",
- " ['indexing failures',\n",
- " 0,\n",
- " 'last indexing error',\n",
- " 'N/A',\n",
- " 'last indexing error key',\n",
- " 'N/A']],\n",
- " ['identifier',\n",
- " 'description',\n",
- " 'attribute',\n",
- " 'description',\n",
- " 'Index Errors',\n",
- " ['indexing failures',\n",
- " 0,\n",
- " 'last indexing error',\n",
- " 'N/A',\n",
- " 'last indexing error key',\n",
- " 'N/A']],\n",
- " ['identifier',\n",
- " 'genre',\n",
- " 'attribute',\n",
- " 'genre',\n",
- " 'Index Errors',\n",
- " ['indexing failures',\n",
- " 0,\n",
- " 'last indexing error',\n",
- " 'N/A',\n",
- " 'last indexing error key',\n",
- " 'N/A']],\n",
- " ['identifier',\n",
- " 'rating',\n",
- " 'attribute',\n",
- " 'rating',\n",
- " 'Index Errors',\n",
- " ['indexing failures',\n",
- " 0,\n",
- " 'last indexing error',\n",
- " 'N/A',\n",
- " 'last indexing error key',\n",
- " 'N/A']],\n",
- " ['identifier',\n",
- " 'vector',\n",
- " 'attribute',\n",
- " 'vector',\n",
- " 'Index Errors',\n",
- " ['indexing failures',\n",
- " 0,\n",
- " 'last indexing error',\n",
- " 'N/A',\n",
- " 'last indexing error key',\n",
- " 'N/A']]]}"
+ " id vector_distance title \\\n",
+ "0 movies:01JSHDN7Q4GG029M45HQY8Q5T2 0.64973795414 Fast & Furious 9 \n",
+ "1 movies:01JSHDN7Q40QYH6Q6TD7ES4TSG 0.763235092163 Mad Max: Fury Road \n",
+ "2 movies:01JSHDN7Q4AHFG0J8D7Q8QS1BG 0.796153008938 Explosive Pursuit \n",
+ "\n",
+ " genre \n",
+ "0 action \n",
+ "1 action \n",
+ "2 action "
]
},
- "execution_count": 11,
+ "execution_count": 56,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "index.info()"
+ "from redisvl.query.filter import Tag\n",
+ "\n",
+ "tag_filter = Tag(\"genre\") == \"action\"\n",
+ "\n",
+ "vec_query.set_filter(tag_filter)\n",
+ "\n",
+ "result=index.query(vec_query)\n",
+ "pd.DataFrame(result)"
]
},
{
"cell_type": "markdown",
- "id": "87ba1dfd",
+ "id": "YAh3GDS4Dudu",
"metadata": {
- "id": "87ba1dfd"
+ "id": "YAh3GDS4Dudu"
},
"source": [
- "## Index loaded now we can perform vector search\n",
- "\n",
- "### basic vector search"
+ "Search for the top 3 movies in the action genre with a rating of 7 or higher:\n"
]
},
{
"cell_type": "code",
- "execution_count": 12,
- "id": "9454e60d",
+ "execution_count": 57,
+ "id": "f59fff2c",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
- "height": 175,
- "referenced_widgets": [
- "976416d95f3249f39cec209026f90acf",
- "836d9d5318364d74b83cee3bbe2111d1",
- "19f47b9533b744d8bd8b30c2e9c4cbe7",
- "2d9021782ce449749906815fd91dbb79",
- "ecb804cbb4e448ceb70a53698db36a3f",
- "adc4c630cc6745ee92f72c8ddb06719f",
- "6a7d5463952048368b13493f500a4b34",
- "c20fffa299d34b15939eec4290953f88",
- "d56cf898bd674ffba8466f2e49903742",
- "a7624d50778c45028d9cf79d72c3089f",
- "6eab1db652d3452e9bfefd79d56056c5"
- ]
+ "height": 143
},
- "id": "9454e60d",
- "outputId": "b51d5902-6891-4170-84c1-d600d3e41b1e"
+ "id": "f59fff2c",
+ "outputId": "d6909c59-a947-4e58-a13a-8d0c2169a6b3"
},
"outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "976416d95f3249f39cec209026f90acf",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "Batches: 0%| | 0/1 [00:00, ?it/s]"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
{
"data": {
"application/vnd.google.colaboratory.intrinsic+json": {
- "summary": "{\n \"name\": \"pd\",\n \"rows\": 3,\n \"fields\": [\n {\n \"column\": \"id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"rvl:299b8027b07446fda39ca8bf37789776\",\n \"rvl:19415f0c3b2646b3abbf994762486e4f\",\n \"rvl:d9ba373d3a174ed1a08d096de012f56d\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"vector_distance\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"0.685773432255\",\n \"0.801602959633\",\n \"0.812341988087\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"Fast & Furious 9\",\n \"Despicable Me\",\n \"The Incredibles\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rating\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"6\",\n \"7\",\n \"8\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"genre\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"comedy\",\n \"action\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}",
+ "summary": "{\n \"name\": \"pd\",\n \"rows\": 3,\n \"fields\": [\n {\n \"column\": \"id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"movies:01JSHDN7Q40QYH6Q6TD7ES4TSG\",\n \"movies:01JSHDN7Q4AHFG0J8D7Q8QS1BG\",\n \"movies:01JSHDN7Q481PGAEDBX0QG75RP\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"vector_distance\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"0.763235092163\",\n \"0.796153008938\",\n \"0.87649422884\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"Mad Max: Fury Road\",\n \"Explosive Pursuit\",\n \"Inception\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rating\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"8\",\n \"7\",\n \"9\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"genre\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"action\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}",
"type": "dataframe"
},
"text/html": [
"\n",
- "