diff --git a/python-recipes/RAG/01_redisvl.ipynb b/python-recipes/RAG/01_redisvl.ipynb index fef526d3..b1d2d34c 100644 --- a/python-recipes/RAG/01_redisvl.ipynb +++ b/python-recipes/RAG/01_redisvl.ipynb @@ -35,40 +35,40 @@ }, { "cell_type": "code", + "execution_count": 8, "metadata": { + "ExecuteTime": { + "end_time": "2025-04-24T04:41:18.607703Z", + "start_time": "2025-04-24T04:41:11.664107Z" + }, "colab": { "base_uri": "https://localhost:8080/" }, "id": "AJJ2UW6M1ui0", - "outputId": "0f5773b7-a292-4ee6-f4bd-20dc40ca2aba", - "ExecuteTime": { - "end_time": "2025-04-24T04:41:18.607703Z", - "start_time": "2025-04-24T04:41:11.664107Z" - } + "outputId": "0f5773b7-a292-4ee6-f4bd-20dc40ca2aba" }, - "source": [ - "# NBVAL_SKIP\n", - "!git clone https://github.com/redis-developer/redis-ai-resources.git temp_repo\n", - "!mv temp_repo/python-recipes/RAG/resources .\n", - "!rm -rf temp_repo" - ], "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Cloning into 'temp_repo'...\r\n", - "remote: Enumerating objects: 679, done.\u001B[K\r\n", - "remote: Counting objects: 100% (330/330), done.\u001B[Kjects: 82% (271/330)\u001B[K\r\n", - "remote: Compressing objects: 100% (214/214), done.\u001B[K\r\n", - "remote: Total 679 (delta 227), reused 148 (delta 115), pack-reused 349 (from 2)\u001B[K\r\n", + "remote: Enumerating objects: 679, done.\u001b[K\r\n", + "remote: Counting objects: 100% (330/330), done.\u001b[Kjects: 82% (271/330)\u001b[K\r\n", + "remote: Compressing objects: 100% (214/214), done.\u001b[K\r\n", + "remote: Total 679 (delta 227), reused 148 (delta 115), pack-reused 349 (from 2)\u001b[K\r\n", "Receiving objects: 100% (679/679), 57.80 MiB | 11.09 MiB/s, done.\r\n", "Resolving deltas: 100% (295/295), done.\r\n", "mv: rename temp_repo/python-recipes/RAG/resources to ./resources: Directory not empty\r\n" ] } ], - "execution_count": 8 + "source": [ + "# NBVAL_SKIP\n", + "!git clone https://github.com/redis-developer/redis-ai-resources.git temp_repo\n", + "!mv temp_repo/python-recipes/RAG/resources .\n", + "!rm -rf temp_repo" + ] }, { "cell_type": "markdown", @@ -81,33 +81,33 @@ }, { "cell_type": "code", + "execution_count": 9, "metadata": { + "ExecuteTime": { + "end_time": "2025-04-24T04:41:20.572419Z", + "start_time": "2025-04-24T04:41:18.616143Z" + }, "colab": { "base_uri": "https://localhost:8080/" }, "id": "DgxBQFXQ1ui2", - "outputId": "c3c399d6-e294-4a3a-a0a3-82d818509991", - "ExecuteTime": { - "end_time": "2025-04-24T04:41:20.572419Z", - "start_time": "2025-04-24T04:41:18.616143Z" - } + "outputId": "c3c399d6-e294-4a3a-a0a3-82d818509991" }, - "source": [ - "%pip install -q redis \"redisvl>=0.4.1\" langchain-community pypdf sentence-transformers langchain openai pandas" - ], "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\r\n", - "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m A new release of pip is available: \u001B[0m\u001B[31;49m24.0\u001B[0m\u001B[39;49m -> \u001B[0m\u001B[32;49m25.0.1\u001B[0m\r\n", - "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m To update, run: \u001B[0m\u001B[32;49mpip install --upgrade pip\u001B[0m\r\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.0.1\u001b[0m\r\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\r\n", "Note: you may need to restart the kernel to use updated packages.\n" ] } ], - "execution_count": 9 + "source": [ + "%pip install -q \"redisvl>=0.6.0\" langchain-community pypdf sentence-transformers langchain openai pandas" + ] }, { "cell_type": "markdown", @@ -134,6 +134,7 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -141,6 +142,7 @@ "id": "c0d5lfNxJkD8", "outputId": "f96e72fa-b9f3-476f-bc9e-328bd30d1344" }, + "outputs": [], "source": [ "# NBVAL_SKIP\n", "%%sh\n", @@ -149,9 +151,7 @@ "sudo apt-get update > /dev/null 2>&1\n", "sudo apt-get install redis-stack-server > /dev/null 2>&1\n", "redis-stack-server --daemonize yes" - ], - "outputs": [], - "execution_count": null + ] }, { "cell_type": "markdown", @@ -180,13 +180,15 @@ }, { "cell_type": "code", + "execution_count": 3, "metadata": { - "id": "ggh5TzhkJkD9", "ExecuteTime": { "end_time": "2025-04-24T16:46:45.583246Z", "start_time": "2025-04-24T16:46:45.581177Z" - } + }, + "id": "ggh5TzhkJkD9" }, + "outputs": [], "source": [ "import os\n", "\n", @@ -197,9 +199,7 @@ "\n", "# If SSL is enabled on the endpoint, use rediss:// as the URL prefix\n", "REDIS_URL = f\"redis://:{REDIS_PASSWORD}@{REDIS_HOST}:{REDIS_PORT}\"" - ], - "outputs": [], - "execution_count": 3 + ] }, { "cell_type": "markdown", @@ -227,27 +227,18 @@ }, { "cell_type": "code", + "execution_count": 4, "metadata": { + "ExecuteTime": { + "end_time": "2025-04-24T16:46:46.043726Z", + "start_time": "2025-04-24T16:46:45.600472Z" + }, "colab": { "base_uri": "https://localhost:8080/" }, "id": "uijl2qFH1ui3", - "outputId": "a99b3fcb-7cfd-4dbd-f258-57779cfcae3c", - "ExecuteTime": { - "end_time": "2025-04-24T16:46:46.043726Z", - "start_time": "2025-04-24T16:46:45.600472Z" - } + "outputId": "a99b3fcb-7cfd-4dbd-f258-57779cfcae3c" }, - "source": [ - "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", - "from langchain_community.document_loaders import PyPDFLoader\n", - "\n", - "# Load list of pdfs from a folder\n", - "data_path = \"resources/\"\n", - "docs = [os.path.join(data_path, file) for file in os.listdir(data_path)]\n", - "\n", - "print(\"Listing available documents ...\", docs)" - ], "outputs": [ { "name": "stdout", @@ -257,21 +248,40 @@ ] } ], - "execution_count": 4 + "source": [ + "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", + "from langchain_community.document_loaders import PyPDFLoader\n", + "\n", + "# Load list of pdfs from a folder\n", + "data_path = \"resources/\"\n", + "docs = [os.path.join(data_path, file) for file in os.listdir(data_path)]\n", + "\n", + "print(\"Listing available documents ...\", docs)" + ] }, { "cell_type": "code", + "execution_count": 5, "metadata": { + "ExecuteTime": { + "end_time": "2025-04-24T16:46:50.509810Z", + "start_time": "2025-04-24T16:46:46.104219Z" + }, "colab": { "base_uri": "https://localhost:8080/" }, "id": "anya8hVnT6K_", - "outputId": "a8430acc-2e6d-45fd-fc8b-601fbbd8289b", - "ExecuteTime": { - "end_time": "2025-04-24T16:46:50.509810Z", - "start_time": "2025-04-24T16:46:46.104219Z" - } + "outputId": "a8430acc-2e6d-45fd-fc8b-601fbbd8289b" }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Done preprocessing. Created 211 chunks of the original pdf resources/nke-10k-2023.pdf\n" + ] + } + ], "source": [ "# pick out the Nike doc for this exercise\n", "doc = [doc for doc in docs if \"nke\" in doc][0]\n", @@ -286,17 +296,7 @@ "chunks = loader.load_and_split(text_splitter)\n", "\n", "print(\"Done preprocessing. Created\", len(chunks), \"chunks of the original pdf\", doc)" - ], - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Done preprocessing. Created 211 chunks of the original pdf resources/nke-10k-2023.pdf\n" - ] - } - ], - "execution_count": 5 + ] }, { "cell_type": "markdown", @@ -310,7 +310,12 @@ }, { "cell_type": "code", + "execution_count": 6, "metadata": { + "ExecuteTime": { + "end_time": "2025-04-24T16:46:55.588165Z", + "start_time": "2025-04-24T16:46:50.528240Z" + }, "colab": { "base_uri": "https://localhost:8080/", "height": 661, @@ -461,41 +466,44 @@ ] }, "id": "N3iQ2aLEJkD9", - "outputId": "b0f0d2c1-41dc-4932-990b-53d2912af19e", - "ExecuteTime": { - "end_time": "2025-04-24T16:46:55.588165Z", - "start_time": "2025-04-24T16:46:50.528240Z" - } + "outputId": "b0f0d2c1-41dc-4932-990b-53d2912af19e" }, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "import warnings\n", "import pandas as pd\n", "from redisvl.utils.vectorize import HFTextVectorizer, BaseVectorizer\n", + "from redisvl.extensions.cache.embeddings import EmbeddingsCache\n", "\n", "warnings.filterwarnings(\"ignore\")\n", - "\n", - "hf = HFTextVectorizer(\"sentence-transformers/all-MiniLM-L6-v2\")\n", "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n", "\n", + "hf = HFTextVectorizer(\n", + " model=\"sentence-transformers/all-MiniLM-L6-v2\",\n", + " cache=EmbeddingsCache(\n", + " name=\"embedcache\",\n", + " ttl=600,\n", + " redis_url=REDIS_URL,\n", + " )\n", + ")\n", + "\n", "# Embed each chunk content\n", "embeddings = hf.embed_many([chunk.page_content for chunk in chunks])\n", "\n", "# Check to make sure we've created enough embeddings, 1 per document chunk\n", "len(embeddings) == len(chunks)" - ], - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "execution_count": 6 + ] }, { "cell_type": "markdown", @@ -510,20 +518,21 @@ }, { "cell_type": "code", + "execution_count": 7, "metadata": { - "id": "zB1EW_9n1ui-", "ExecuteTime": { "end_time": "2025-04-24T16:46:55.611260Z", "start_time": "2025-04-24T16:46:55.598846Z" - } + }, + "id": "zB1EW_9n1ui-" }, + "outputs": [], "source": [ "from redisvl.index import SearchIndex\n", "\n", "\n", "index_name = \"redisvl\"\n", "\n", - "\n", "schema = {\n", " \"index\": {\n", " \"name\": index_name,\n", @@ -553,24 +562,18 @@ " }\n", " ]\n", "}" - ], - "outputs": [], - "execution_count": 7 + ] }, { "cell_type": "code", + "execution_count": 8, "metadata": { - "id": "LKuQku2CJkD9", "ExecuteTime": { "end_time": "2025-04-24T16:46:55.630056Z", "start_time": "2025-04-24T16:46:55.620207Z" - } + }, + "id": "LKuQku2CJkD9" }, - "source": [ - "# create an index from schema and the client\n", - "index = SearchIndex.from_dict(schema, redis_url=REDIS_URL)\n", - "index.create(overwrite=True, drop=True)" - ], "outputs": [ { "name": "stdout", @@ -580,10 +583,15 @@ ] } ], - "execution_count": 8 + "source": [ + "# create an index from schema and the client\n", + "index = SearchIndex.from_dict(schema, redis_url=REDIS_URL)\n", + "index.create(overwrite=True, drop=True)" + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -591,30 +599,26 @@ "id": "L6GOqmeN1ui_", "outputId": "91a199e3-d087-4b15-9544-d59efa6033c5" }, + "outputs": [], "source": [ "# use the RedisVL CLI tool to list all indices\n", "!rvl index listall" - ], - "outputs": [], - "execution_count": null + ] }, { "cell_type": "code", + "execution_count": 10, "metadata": { + "ExecuteTime": { + "end_time": "2025-04-24T16:46:56.828176Z", + "start_time": "2025-04-24T16:46:56.283831Z" + }, "colab": { "base_uri": "https://localhost:8080/" }, "id": "C70C-UWj1ujA", - "outputId": "1fb7a2d6-ae6d-4536-b4b7-702620efd128", - "ExecuteTime": { - "end_time": "2025-04-24T16:46:56.828176Z", - "start_time": "2025-04-24T16:46:56.283831Z" - } + "outputId": "1fb7a2d6-ae6d-4536-b4b7-702620efd128" }, - "source": [ - "# get info about the index\n", - "!rvl index info -i redisvl" - ], "outputs": [ { "name": "stdout", @@ -639,7 +643,10 @@ ] } ], - "execution_count": 10 + "source": [ + "# get info about the index\n", + "!rvl index info -i redisvl" + ] }, { "cell_type": "markdown", @@ -653,13 +660,15 @@ }, { "cell_type": "code", + "execution_count": 11, "metadata": { - "id": "Zsg09Keg1ujA", "ExecuteTime": { "end_time": "2025-04-24T16:46:56.895623Z", "start_time": "2025-04-24T16:46:56.836700Z" - } + }, + "id": "Zsg09Keg1ujA" }, + "outputs": [], "source": [ "# load expects an iterable of dictionaries\n", "from redisvl.redis.utils import array_to_buffer\n", @@ -675,9 +684,7 @@ "\n", "# RedisVL handles batching automatically\n", "keys = index.load(data, id_field=\"chunk_id\")" - ], - "outputs": [], - "execution_count": 11 + ] }, { "cell_type": "markdown", @@ -691,7 +698,12 @@ }, { "cell_type": "code", + "execution_count": 12, "metadata": { + "ExecuteTime": { + "end_time": "2025-04-24T16:46:56.991529Z", + "start_time": "2025-04-24T16:46:56.903370Z" + }, "colab": { "base_uri": "https://localhost:8080/", "height": 85, @@ -710,12 +722,20 @@ ] }, "id": "BkFv-_iC1ujB", - "outputId": "c398d356-6bb7-43a9-ca95-cb7f167d1f38", - "ExecuteTime": { - "end_time": "2025-04-24T16:46:56.991529Z", - "start_time": "2025-04-24T16:46:56.903370Z" - } + "outputId": "c398d356-6bb7-43a9-ca95-cb7f167d1f38" }, + "outputs": [ + { + "data": { + "text/plain": [ + "'*=>[KNN 3 @text_embedding $vector AS vector_distance] RETURN 3 chunk_id content vector_distance SORTBY vector_distance ASC DIALECT 2 LIMIT 0 3'" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from redisvl.query import VectorQuery\n", "\n", @@ -733,56 +753,26 @@ "\n", "# show the raw redis query\n", "str(vector_query)" - ], - "outputs": [ - { - "data": { - "text/plain": [ - "'*=>[KNN 3 @text_embedding $vector AS vector_distance] RETURN 3 chunk_id content vector_distance SORTBY vector_distance ASC DIALECT 2 LIMIT 0 3'" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "execution_count": 12 + ] }, { "cell_type": "code", + "execution_count": 13, "metadata": { + "ExecuteTime": { + "end_time": "2025-04-24T16:46:57.008139Z", + "start_time": "2025-04-24T16:46:56.999381Z" + }, "colab": { "base_uri": "https://localhost:8080/", "height": 143 }, "id": "5reL5qTW1ujC", - "outputId": "dd58f191-54f5-4226-c4e1-70207d58f2dc", - "ExecuteTime": { - "end_time": "2025-04-24T16:46:57.008139Z", - "start_time": "2025-04-24T16:46:56.999381Z" - } + "outputId": "dd58f191-54f5-4226-c4e1-70207d58f2dc" }, - "source": [ - "# execute the query with RedisVL\n", - "result=index.query(vector_query)\n", - "\n", - "# view the results\n", - "pd.DataFrame(result)" - ], "outputs": [ { "data": { - "text/plain": [ - " id vector_distance chunk_id \\\n", - "0 chunk:88 0.337694525719 88 \n", - "1 chunk:80 0.34205275774 80 \n", - "2 chunk:87 0.357761025429 87 \n", - "\n", - " content \n", - "0 Asia Pacific & Latin America 1,932 1,896 2 % 1... \n", - "1 Table of Contents\\nCONSOLIDATED OPERATING RESU... \n", - "2 Table of Contents\\nOPERATING SEGMENTS\\nAs disc... " - ], "text/html": [ "
\n", "