RAAE-326 (#32)

rbs333 · web-flow · commit 4f1f64ee5c84 · 2024-09-27T16:45:34.000-04:00
* update llama-notebook
diff --git a/python-recipes/RAG/03_llamaindex.ipynb b/python-recipes/RAG/03_llamaindex.ipynb
@@ -33,30 +33,15 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 2,
+      "execution_count": null,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/"
         },
         "id": "UQezgPCG1vml",
         "outputId": "97b9bc03-da1b-439a-c37b-be6fdb58ab21"
       },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Cloning into 'temp_repo'...\n",
-            "remote: Enumerating objects: 138, done.\u001b[K\n",
-            "remote: Counting objects: 100% (138/138), done.\u001b[K\n",
-            "remote: Compressing objects: 100% (98/98), done.\u001b[K\n",
-            "remote: Total 138 (delta 68), reused 91 (delta 35), pack-reused 0\u001b[K\n",
-            "Receiving objects: 100% (138/138), 7.19 MiB | 4.45 MiB/s, done.\n",
-            "Resolving deltas: 100% (68/68), done.\n",
-            "mv: rename temp_repo/resources to ./resources: Directory not empty\n"
-          ]
-        }
-      ],
+      "outputs": [],
       "source": [
         "# NBVAL_SKIP\n",
         "!git clone https://github.com/redis-developer/redis-ai-resources.git temp_repo\n",
@@ -200,7 +185,7 @@
           "name": "stdout",
           "output_type": "stream",
           "text": [
-            "Sample doc Doc ID: 67e07154-6ea0-4822-8957-ac1d212fc9ee\n",
+            "Sample doc Doc ID: c013353e-dae7-4d17-befd-9e784c8acf79\n",
             "Text: UNITED STATES SECURITIES AND EXCHANGE COMMISSION Washington,\n",
             "D.C. 20549 FORM 10-K (Mark One) ☒ ANNUAL  REPORT PURSUANT T O SECTION\n",
             "13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934 For the fiscal year\n",
@@ -245,13 +230,13 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 10,
+      "execution_count": 4,
       "metadata": {},
       "outputs": [],
       "source": [
         "from llama_index.core import StorageContext\n",
         "\n",
-        "vector_store = RedisVectorStore(redis_url=REDIS_URL, index_name=\"llama\", overwrite=True)\n",
+        "vector_store = RedisVectorStore(redis_url=REDIS_URL, overwrite=True)\n",
         "\n",
         "storage_context = StorageContext.from_defaults(vector_store=vector_store)\n",
         "\n",
@@ -267,7 +252,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 11,
+      "execution_count": 5,
       "metadata": {},
       "outputs": [],
       "source": [
@@ -285,14 +270,14 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 12,
+      "execution_count": 6,
       "metadata": {},
       "outputs": [
         {
           "name": "stdout",
           "output_type": "stream",
           "text": [
-            "Node ID: b561dd17-5545-4d3a-bc4f-18cb39c7c01e\n",
+            "Node ID: d2e6cd9c-0716-49d8-8563-407a00d05445\n",
             "Text: Table of Contents FISCAL 2023 NIKE BRAND REVENUE HIGHLIGHTS The\n",
             "following tables present NIKE Brand revenues disaggregated by\n",
             "reportable operating segment, distribution channel and major product\n",
@@ -301,14 +286,14 @@
             "fiscal 2022 on...\n",
             "Score:  0.900\n",
             "\n",
-            "Node ID: 0415f059-9258-426b-8b21-34b287b3c21b\n",
+            "Node ID: 28542d3b-b345-4e9e-b675-f62361ec85d9\n",
             "Text: Table of Contents NORTH AMERICA (Dollars in millions) FISCAL\n",
             "2023FISCAL 2022 % CHANGE% CHANGE EXCLUDING CURRENCY CHANGESFISCAL 2021\n",
             "% CHANGE% CHANGE EXCLUDING CURRENCY CHANGES Revenues by: Footwear $\n",
             "14,897 $ 12,228 22 % 22 %$ 11,644 5 % 5 % Apparel 5,947 5,492 8 % 9 %\n",
             "5,028 9 % 9 % Equipment 764 633 21 % 21 % 507 25 % 25 % TOTAL REVENUES\n",
             "$ 21,6...\n",
-            "Score:  0.886\n",
+            "Score:  0.885\n",
             "\n"
           ]
         }
@@ -329,7 +314,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 13,
+      "execution_count": 7,
       "metadata": {},
       "outputs": [
         {
@@ -338,7 +323,7 @@
               "\"NIKE's revenue in fiscal 23 was $51.2 billion.\""
             ]
           },
-          "execution_count": 13,
+          "execution_count": 7,
           "metadata": {},
           "output_type": "execute_result"
         }
@@ -348,6 +333,228 @@
         "response.response"
       ]
     },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "### Use a custom index schema\n",
+        "\n",
+        "In most use cases, you need the ability to customize the underlying index configuration\n",
+        "and specification. For example, this is handy for defining the specific metadata fields you wish to filter on.\n",
+        "\n",
+        "With Redis, this is as simple as defining an index schema object\n",
+        "(from file or dict) and passing it through to the vector store client wrapper."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 8,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "from redisvl.schema import IndexSchema\n",
+        "\n",
+        "\n",
+        "custom_schema = IndexSchema.from_dict(\n",
+        "    {\n",
+        "        # customize basic index specs\n",
+        "        \"index\": {\n",
+        "            \"name\": \"custom_index\",\n",
+        "            \"prefix\": \"docs\",\n",
+        "            \"key_separator\": \":\",\n",
+        "        },\n",
+        "        # customize fields that are indexed\n",
+        "        \"fields\": [\n",
+        "            # required fields for llamaindex\n",
+        "            {\"type\": \"tag\", \"name\": \"id\"},\n",
+        "            {\"type\": \"tag\", \"name\": \"doc_id\"},\n",
+        "            {\"type\": \"text\", \"name\": \"text\"},\n",
+        "            # custom metadata fields\n",
+        "            {\"type\": \"numeric\", \"name\": \"updated_at\"},\n",
+        "            {\"type\": \"tag\", \"name\": \"file_name\"},\n",
+        "            # custom vector field definition; dims must match the embedding model's output size\n",
+        "            {\n",
+        "                \"type\": \"vector\",\n",
+        "                \"name\": \"vector\",\n",
+        "                \"attrs\": {\n",
+        "                    \"dims\": 1536,\n",
+        "                    \"algorithm\": \"hnsw\",\n",
+        "                    \"distance_metric\": \"cosine\",\n",
+        "                },\n",
+        "            },\n",
+        "        ],\n",
+        "    }\n",
+        ")"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 9,
+      "metadata": {},
+      "outputs": [
+        {
+          "data": {
+            "text/plain": [
+              "IndexInfo(name='custom_index', prefix='docs', key_separator=':', storage_type=<StorageType.HASH: 'hash'>)"
+            ]
+          },
+          "execution_count": 9,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "custom_schema.index"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 10,
+      "metadata": {},
+      "outputs": [
+        {
+          "data": {
+            "text/plain": [
+              "{'id': TagField(name='id', type='tag', path=None, attrs=TagFieldAttributes(sortable=False, separator=',', case_sensitive=False, withsuffixtrie=False)),\n",
+              " 'doc_id': TagField(name='doc_id', type='tag', path=None, attrs=TagFieldAttributes(sortable=False, separator=',', case_sensitive=False, withsuffixtrie=False)),\n",
+              " 'text': TextField(name='text', type='text', path=None, attrs=TextFieldAttributes(sortable=False, weight=1, no_stem=False, withsuffixtrie=False, phonetic_matcher=None)),\n",
+              " 'updated_at': NumericField(name='updated_at', type='numeric', path=None, attrs=NumericFieldAttributes(sortable=False)),\n",
+              " 'file_name': TagField(name='file_name', type='tag', path=None, attrs=TagFieldAttributes(sortable=False, separator=',', case_sensitive=False, withsuffixtrie=False)),\n",
+              " 'vector': HNSWVectorField(name='vector', type='vector', path=None, attrs=HNSWVectorFieldAttributes(dims=1536, algorithm=<VectorIndexAlgorithm.HNSW: 'HNSW'>, datatype=<VectorDataType.FLOAT32: 'FLOAT32'>, distance_metric=<VectorDistanceMetric.COSINE: 'COSINE'>, initial_cap=None, m=16, ef_construction=200, ef_runtime=10, epsilon=0.01))}"
+            ]
+          },
+          "execution_count": 10,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "custom_schema.fields"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 11,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# from datetime import datetime\n",
+        "\n",
+        "\n",
+        "# def date_to_timestamp(date_string: str) -> int:\n",
+        "#     date_format: str = \"%Y-%m-%d\"\n",
+        "#     return int(datetime.strptime(date_string, date_format).timestamp())\n",
+        "\n",
+        "\n",
+        "# # iterate through documents and add new field\n",
+        "# for document in docs:\n",
+        "#     document.metadata[\"updated_at\"] = date_to_timestamp(\n",
+        "#         document.metadata[\"last_modified_date\"]\n",
+        "#     )"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 12,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "vector_store = RedisVectorStore(\n",
+        "    schema=custom_schema,  # provide customized schema\n",
+        "    redis_url=REDIS_URL,\n",
+        "    overwrite=True,\n",
+        ")\n",
+        "\n",
+        "storage_context = StorageContext.from_defaults(vector_store=vector_store)\n",
+        "\n",
+        "# build and load index from documents and storage context\n",
+        "index = VectorStoreIndex.from_documents(\n",
+        "    docs, storage_context=storage_context\n",
+        ")"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "### Query the vector store and filter on metadata\n",
+        "Now that we have additional metadata indexed in Redis, let's try some queries which add in filters. As an example, we'll search for chunks that contain the word \"audit\" and come from one specific file, \"amzn-10k-2023.pdf\"."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 13,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "from llama_index.core.vector_stores import (\n",
+        "    MetadataFilters,\n",
+        "    MetadataFilter,\n",
+        "    ExactMatchFilter,\n",
+        ")\n",
+        "\n",
+        "retriever = index.as_retriever(\n",
+        "    similarity_top_k=3,\n",
+        "    filters=MetadataFilters(\n",
+        "        filters=[\n",
+        "            ExactMatchFilter(key=\"file_name\", value=\"amzn-10k-2023.pdf\"),\n",
+        "            MetadataFilter(\n",
+        "                key=\"text\",\n",
+        "                value=\"audit\",\n",
+        "                operator=\"text_match\",\n",
+        "            ),\n",
+        "        ],\n",
+        "        condition=\"and\",\n",
+        "    ),\n",
+        ")"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 14,
+      "metadata": {},
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Node ID: cd0c5d8f-e3b1-4cbb-aa6a-5960003cdb2d\n",
+            "Text: Table of Contents valuation. In the ordinary course of our\n",
+            "business, there are many transactions and calculations for which the\n",
+            "ultimate tax determination is uncertain. Significant judgment is\n",
+            "required in evaluating and estimating our tax expense, assets, and\n",
+            "liabilities. We are also subject to tax controversies in various\n",
+            "jurisdictions that can...\n",
+            "Score:  0.746\n",
+            "\n",
+            "Node ID: 6745f668-4c7a-43bf-a9c3-9b04e1a497f8\n",
+            "Text: Table of Contents Included in other income (expense), net in\n",
+            "2021 and 2022 is a marketable equity securities valuation gain (loss)\n",
+            "of $11.8 billion and $(12.7) billion from our equity investment in\n",
+            "Rivian Automotive, Inc. (“Rivian”). Our investment in Rivian’s\n",
+            "preferred stock was accounted for at cost, with adjustments for\n",
+            "observable changes in ...\n",
+            "Score:  0.740\n",
+            "\n",
+            "Node ID: 717666fe-fea5-488b-999c-84e6d8b9a0db\n",
+            "Text: Exhibit 31.1 CERTIFICATIONS I, Andrew R. Jassy, certify that: 1.\n",
+            "I have reviewed this Form 10-K of Amazon.com, Inc.; 2. Based on my\n",
+            "knowledge, this report does not contain any untrue statement of a\n",
+            "material fact or omit to state a material fact necessary to make the\n",
+            "statements made, in light of the circumstances under which such\n",
+            "statements were ...\n",
+            "Score:  0.732\n",
+            "\n"
+          ]
+        }
+      ],
+      "source": [
+        "result_nodes = retriever.retrieve(\"What did the author learn?\")\n",
+        "\n",
+        "for node in result_nodes:\n",
+        "    print(node)"
+      ]
+    },
     {
       "cell_type": "code",
       "execution_count": null,
@@ -376,7 +583,7 @@
       "name": "python",
       "nbconvert_exporter": "python",
       "pygments_lexer": "ipython3",
-      "version": "3.9.12"
+      "version": "3.11.9"
     },
     "widgets": {
       "application/vnd.jupyter.widget-state+json": {