Add documents listing to the RAG chatbot's streamed output

mrmer1 · mrmer1 · commit 676590dca210 · 2024-01-28T16:06:14.000+08:00
diff --git a/notebooks/RAG_Chatbot_with_Chat_Embed_Rerank.ipynb b/notebooks/RAG_Chatbot_with_Chat_Embed_Rerank.ipynb
@@ -69,27 +69,65 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 3,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "  <style>\n",
+       "    pre {\n",
+       "        white-space: pre-wrap;\n",
+       "    }\n",
+       "  </style>\n",
+       "  "
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
    "source": [
     "import cohere\n",
-    "import os\n",
     "import hnswlib\n",
     "import json\n",
     "import uuid\n",
     "from typing import List, Dict\n",
     "from unstructured.partition.html import partition_html\n",
     "from unstructured.chunking.title import chunk_by_title\n",
     "\n",
+    "import os\n",
     "co = cohere.Client(os.environ[\"COHERE_API_KEY\"])"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 4,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "  <style>\n",
+       "    pre {\n",
+       "        white-space: pre-wrap;\n",
+       "    }\n",
+       "  </style>\n",
+       "  "
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
    "source": [
     "#@title Enable text wrapping in Google colab\n",
     "\n",
@@ -116,9 +154,27 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "  <style>\n",
+       "    pre {\n",
+       "        white-space: pre-wrap;\n",
+       "    }\n",
+       "  </style>\n",
+       "  "
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
     {
      "data": {
       "text/html": [
@@ -274,9 +330,27 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 28,
    "metadata": {},
    "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "  <style>\n",
+       "    pre {\n",
+       "        white-space: pre-wrap;\n",
+       "    }\n",
+       "  </style>\n",
+       "  "
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
     {
      "data": {
       "text/html": [
@@ -349,6 +423,7 @@
     "            )\n",
     "            for event in response:\n",
     "                yield event\n",
+    "            yield response\n",
     "\n",
     "        # If there is no search query, directly respond\n",
     "        else:\n",
@@ -400,9 +475,27 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 37,
    "metadata": {},
    "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "  <style>\n",
+       "    pre {\n",
+       "        white-space: pre-wrap;\n",
+       "    }\n",
+       "  </style>\n",
+       "  "
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
     {
      "data": {
       "text/html": [
@@ -455,18 +548,27 @@
     "\n",
     "            # Print the chatbot response\n",
     "            print(\"Chatbot:\")\n",
-    "            flag = False\n",
+    "            \n",
+    "            citations_flag = False\n",
+    "            \n",
     "            for event in response:\n",
-    "                # Text\n",
-    "                if event.event_type == \"text-generation\":\n",
+    "                stream_type = type(event).__name__\n",
+    "                # Text\n",
+    "                if stream_type == \"StreamTextGeneration\":\n",
     "                    print(event.text, end=\"\")\n",
     "\n",
     "                # Citations\n",
-    "                if event.event_type == \"citation-generation\":\n",
-    "                    if not flag:\n",
+    "                if stream_type == \"StreamCitationGeneration\":\n",
+    "                    if not citations_flag:\n",
     "                        print(\"\\n\\nCITATIONS:\")\n",
-    "                        flag = True\n",
-    "                    print(event.citations)\n",
+    "                        citations_flag = True\n",
+    "                    print(event.citations[0])\n",
+    "                    \n",
+    "                if citations_flag:\n",
+    "                    if stream_type == \"StreamingChat\":\n",
+    "                        print(\"\\n\\nDOCUMENTS:\")\n",
+    "                        for document in event.documents:\n",
+    "                            print(document)\n",
     "\n",
     "            print(f\"\\n{'-'*100}\\n\")"
    ]
@@ -481,9 +583,27 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "  <style>\n",
+       "    pre {\n",
+       "        white-space: pre-wrap;\n",
+       "    }\n",
+       "  </style>\n",
+       "  "
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
     {
      "data": {
       "text/html": [
@@ -534,9 +654,27 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "  <style>\n",
+       "    pre {\n",
+       "        white-space: pre-wrap;\n",
+       "    }\n",
+       "  </style>\n",
+       "  "
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
     {
      "data": {
       "text/html": [
@@ -581,9 +719,27 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 38,
    "metadata": {},
    "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "  <style>\n",
+       "    pre {\n",
+       "        white-space: pre-wrap;\n",
+       "    }\n",
+       "  </style>\n",
+       "  "
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
     {
      "data": {
       "text/html": [
@@ -606,68 +762,28 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "User: Hi there\n",
-      "Chatbot:\n",
-      "Hi there! I'm Coral, an AI-assistant chatbot trained to assist human users by providing thorough responses. Is there anything I can help you with today?\n",
-      "----------------------------------------------------------------------------------------------------\n",
-      "\n",
-      "User: What are text embeddings\n",
-      "Chatbot:\n",
-      "Retrieving information...\n",
-      "Text embeddings are used to convert words into numbers, creating a vector of numerical data for every piece of text. If the vectors for two pieces of text are similar, this means that the corresponding pieces of text are similar too, and vice versa. Text embeddings are particularly useful for tasks like machine translation and searching for text in different languages. \n",
-      "\n",
-      "Would you like me to go into more detail about any of the information mentioned above?\n",
-      "\n",
-      "CITATIONS:\n",
-      "[{'start': 28, 'end': 54, 'text': 'convert words into numbers', 'document_ids': ['doc_1']}]\n",
-      "[{'start': 67, 'end': 116, 'text': 'vector of numerical data for every piece of text.', 'document_ids': ['doc_1']}]\n",
-      "[{'start': 136, 'end': 166, 'text': 'two pieces of text are similar', 'document_ids': ['doc_1']}]\n",
-      "[{'start': 188, 'end': 249, 'text': 'corresponding pieces of text are similar too, and vice versa.', 'document_ids': ['doc_1']}]\n",
-      "[{'start': 305, 'end': 324, 'text': 'machine translation', 'document_ids': ['doc_2']}]\n",
-      "[{'start': 329, 'end': 371, 'text': 'searching for text in different languages.', 'document_ids': ['doc_2']}]\n",
-      "\n",
-      "----------------------------------------------------------------------------------------------------\n",
-      "\n",
-      "User: What are they useful for\n",
+      "User: embedding\n",
       "Chatbot:\n",
       "Retrieving information...\n",
-      "Text embeddings are useful for a variety of tasks, including:\n",
-      "\n",
-      "- Machine learning - for example, transformer models can be used to write stories, essays and poems, as well as answer questions and chat with humans.\n",
-      "- Machine translation - text embeddings can translate text from one language to another.\n",
-      "- Searching for text - text embeddings can be used to find text in any language. \n",
-      "\n",
-      "Would you like me to go into more detail about any of the tasks text embeddings are useful for?\n",
+      "Embedding is a way to locate each sentence in space, in a way that similar sentences are located close by. It associates each sentence with a particular list of numbers (a vector). Word and sentence embeddings are the bread and butter of language models. In Chapter 9 of the Cohere documentation, there is an example of trying to fit a French sentence (\"Bonjour, comment ça va?\") into an embedding and the struggle it will have to understand that it should be close to the English sentence (\"Hello, how are you?\"). Cohere has trained a large multilingual model to unify many languages into one and be able to understand text in all those languages.\n",
       "\n",
       "CITATIONS:\n",
-      "[{'start': 65, 'end': 81, 'text': 'Machine learning', 'document_ids': ['doc_1']}]\n",
-      "[{'start': 97, 'end': 115, 'text': 'transformer models', 'document_ids': ['doc_0', 'doc_1']}]\n",
-      "[{'start': 131, 'end': 144, 'text': 'write stories', 'document_ids': ['doc_0', 'doc_1']}]\n",
-      "[{'start': 146, 'end': 152, 'text': 'essays', 'document_ids': ['doc_1']}]\n",
-      "[{'start': 157, 'end': 162, 'text': 'poems', 'document_ids': ['doc_0', 'doc_1']}]\n",
-      "[{'start': 175, 'end': 191, 'text': 'answer questions', 'document_ids': ['doc_1']}]\n",
-      "[{'start': 196, 'end': 213, 'text': 'chat with humans.', 'document_ids': ['doc_1']}]\n",
-      "[{'start': 216, 'end': 235, 'text': 'Machine translation', 'document_ids': ['doc_1']}]\n",
-      "[{'start': 258, 'end': 302, 'text': 'translate text from one language to another.', 'document_ids': ['doc_1']}]\n",
-      "[{'start': 305, 'end': 323, 'text': 'Searching for text', 'document_ids': ['doc_1']}]\n",
-      "[{'start': 370, 'end': 383, 'text': 'any language.', 'document_ids': ['doc_1']}]\n",
+      "{'start': 22, 'end': 51, 'text': 'locate each sentence in space', 'document_ids': ['doc_1']}\n",
+      "{'start': 67, 'end': 106, 'text': 'similar sentences are located close by.', 'document_ids': ['doc_1']}\n",
+      "{'start': 110, 'end': 168, 'text': 'associates each sentence with a particular list of numbers', 'document_ids': ['doc_1']}\n",
+      "{'start': 169, 'end': 179, 'text': '(a vector)', 'document_ids': ['doc_1']}\n",
+      "{'start': 181, 'end': 254, 'text': 'Word and sentence embeddings are the bread and butter of language models.', 'document_ids': ['doc_2']}\n",
+      "{'start': 258, 'end': 295, 'text': 'Chapter 9 of the Cohere documentation', 'document_ids': ['doc_0']}\n",
+      "{'start': 320, 'end': 351, 'text': 'trying to fit a French sentence', 'document_ids': ['doc_0']}\n",
+      "{'start': 402, 'end': 489, 'text': 'the struggle it will have to understand that it should be close to the English sentence', 'document_ids': ['doc_0']}\n",
+      "{'start': 515, 'end': 593, 'text': 'Cohere has trained a large multilingual model to unify many languages into one', 'document_ids': ['doc_0']}\n",
+      "{'start': 598, 'end': 648, 'text': 'be able to understand text in all those languages.', 'document_ids': ['doc_0']}\n",
       "\n",
-      "----------------------------------------------------------------------------------------------------\n",
       "\n",
-      "User: How do you generate them\n",
-      "Chatbot:\n",
-      "Retrieving information...\n",
-      "Text embeddings are generated by training transformer models on large datasets, such as the entire internet or large datasets of conversations. Post-training helps improve the model's performance on specific tasks by focusing on datasets corresponding to questions and answers or conversations. \n",
-      "\n",
-      "Would you like me to go into more detail about transformer models?\n",
-      "\n",
-      "CITATIONS:\n",
-      "[{'start': 42, 'end': 60, 'text': 'transformer models', 'document_ids': ['doc_0', 'doc_1', 'doc_2']}]\n",
-      "[{'start': 92, 'end': 107, 'text': 'entire internet', 'document_ids': ['doc_0']}]\n",
-      "[{'start': 129, 'end': 143, 'text': 'conversations.', 'document_ids': ['doc_1']}]\n",
-      "[{'start': 144, 'end': 195, 'text': \"Post-training helps improve the model's performance\", 'document_ids': ['doc_0', 'doc_1']}]\n",
-      "[{'start': 255, 'end': 276, 'text': 'questions and answers', 'document_ids': ['doc_0']}]\n",
-      "[{'start': 280, 'end': 294, 'text': 'conversations.', 'document_ids': ['doc_1']}]\n",
+      "DOCUMENTS:\n",
+      "{'id': 'doc_1', 'text': 'In the previous chapter, we learned that sentence embeddings are the bread and butter of language models, as they associate each sentence with a particular list of numbers (a vector), in a way that similar sentences give similar vectors. We can think of embeddings as a way to locate each sentence in space (a high dimensional space, but a space nonetheless), in a way that similar sentences are located close by. Once we have each sentence somewhere in space, it’s natural to think of distances betw', 'title': 'Similarity Between Words and Sentences', 'url': 'https://docs.cohere.com/docs/similarity-between-words-and-sentences'}\n",
+      "{'id': 'doc_2', 'text': 'Text Embeddings\\n\\nWord and sentence embeddings are the bread and butter of language models. This chapter shows a very simple introduction to what they are.', 'title': 'Text Embeddings', 'url': 'https://docs.cohere.com/docs/text-embeddings'}\n",
+      "{'id': 'doc_0', 'text': 'Most word and sentence embeddings are dependent on the language that the model is trained on. If you were to try to fit the French sentence “Bonjour, comment ça va?” (meaning: hello, how are you?) in the embedding from the previous section, it will struggle to understand that it should be close to the sentence “Hello, how are you?” in English. For the purpose of unifying many languages into one, and being able to understand text in all these languages, Cohere has trained a large multilingual mod', 'title': 'Text Embeddings', 'url': 'https://docs.cohere.com/docs/text-embeddings'}\n",
       "\n",
       "----------------------------------------------------------------------------------------------------\n",
       "\n",
@@ -683,7 +799,7 @@
     "app = App(chatbot)\n",
     "\n",
     "# Run the chatbot\n",
-    "app.run()"
+    "app.run()\n"
    ]
   },
   {