Truncate document text in the printed documents listing for brevity

mrmer1 · mrmer1 · commit 781b2e1f4bb3 · 2024-01-30T15:29:29.000+08:00
diff --git a/notebooks/RAG_Chatbot_with_Chat_Embed_Rerank.ipynb b/notebooks/RAG_Chatbot_with_Chat_Embed_Rerank.ipynb
@@ -69,28 +69,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 1,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "\n",
-       "  <style>\n",
-       "    pre {\n",
-       "        white-space: pre-wrap;\n",
-       "    }\n",
-       "  </style>\n",
-       "  "
-      ],
-      "text/plain": [
-       "<IPython.core.display.HTML object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
+   "outputs": [],
    "source": [
     "import cohere\n",
     "import hnswlib\n",
@@ -100,33 +81,16 @@
     "from unstructured.partition.html import partition_html\n",
     "from unstructured.chunking.title import chunk_by_title\n",
     "\n",
-    "co = cohere.Client(\"COHERE_API_KEY\")"
+    "# co = cohere.Client(\"COHERE_API_KEY\")\n",
+    "import os\n",
+    "co = cohere.Client(os.getenv(\"COHERE_API_KEY\"))"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 2,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "\n",
-       "  <style>\n",
-       "    pre {\n",
-       "        white-space: pre-wrap;\n",
-       "    }\n",
-       "  </style>\n",
-       "  "
-      ],
-      "text/plain": [
-       "<IPython.core.display.HTML object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
+   "outputs": [],
    "source": [
     "#@title Enable text wrapping in Google colab\n",
     "\n",
@@ -153,27 +117,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "\n",
-       "  <style>\n",
-       "    pre {\n",
-       "        white-space: pre-wrap;\n",
-       "    }\n",
-       "  </style>\n",
-       "  "
-      ],
-      "text/plain": [
-       "<IPython.core.display.HTML object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
     {
      "data": {
       "text/html": [
@@ -329,27 +275,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 53,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "\n",
-       "  <style>\n",
-       "    pre {\n",
-       "        white-space: pre-wrap;\n",
-       "    }\n",
-       "  </style>\n",
-       "  "
-      ],
-      "text/plain": [
-       "<IPython.core.display.HTML object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
     {
      "data": {
       "text/html": [
@@ -459,12 +387,6 @@
     "        for query in queries:\n",
     "            retrieved_docs.extend(self.docs.retrieve(query))\n",
     "\n",
-    "        # # Uncomment this code block to display the chatbot's retrieved documents\n",
-    "        # print(\"DOCUMENTS RETRIEVED:\")\n",
-    "        # for idx, doc in enumerate(retrieved_docs):\n",
-    "        #     print(f\"doc_{idx}: {doc}\")\n",
-    "        # print(\"\\n\")\n",
-    "\n",
     "        return retrieved_docs"
    ]
   },
@@ -478,27 +400,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 50,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "\n",
-       "  <style>\n",
-       "    pre {\n",
-       "        white-space: pre-wrap;\n",
-       "    }\n",
-       "  </style>\n",
-       "  "
-      ],
-      "text/plain": [
-       "<IPython.core.display.HTML object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
     {
      "data": {
       "text/html": [
@@ -570,8 +474,10 @@
     "                if citations_flag:\n",
     "                    if stream_type == \"StreamingChat\":\n",
     "                        print(\"\\n\\nDOCUMENTS:\")\n",
-    "                        for document in event.documents:\n",
-    "                            print(document)\n",
+    "                        documents = [{'id': doc['id'], 'text': doc['text'][:50] + '...', 'title': doc['title'], 'url': doc['url']} \n",
+    "                                     for doc in event.documents]\n",
+    "                        for doc in documents:\n",
+    "                            print(doc)\n",
     "\n",
     "            print(f\"\\n{'-'*100}\\n\")"
    ]
@@ -586,27 +492,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "\n",
-       "  <style>\n",
-       "    pre {\n",
-       "        white-space: pre-wrap;\n",
-       "    }\n",
-       "  </style>\n",
-       "  "
-      ],
-      "text/plain": [
-       "<IPython.core.display.HTML object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
     {
      "data": {
       "text/html": [
@@ -657,27 +545,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "\n",
-       "  <style>\n",
-       "    pre {\n",
-       "        white-space: pre-wrap;\n",
-       "    }\n",
-       "  </style>\n",
-       "  "
-      ],
-      "text/plain": [
-       "<IPython.core.display.HTML object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
     {
      "data": {
       "text/html": [
@@ -722,27 +592,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 56,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "\n",
-       "  <style>\n",
-       "    pre {\n",
-       "        white-space: pre-wrap;\n",
-       "    }\n",
-       "  </style>\n",
-       "  "
-      ],
-      "text/plain": [
-       "<IPython.core.display.HTML object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
     {
      "data": {
       "text/html": [
@@ -765,56 +617,36 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "User: hello\n",
+      "User: Hello\n",
       "Chatbot:\n",
-      "Hello to you as well! How can I help you today?\n",
+      "Hello there, how can I assist you today?\n",
       "\n",
-      "Let me know if there's something specific you would like to discuss or any questions you have and I'll do my best to assist you.\n",
+      "Please provide some further information or give me a specific question, and I'll do my best to help you out!\n",
       "----------------------------------------------------------------------------------------------------\n",
       "\n",
-      "User: what is the difference between word and sentence embeddings\n",
+      "User: What is the difference between word and sentence embeddings\n",
       "Chatbot:\n",
       "Retrieving information...\n",
-      "Word embeddings and sentence embeddings are the fundamental components of LLMs and transform language into computer-readable numbers. \n",
+      "Word embeddings and sentence embeddings are the fundamental components of LLMs and convert language (words) into computer speak (numbers) in a way that preserves the relationships between words, semantics, and linguistic nuances into numerical equations. \n",
       "\n",
-      "Word embeddings associate words with lists of numbers (vectors) in a way that similar words are grouped close together. Sentence embeddings do the same thing but for sentences. It associates vectors to every sentence.\n",
+      "Word embeddings associate words with lists of numbers (vectors) in a way that groups similar words together. Sentence embeddings do the same thing but for sentences.\n",
       "\n",
       "CITATIONS:\n",
       "{'start': 0, 'end': 15, 'text': 'Word embeddings', 'document_ids': ['doc_0', 'doc_1', 'doc_2']}\n",
       "{'start': 20, 'end': 39, 'text': 'sentence embeddings', 'document_ids': ['doc_0', 'doc_1', 'doc_2']}\n",
       "{'start': 48, 'end': 78, 'text': 'fundamental components of LLMs', 'document_ids': ['doc_2']}\n",
-      "{'start': 83, 'end': 133, 'text': 'transform language into computer-readable numbers.', 'document_ids': ['doc_2']}\n",
-      "{'start': 136, 'end': 255, 'text': 'Word embeddings associate words with lists of numbers (vectors) in a way that similar words are grouped close together.', 'document_ids': ['doc_0']}\n",
-      "{'start': 256, 'end': 312, 'text': 'Sentence embeddings do the same thing but for sentences.', 'document_ids': ['doc_0', 'doc_1']}\n",
-      "{'start': 316, 'end': 353, 'text': 'associates vectors to every sentence.', 'document_ids': ['doc_0', 'doc_1']}\n",
+      "{'start': 83, 'end': 137, 'text': 'convert language (words) into computer speak (numbers)', 'document_ids': ['doc_2']}\n",
+      "{'start': 152, 'end': 228, 'text': 'preserves the relationships between words, semantics, and linguistic nuances', 'document_ids': ['doc_2']}\n",
+      "{'start': 234, 'end': 254, 'text': 'numerical equations.', 'document_ids': ['doc_2']}\n",
+      "{'start': 257, 'end': 365, 'text': 'Word embeddings associate words with lists of numbers (vectors) in a way that groups similar words together.', 'document_ids': ['doc_0', 'doc_1']}\n",
+      "{'start': 366, 'end': 422, 'text': 'Sentence embeddings do the same thing but for sentences.', 'document_ids': ['doc_0', 'doc_1']}\n",
       "\n",
       "\n",
       "DOCUMENTS:\n",
-      "{'id': 'doc_0', 'text': 'In the previous chapters, you learned about word and sentence embeddings and similarity between words and sentences. In short, a word embedding is a way to associate words with lists of numbers (vectors) in such a way that similar words are associated with numbers that are close by, and dissimilar words with numbers that are far away from each other. A sentence embedding does the same thing, but associating a vector to every sentence. Similarity is a way to measure how similar two words (or sent', 'title': 'The Attention Mechanism', 'url': 'https://docs.cohere.com/docs/the-attention-mechanism'}\n",
-      "{'id': 'doc_1', 'text': 'This is where sentence embeddings come into play. A sentence embedding is just like a word embedding, except it associates every sentence with a vector full of numbers, in a coherent way. By coherent, I mean that it satisfies similar properties as a word embedding. For instance, similar sentences are assigned to similar vectors, different sentences are assigned to different vectors, and most importantly, each of the coordinates of the vector identifies some (whether clear or obscure) property of', 'title': 'Text Embeddings', 'url': 'https://docs.cohere.com/docs/text-embeddings'}\n",
-      "{'id': 'doc_2', 'text': 'Conclusion\\n\\nWord and sentence embeddings are the bread and butter of LLMs. They are the basic building block of most language models, since they translate human speak (words) into computer speak (numbers) in a way that captures many relations between words, semantics, and nuances of the language, into equations regarding the corresponding numbers.', 'title': 'Text Embeddings', 'url': 'https://docs.cohere.com/docs/text-embeddings'}\n",
-      "\n",
-      "----------------------------------------------------------------------------------------------------\n",
-      "\n",
-      "User: continue\n",
-      "Chatbot:\n",
-      "In some models, the same underlying architecture is used for both word and sentence embeddings. Word embeddings generate vector representations for individual words, while sentence embeddings generate vector representations for entire sentences or phrases.\n",
-      "\n",
-      "Here's a simple analogy: \n",
-      "\n",
-      "Word embeddings are like individuals creating unique fingerprints, identifying unique characteristics. Sentence embeddings are like creating a unique fingerprint for each sentence or phrase. \n",
-      "\n",
-      "Although the concept is similar, the processes are different as word embeddings focus on individual words, while sentence embeddings focus on the entire sentence and capture the overall meaning or context.\n",
-      "----------------------------------------------------------------------------------------------------\n",
-      "\n",
-      "User: what do you know about graph neural networks\n",
-      "Chatbot:\n",
-      "Retrieving information...\n",
-      "I cannot find any specific information on Graph Neural Networks, however, I can provide some information on Transformer Models which are another type of neural network. \n",
-      "\n",
-      " Transformer models are a type of neural network that utilizes attention mechanisms to parse input sequences into multiple layers, with each layer assigning attention weights to the previous layer. Introduced in the paper \"Attention is All You Need\", they have become one of the key components in many NLP applications and are highly effective due to their ability to handle long-range dependencies and capture contextual information.\n",
+      "{'id': 'doc_0', 'text': 'In the previous chapters, you learned about word a...', 'title': 'The Attention Mechanism', 'url': 'https://docs.cohere.com/docs/the-attention-mechanism'}\n",
+      "{'id': 'doc_1', 'text': 'This is where sentence embeddings come into play. ...', 'title': 'Text Embeddings', 'url': 'https://docs.cohere.com/docs/text-embeddings'}\n",
+      "{'id': 'doc_2', 'text': 'Conclusion\\n\\nWord and sentence embeddings are the b...', 'title': 'Text Embeddings', 'url': 'https://docs.cohere.com/docs/text-embeddings'}\n",
       "\n",
-      "Would you like me to provide more information on Transformer Models or explain other types of neural networks?\n",
       "----------------------------------------------------------------------------------------------------\n",
       "\n",
       "Ending chat.\n"