diff --git a/your-code/.gitignore b/your-code/.gitignore new file mode 100644 index 0000000..47e892c --- /dev/null +++ b/your-code/.gitignore @@ -0,0 +1,2 @@ +.env +chroma_db_LAB/ diff --git a/your-code/main.ipynb b/your-code/main.ipynb index e3a225a..4831b5e 100644 --- a/your-code/main.ipynb +++ b/your-code/main.ipynb @@ -59,27 +59,28 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ - "%pip install langchain langchain_community pypdf\n", - "%pip install termcolor langchain_openai langchain-huggingface sentence-transformers chromadb langchain_chroma tiktoken openai python-dotenv\n" + "#pip install langchain langchain_community pypdf\n", + "#pip install termcolor langchain_openai langchain-huggingface sentence-transformers chromadb langchain_chroma tiktoken openai python-dotenv\n", + "#pip install langchain_community pypdf\n", + "\n" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "6heKZkQUxYZr" - }, + "execution_count": 5, + "metadata": {}, "outputs": [], "source": [ - "import os\n", - "from langchain.document_loaders import PyPDFLoader\n", + "from langchain_community.document_loaders import PyPDFLoader\n", "from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter\n", + "import os\n", "import warnings\n", - "warnings.filterwarnings('ignore')\n" + "\n", + "warnings.filterwarnings('ignore')" ] }, { @@ -96,7 +97,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": { "id": "cuREtJRixYZt" }, @@ -104,7 +105,7 @@ "source": [ "# File path for the document\n", "\n", - "file_path = \"LAB/ai-for-everyone.pdf\"" + "file_path = \"../ai-for-everyone.pdf\"" ] }, { @@ -122,12 +123,23 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": { "id": "_b5Z_45UxYZu", "outputId": "a600d69f-14fe-4492-f236-97261d6ff36c" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "297" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Load and split the document\n", "loader = PyPDFLoader(file_path)\n", @@ -168,9 +180,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "1096" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "text_splitter = RecursiveCharacterTextSplitter(\n", " chunk_size=1000,\n", @@ -285,7 +308,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": { "id": "L0xDxElwxYZw" }, @@ -297,19 +320,30 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": { "id": "_WRIo3_0xYZx", "outputId": "78bfbbf3-9d25-4e31-bdbc-3e932e6bbfec" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "load_dotenv()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": { "id": "MNZfTng5xYZz", "outputId": "db1a7c85-ef9f-447e-92cd-9d097e959847" @@ -343,7 +377,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": { "id": "brKe6wUgxYZ0" }, @@ -354,12 +388,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": { "id": "VkjHR-RkxYZ0", "outputId": "bc11bda9-f283-457a-f584-5a06b95c4dd9" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ChromaDB created with document embeddings.\n" + ] + } + ], "source": [ "db = Chroma.from_documents(chunks, embeddings, persist_directory=\"./chroma_db_LAB\")\n", "print(\"ChromaDB created with document embeddings.\")" @@ -383,24 +425,77 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": { "id": "XiLv-TfrxYZ1" }, "outputs": [], "source": [ - "user_question = \"\" # User question\n", + "user_question = \"How to survive a zombie apocalypse?\" \n", "retrieved_docs = db.similarity_search(user_question, k=10) # k is the number of documents to retrieve" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "metadata": { "id": "qgWsh50JxYZ1", "outputId": "c8640c5d-5955-471f-fdd2-37096f5f68c7" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Document 1:\n", + "doi.org/10.1177/2056305118768303.\n", + "Mitchell, A. 2015. Posthumanist Post-Colonialism? Worldly (blog). 26 Feb -\n", + "ruary 2015. https://worldlyir.wordpress.com/2015/02/26/posthumanist \n", + "-postcolonialism\n", + "Nielsen, M. A. 2015. Neural Networks and Deep Learning. http://neural \n", + "networksanddeeplearning.com.\n", + "Nietzsche, F . 1998. Twilight of the Idols. Reissue edition. Oxford: Oxford Uni -\n", + "versity Press.\n", + "NoBodyIsDisposable. 2020. Open Letter to Care Providers and Hospitals. \n", + "March 2020. https://nobodyisdisposable.org/open-letter\n", + "Pazos, A. 2018. Ours to Master and to Own – We Visit Viome, Greece’s Only \n", + "Worker-Managed Factory. Jacobin, 10 June 2018. https://jacobinmag \n", + ".com/2018/10/viome-self-management-factory-takeover-greece.\n", + "Sanzo, K. 2018. New Materialism(s). Critical Posthumanism Network (blog). 25 \n", + "April 2018. http://criticalposthumanism.net/new-materialisms\n", + "Document 2:\n", + "doi.org/10.1177/2056305118768303.\n", + "Mitchell, A. 2015. Posthumanist Post-Colonialism? Worldly (blog). 26 Feb -\n", + "ruary 2015. https://worldlyir.wordpress.com/2015/02/26/posthumanist \n", + "-postcolonialism\n", + "Nielsen, M. A. 2015. Neural Networks and Deep Learning. http://neural \n", + "networksanddeeplearning.com.\n", + "Nietzsche, F . 1998. Twilight of the Idols. Reissue edition. Oxford: Oxford Uni -\n", + "versity Press.\n", + "NoBodyIsDisposable. 2020. Open Letter to Care Providers and Hospitals. \n", + "March 2020. https://nobodyisdisposable.org/open-letter\n", + "Pazos, A. 2018. Ours to Master and to Own – We Visit Viome, Greece’s Only \n", + "Worker-Managed Factory. Jacobin, 10 June 2018. https://jacobinmag \n", + ".com/2018/10/viome-self-management-factory-takeover-greece.\n", + "Sanzo, K. 2018. New Materialism(s). Critical Posthumanism Network (blog). 25 \n", + "April 2018. http://criticalposthumanism.net/new-materialisms\n", + "Document 3:\n", + " the planet, narratives that we can already \n", + "see emerging as a neoliberal and fascist reaction to the COVID-19 pandemic \n", + "and as Malthusian responses to climate change.\n", + "As a rule of thumb, we should examine every situation where AI is being \n", + "offered as a solution and ask how on-the-ground collective action might enable \n", + "a radical commoning of both risks and resources. Instead of a technocratic \n", + "solution to precarious labour, for example, that imposes some spurious metric \n", + "of fairness on a structure that embodies injustice, we look to a complete sociali-\n", + "sation of the relations and materialities involved. This happens, for example, \n", + "when workers react to layoffs by occupying their workplaces and transforming \n", + "material production in collaboration with the local community (Pazos 2018). \n", + "The only material-discursive politics consistent with a cosmopolitical care is a \n", + "radical commoning.\n" + ] + } + ], "source": [ "# Display top results\n", "for i, doc in enumerate(retrieved_docs[:3]): # Display top 3 results\n", @@ -418,7 +513,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": { "id": "2iB3lZqHxYZ2" }, @@ -434,12 +529,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "metadata": { "id": "2okzmuADxYZ2", "outputId": "0aa6cdca-188d-40e0-f5b4-8888d3549ea4" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Context formatted for GPT model.\n" + ] + } + ], "source": [ "# Generate a formatted context from the retrieved documents\n", "formatted_context = _get_document_prompt(retrieved_docs)\n", @@ -464,28 +567,84 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "metadata": { "id": "tqxVh9s3xYZ3", "outputId": "97cca95d-4ab3-44d8-a76c-5713aad387d8" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Prompt constructed.\n" + ] + } + ], "source": [ "prompt = f\"\"\"\n", - "\n", - "\n", - "\"\"\"\n" + "## SYSTEM ROLE\n", + "You are a knowledgeable and factual chatbot designed to assist with questions about **Apocalypse**, specifically focusing on **human survival**.\n", + "Your answers must be based exclusively on provided content from technical books provided.\n", + "\n", + "## USER QUESTION\n", + "The user has asked:\n", + "\"{user_question}\"\n", + "\n", + "## CONTEXT\n", + "Here is the relevant content from the technical books:\n", + "'''\n", + "{formatted_context}\n", + "'''\n", + "\n", + "## GUIDELINES\n", + "1. **Accuracy**:\n", + " - Only use the content in the `CONTEXT` section to answer.\n", + " - If the answer cannot be found, explicitly state: \"The provided context does not contain this information.\"\n", + " - Start explain how to survive a zombie apocalypse.\n", + " - How to find safe shelter during a zombie apocalypse.\n", + " - What essential supplies are needed for survival?\n", + " - Strategies for avoiding zombie encounters.\n", + " - Methods for securing food and water.\n", + " - Communication techniques with other survivors.\n", + " - Ways to maintain mental health during prolonged isolation.\n", + " \n", + "\n", + "2. **Transparency**:\n", + " - Reference the book's name and page numbers when providing information.\n", + " - Do not speculate or provide opinions.\n", + "\n", + "3. **Clarity**:\n", + " - Use simple, professional, and concise language.\n", + " - Format your response in Markdown for readability.\n", + "\n", + "## TASK\n", + "1. Answer the user's question **directly** if possible.\n", + "2. Point the user to relevant parts of the documentation.\n", + "3. Provide the response in the following format:\n", + "\n", + "## RESPONSE FORMAT\n", + "'''\n", + "# [Brief Title of the Answer]\n", + "[Answer in simple, clear text.]\n", + "\n", + "**Source**:\n", + "• [Book Title], Page(s): [...]\n", + "'''\n", + "\"\"\"\n", + "print(\"Prompt constructed.\")" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "metadata": { "id": "0mjkQJ_ZxYZ3" }, "outputs": [], "source": [ - "import openai" + "from openai import OpenAI\n", + "client = OpenAI()\n" ] }, { @@ -497,21 +656,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "metadata": { "id": "ylypRWRlxYZ4" }, "outputs": [], "source": [ "# Set up GPT client and parameters\n", - "client = openai.OpenAI()\n", "model_params = {\n", " 'model': 'gpt-4o',\n", - " 'temperature': , # Increase creativity\n", - " 'max_tokens': , # Allow for longer responses\n", - " 'top_p': , # Use nucleus sampling\n", - " 'frequency_penalty': , # Reduce repetition\n", - " 'presence_penalty': # Encourage new topics\n", + " 'temperature': 0.7 , # Increase creativity\n", + " 'max_tokens': 4000, # Allow for longer responses\n", + " 'top_p': 0.9, # Use nucleus sampling\n", + " 'frequency_penalty': 0.5, # Reduce repetition\n", + " 'presence_penalty': 0.6 # Encourage new topics\n", "}" ] }, @@ -526,7 +684,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "metadata": { "id": "4eXZO4pIxYZ4" }, @@ -538,12 +696,27 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "metadata": { "id": "wLPAcchBxYZ5", "outputId": "976c7800-16ed-41fe-c4cf-58f60d3230d2" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "'''\n", + "# Surviving a Zombie Apocalypse\n", + "\n", + "The provided context does not contain specific information about surviving a zombie apocalypse, including strategies for finding safe shelter, essential supplies needed, avoiding encounters, securing food and water, communicating with other survivors, or maintaining mental health during prolonged isolation.\n", + "\n", + "**Source**:\n", + "• The provided context does not include this information.\n", + "'''\n" + ] + } + ], "source": [ "answer = completion.choices[0].message.content\n", "print(answer)" @@ -595,7 +768,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "metadata": { "id": "nCXL9Cz1xYaV" }, @@ -615,7 +788,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, "metadata": { "id": "9y3E0YWExYaV" }, @@ -641,9 +814,21 @@ "id": "i7SkWPpnxYaW", "outputId": "28e82563-edba-4b41-acad-ec27e5ba134f" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Snippet 1:\n", + "Media + Society 4 (2). DOI: https://doi.org/10.1177/2056305118768303.\n", + "Mitchell, A. 2015. Posthumanist Post-Colonialism? Worldly (blog). 26 Feb -\n", + "ruary 2015. https://worldlyir.wordpress.com/2015/02/26/\n", + "--------------------------------------------------------------------------------\n" + ] + } + ], "source": [ - "query_keywords = [] # add your keywords\n", + "query_keywords = [\"zombie\", \"apocalypse\", \"survival\"] # add your keywords\n", "for i, doc in enumerate(retrieved_docs[:1]):\n", " snippet = doc.page_content[:200]\n", " highlighted = highlight_keywords(snippet, query_keywords)\n", @@ -687,7 +872,7 @@ "provenance": [] }, "kernelspec": { - "display_name": "llm", + "display_name": "nlp", "language": "python", "name": "python3" }, @@ -701,7 +886,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.10" + "version": "3.11.14" } }, "nbformat": 4,