|
6 | 6 | "id": "LERqQn5v8-ak"
|
7 | 7 | },
|
8 | 8 | "source": [
|
9 |
| - "# **Getting to know Llama 2: Everything you need to start building**\n", |
10 |
| - "Our goal in this session is to provide a guided tour of Llama 2, including understanding different Llama 2 models, how and where to access them, Generative AI and Chatbot architectures, prompt engineering, RAG (Retrieval Augmented Generation), Fine-tuning and more. All this is implemented with a starter code for you to take it and use it in your Llama 2 projects." |
| 9 | + "# **Getting to know Llama 3: Everything you need to start building**\n", |
| 10 | + "Our goal in this session is to provide a guided tour of Llama 3, including understanding different Llama 3 models, how and where to access them, Generative AI and Chatbot architectures, prompt engineering, RAG (Retrieval Augmented Generation), Fine-tuning and more. All this is implemented with a starter code for you to take it and use it in your Llama 3 projects." |
11 | 11 | ]
|
12 | 12 | },
|
13 | 13 | {
|
|
58 | 58 | " A[Users] --> B(Applications e.g. mobile, web)\n",
|
59 | 59 | " B --> |Hosted API|C(Platforms e.g. Custom, OctoAI, HuggingFace, Replicate)\n",
|
60 | 60 | " B -- optional --> E(Frameworks e.g. LangChain)\n",
|
61 |
| - " C-->|User Input|D[Llama 2]\n", |
| 61 | + " C-->|User Input|D[Llama 3]\n", |
62 | 62 | " D-->|Model Output|C\n",
|
63 | 63 | " E --> C\n",
|
64 | 64 | " classDef default fill:#CCE6FF,stroke:#84BCF5,textColor:#1C2B33,fontFamily:trebuchet ms;\n",
|
|
69 | 69 | " flowchart TD\n",
|
70 | 70 | " A[User Prompts] --> B(Frameworks e.g. LangChain)\n",
|
71 | 71 | " B <--> |Database, Docs, XLS|C[fa:fa-database External Data]\n",
|
72 |
| - " B -->|API|D[Llama 2]\n", |
| 72 | + " B -->|API|D[Llama 3]\n", |
73 | 73 | " classDef default fill:#CCE6FF,stroke:#84BCF5,textColor:#1C2B33,fontFamily:trebuchet ms;\n",
|
74 | 74 | " \"\"\")\n",
|
75 | 75 | "\n",
|
76 |
| - "def llama2_family():\n", |
| 76 | + "def llama3_family():\n", |
77 | 77 | " mm(\"\"\"\n",
|
78 | 78 | " graph LR;\n",
|
79 |
| - " llama-2 --> llama-2-7b\n", |
80 |
| - " llama-2 --> llama-2-13b\n", |
81 |
| - " llama-2 --> llama-2-70b\n", |
82 |
| - " llama-2-7b --> llama-2-7b-chat\n", |
83 |
| - " llama-2-13b --> llama-2-13b-chat\n", |
84 |
| - " llama-2-70b --> llama-2-70b-chat\n", |
| 79 | + " llama-3 --> llama-3-8b-instruct\n", |
| 80 | + " llama-3 --> llama-3-70b-instruct\n", |
85 | 81 | " classDef default fill:#CCE6FF,stroke:#84BCF5,textColor:#1C2B33,fontFamily:trebuchet ms;\n",
|
86 | 82 | " \"\"\")\n",
|
87 | 83 | "\n",
|
|
91 | 87 | " users --> apps\n",
|
92 | 88 | " apps --> frameworks\n",
|
93 | 89 | " frameworks --> platforms\n",
|
94 |
| - " platforms --> Llama 2\n", |
| 90 | + " platforms --> Llama 3\n", |
95 | 91 | " classDef default fill:#CCE6FF,stroke:#84BCF5,textColor:#1C2B33,fontFamily:trebuchet ms;\n",
|
96 | 92 | " \"\"\")\n",
|
97 | 93 | "\n",
|
|
115 | 111 | " user --> prompt\n",
|
116 | 112 | " prompt --> i_safety\n",
|
117 | 113 | " i_safety --> context\n",
|
118 |
| - " context --> Llama_2\n", |
119 |
| - " Llama_2 --> output\n", |
| 114 | + " context --> Llama_3\n", |
| 115 | + " Llama_3 --> output\n", |
120 | 116 | " output --> o_safety\n",
|
121 | 117 | " i_safety --> memory\n",
|
122 | 118 | " o_safety --> memory\n",
|
|
165 | 161 | "id": "i4Np_l_KtIno"
|
166 | 162 | },
|
167 | 163 | "source": [
|
168 |
| - "##**1 - Understanding Llama 2**" |
| 164 | + "##**1 - Understanding Llama 3**" |
169 | 165 | ]
|
170 | 166 | },
|
171 | 167 | {
|
|
174 | 170 | "id": "PGPSI3M5PGTi"
|
175 | 171 | },
|
176 | 172 | "source": [
|
177 |
| - "### **1.1 - What is Llama 2?**\n", |
| 173 | + "### **1.1 - What is Llama 3?**\n", |
178 | 174 | "\n",
|
179 | 175 | "* State of the art (SOTA), Open Source LLM\n",
|
180 |
| - "* 7B, 13B, 70B\n", |
| 176 | + "* Llama 3 8B, 70B\n", |
181 | 177 | "* Pretrained + Chat\n",
|
182 | 178 | "* Choosing model: Size, Quality, Cost, Speed\n",
|
183 |
| - "* [Research paper](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/)\n", |
184 |
| - "\n", |
| 179 | + "* [Llama 3 blog](https://ai.meta.com/blog/meta-llama-3/)\n", |
185 | 180 | "* [Responsible use guide](https://ai.meta.com/llama/responsible-use-guide/)"
|
186 | 181 | ]
|
187 | 182 | },
|
|
208 | 203 | },
|
209 | 204 | "outputs": [],
|
210 | 205 | "source": [
|
211 |
| - "llama2_family()" |
| 206 | + "llama3_family()" |
212 | 207 | ]
|
213 | 208 | },
|
214 | 209 | {
|
|
217 | 212 | "id": "aYeHVVh45bdT"
|
218 | 213 | },
|
219 | 214 | "source": [
|
220 |
| - "###**1.2 - Accessing Llama 2**\n", |
| 215 | + "###**1.2 - Accessing Llama 3**\n", |
221 | 216 | "* Download + Self Host (on-premise)\n",
|
222 | 217 | "* Hosted API Platform (e.g. [OctoAI](https://octoai.cloud/), [Replicate](https://replicate.com/meta))\n",
|
223 |
| - "* Hosted Container Platform (e.g. [Azure](https://techcommunity.microsoft.com/t5/ai-machine-learning-blog/introducing-llama-2-on-azure/ba-p/3881233), [AWS](https://aws.amazon.com/blogs/machine-learning/llama-2-foundation-models-from-meta-are-now-available-in-amazon-sagemaker-jumpstart/), [GCP](https://console.cloud.google.com/vertex-ai/publishers/google/model-garden/139))\n", |
224 |
| - "\n" |
| 218 | + "* Hosted Container Platform (e.g. [Azure](https://techcommunity.microsoft.com/t5/ai-machine-learning-blog/introducing-llama-2-on-azure/ba-p/3881233), [AWS](https://aws.amazon.com/blogs/machine-learning/llama-2-foundation-models-from-meta-are-now-available-in-amazon-sagemaker-jumpstart/), [GCP](https://console.cloud.google.com/vertex-ai/publishers/google/model-garden/139))" |
225 | 219 | ]
|
226 | 220 | },
|
227 | 221 | {
|
|
230 | 224 | "id": "kBuSay8vtzL4"
|
231 | 225 | },
|
232 | 226 | "source": [
|
233 |
| - "### **1.3 - Use Cases of Llama 2**\n", |
| 227 | + "### **1.3 - Use Cases of Llama 3**\n", |
234 | 228 | "* Content Generation\n",
|
235 | 229 | "* Chatbots\n",
|
236 | 230 | "* Summarization\n",
|
|
245 | 239 | "id": "sd54g0OHuqBY"
|
246 | 240 | },
|
247 | 241 | "source": [
|
248 |
| - "##**2 - Using Llama 2**\n", |
| 242 | + "##**2 - Using Llama 3**\n", |
249 | 243 | "\n",
|
250 |
| - "In this notebook, we are going to access [Llama 13b chat model](https://octoai.cloud/tools/text/chat?mode=demo&model=llama-2-13b-chat-fp16) using hosted API from OctoAI." |
| 244 | + "In this notebook, we are going to access [Llama 3 8b instruct model](https://octoai.cloud/text/chat?model=meta-llama-3-8b-instruct&mode=api) using hosted API from OctoAI." |
251 | 245 | ]
|
252 | 246 | },
|
253 | 247 | {
|
|
269 | 263 | "source": [
|
270 | 264 | "# Install dependencies and initialize\n",
|
271 | 265 | "%pip install -qU \\\n",
|
272 |
| - " octoai-sdk \\\n", |
273 |
| - " langchain \\\n", |
| 266 | + " langchain==0.1.19 \\\n", |
| 267 | + " octoai-sdk==0.10.1 \\\n", |
| 268 | + " openai \\\n", |
274 | 269 | " sentence_transformers \\\n",
|
275 | 270 | " pdf2image \\\n",
|
276 | 271 | " pdfminer \\\n",
|
|
292 | 287 | "outputs": [],
|
293 | 288 | "source": [
|
294 | 289 | "# model on OctoAI platform that we will use for inferencing\n",
|
295 |
| - "# We will use llama 13b chat model hosted on OctoAI server ()\n", |
| 290 | + "# We will use llama 3 8b instruct model hosted on OctoAI server\n", |
296 | 291 | "\n",
|
297 |
| - "llama2_13b = \"llama-2-13b-chat-fp16\"" |
| 292 | + "llama3_8b = \"meta-llama-3-8b-instruct\"" |
298 | 293 | ]
|
299 | 294 | },
|
300 | 295 | {
|
|
326 | 321 | },
|
327 | 322 | "outputs": [],
|
328 | 323 | "source": [
|
329 |
| - "# we will use OctoAI's hosted API\n", |
330 |
| - "from octoai.client import Client\n", |
| 324 | + "# We will use OpenAI's APIs to talk to OctoAI's hosted model endpoint\n", |
| 325 | + "from openai import OpenAI\n", |
331 | 326 | "\n",
|
332 |
| - "client = Client(OCTOAI_API_TOKEN)\n", |
| 327 | + "client = OpenAI(\n", |
| 328 | + " base_url = \"https://text.octoai.run/v1\",\n", |
| 329 | + " api_key = os.environ[\"OCTOAI_API_TOKEN\"]\n", |
| 330 | + ")\n", |
333 | 331 | "\n",
|
334 | 332 | "# text completion with input prompt\n",
|
335 | 333 | "def Completion(prompt):\n",
|
336 | 334 | " output = client.chat.completions.create(\n",
|
337 | 335 | " messages=[\n",
|
338 |
| - " {\n", |
339 |
| - " \"role\": \"user\",\n", |
340 |
| - " \"content\": prompt\n", |
341 |
| - " }\n", |
| 336 | + " {\"role\": \"user\", \"content\": prompt}\n", |
342 | 337 | " ],\n",
|
343 |
| - " model=\"llama-2-13b-chat-fp16\",\n", |
| 338 | + " model=llama3_8b,\n", |
344 | 339 | " max_tokens=1000\n",
|
345 | 340 | " )\n",
|
346 | 341 | " return output.choices[0].message.content\n",
|
|
349 | 344 | "def ChatCompletion(prompt, system_prompt=None):\n",
|
350 | 345 | " output = client.chat.completions.create(\n",
|
351 | 346 | " messages=[\n",
|
352 |
| - " {\n", |
353 |
| - " \"role\": \"system\",\n", |
354 |
| - " \"content\": system_prompt\n", |
355 |
| - " },\n", |
356 |
| - " {\n", |
357 |
| - " \"role\": \"user\",\n", |
358 |
| - " \"content\": prompt\n", |
359 |
| - " }\n", |
| 347 | + " {\"role\": \"system\", \"content\": system_prompt},\n", |
| 348 | + " {\"role\": \"user\", \"content\": prompt}\n", |
360 | 349 | " ],\n",
|
361 |
| - " model=\"llama-2-13b-chat-fp16\",\n", |
| 350 | + " model=llama3_8b,\n", |
362 | 351 | " max_tokens=1000\n",
|
363 | 352 | " )\n",
|
364 | 353 | " return output.choices[0].message.content"
|
|
483 | 472 | "\n",
|
484 | 473 | "* User Prompts\n",
|
485 | 474 | "* Input Safety\n",
|
486 |
| - "* Llama 2\n", |
| 475 | + "* Llama 3\n", |
487 | 476 | "* Output Safety\n",
|
488 | 477 | "\n",
|
489 | 478 | "* Memory & Context"
|
|
743 | 732 | "### **4.3 - Retrieval Augmented Generation (RAG)**\n",
|
744 | 733 | "* Prompt Eng Limitations - Knowledge cutoff & lack of specialized data\n",
|
745 | 734 | "\n",
|
746 |
| - "* Retrieval Augmented Generation(RAG) allows us to retrieve snippets of information from external data sources and augment it to the user's prompt to get tailored responses from Llama 2.\n", |
747 |
| - "\n", |
748 |
| - "For our demo, we are going to download an external PDF file from a URL and query against the content in the pdf file to get contextually relevant information back with the help of Llama!\n", |
| 735 | + "* Retrieval Augmented Generation(RAG) allows us to retrieve snippets of information from external data sources and augment it to the user's prompt to get tailored responses from Llama 3.\n", |
749 | 736 | "\n",
|
750 |
| - "\n", |
751 |
| - "\n" |
| 737 | + "For our demo, we are going to download an external PDF file from a URL and query against the content in the pdf file to get contextually relevant information back with the help of Llama!" |
752 | 738 | ]
|
753 | 739 | },
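As a rough illustration of that flow (not the notebook's actual demo cells), the sketch below loads a PDF, splits it into chunks, embeds and indexes them, and then lets Llama 3 answer questions over the retrieved chunks. It assumes the `llama_model` LangChain endpoint configured a few cells below, a placeholder `<pdf_url>`, and that `faiss-cpu` (and `unstructured`, for the PDF loader) are installed in addition to the packages listed above.

```python
# Illustrative RAG sketch under the assumptions stated above -- not the
# notebook's exact demo cells.
from langchain.document_loaders import OnlinePDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA

docs = OnlinePDFLoader("<pdf_url>").load()                     # fetch the external PDF
chunks = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=100).split_documents(docs)  # split into chunks
db = FAISS.from_documents(chunks, HuggingFaceEmbeddings())     # embed + index the chunks
qa = RetrievalQA.from_chain_type(                              # retrieve + answer with Llama 3
    llm=llama_model, retriever=db.as_retriever())
print(qa.invoke({"query": "What is this document about?"})["result"])
```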
|
754 | 740 | {
|
|
797 | 783 | "source": [
|
798 | 784 | "# langchain setup\n",
|
799 | 785 | "from langchain.llms.octoai_endpoint import OctoAIEndpoint\n",
|
800 |
| - "# Use the Llama 2 model hosted on OctoAI\n", |
801 |
| - "# Temperature: Adjusts randomness of outputs, greater than 1 is random and 0 is deterministic, 0.75 is a good starting value\n", |
| 786 | + "\n", |
| 787 | + "# Use the Llama 3 model hosted on OctoAI\n", |
| 788 | + "# max_tokens: Maximum number of tokens to generate. A word is generally 2-3 tokens\n", |
| 789 | + "# temperature: Adjusts randomness of outputs, greater than 1 is random and 0 is deterministic, 0.75 is a good starting value\n", |
802 | 790 | "# top_p: When decoding text, samples from the top p percentage of most likely tokens; lower to ignore less likely tokens\n",
|
803 |
| - "# max_new_tokens: Maximum number of tokens to generate. A word is generally 2-3 tokens\n", |
804 | 791 | "llama_model = OctoAIEndpoint(\n",
|
805 |
| - " endpoint_url=\"https://text.octoai.run/v1/chat/completions\",\n", |
806 |
| - " model_kwargs={\n", |
807 |
| - " \"model\": llama2_13b,\n", |
808 |
| - " \"messages\": [\n", |
809 |
| - " {\n", |
810 |
| - " \"role\": \"system\",\n", |
811 |
| - " \"content\": \"You are a helpful, respectful and honest assistant.\"\n", |
812 |
| - " }\n", |
813 |
| - " ],\n", |
814 |
| - " \"max_tokens\": 1000,\n", |
815 |
| - " \"top_p\": 1,\n", |
816 |
| - " \"temperature\": 0.75\n", |
817 |
| - " },\n", |
| 792 | + " model=llama3_8b,\n", |
| 793 | + " max_tokens=1000,\n", |
| 794 | + " temperature=0.75,\n", |
| 795 | + " top_p=1\n", |
818 | 796 | ")"
|
819 | 797 | ]
|
820 | 798 | },
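Once configured, the endpoint behaves like any other LangChain LLM; a quick sanity check (an illustrative call, not necessarily a cell from the original notebook) might look like:

```python
# Quick sanity check of the LangChain endpoint configured above (illustrative).
print(llama_model.invoke("In one sentence, what is Llama 3?"))
```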
|
|
973 | 951 | },
|
974 | 952 | "source": [
|
975 | 953 | "#### **Resources**\n",
|
976 |
| - "- [GitHub - Llama 2](https://github.com/facebookresearch/llama)\n", |
977 |
| - "- [Github - LLama 2 Recipes](https://github.com/facebookresearch/llama-recipes)\n", |
978 |
| - "- [Llama 2](https://ai.meta.com/llama/)\n", |
979 |
| - "- [Research Paper](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/)\n", |
| 954 | + "- [GitHub - Llama](https://github.com/facebookresearch/llama)\n", |
| 955 | + "- [Github - LLama Recipes](https://github.com/facebookresearch/llama-recipes)\n", |
| 956 | + "- [Llama](https://ai.meta.com/llama/)\n", |
| 957 | + "- [Research Paper on Llama 2](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/)\n", |
| 958 | + "- [Llama 3 Page](https://ai.meta.com/blog/meta-llama-3/)\n", |
980 | 959 | "- [Model Card](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md)\n",
|
981 | 960 | "- [Responsible Use Guide](https://ai.meta.com/llama/responsible-use-guide/)\n",
|
982 | 961 | "- [Acceptable Use Policy](https://ai.meta.com/llama/use-policy/)\n",
|
|
992 | 971 | "source": [
|
993 | 972 | "#### **Authors & Contact**\n",
|
994 | 973 | " * [email protected], [Amit Sangani | LinkedIn](https://www.linkedin.com/in/amitsangani/)\n",
|
995 |
| - " * [email protected], [Mohsen Agsen | LinkedIn](https://www.linkedin.com/in/mohsen-agsen-62a9791/)\n", |
| 974 | + " * [email protected], [Mohsen Agsen | LinkedIn](https://www.linkedin.com/in/dr-thierry-moreau/)\n", |
996 | 975 | "\n",
|
997 |
| - "Adapted to run on OctoAI by Thierry Moreau - [email protected]" |
| 976 | + "Adapted to run on OctoAI and use Llama 3 by [email protected] [Thierry Moreay | LinkedIn]()" |
998 | 977 | ]
|
999 | 978 | }
|
1000 | 979 | ],
|
|