
Commit 935f66f

Merge branch 'main' into inference_changes
2 parents 00e0b0b + b15bad9 commit 935f66f

29 files changed: +1105 −64 lines changed


.github/scripts/spellcheck_conf/wordlist.txt

Lines changed: 3 additions & 1 deletion
@@ -1412,4 +1412,6 @@ QLoRA
 ntasks
 srun
 xH
-unquantized
+unquantized
+eom
+ipython

README.md

Lines changed: 15 additions & 9 deletions
@@ -4,29 +4,35 @@ The 'llama-recipes' repository is a companion to the [Meta Llama 3](https://gith

 <!-- markdown-link-check-enable -->
 > [!IMPORTANT]
-> Meta Llama 3 has a new prompt template and special tokens (based on the tiktoken tokenizer).
+> Meta Llama 3.1 has a new prompt template and special tokens.
 > | Token | Description |
 > |---|---|
-> `<\|begin_of_text\|>` | This is equivalent to the BOS token. |
-> `<\|end_of_text\|>` | This is equivalent to the EOS token. For multiturn-conversations it's usually unused. Instead, every message is terminated with `<\|eot_id\|>` instead.|
-> `<\|eot_id\|>` | This token signifies the end of the message in a turn i.e. the end of a single message by a system, user or assistant role as shown below.|
-> `<\|start_header_id\|>{role}<\|end_header_id\|>` | These tokens enclose the role for a particular message. The possible roles can be: system, user, assistant. |
+> `<\|begin_of_text\|>` | Specifies the start of the prompt. |
+> `<\|eot_id\|>` | This token signifies the end of a turn i.e. the end of the model's interaction either with the user or tool executor. |
+> `<\|eom_id\|>` | End of Message. A message represents a possible stopping point where the model can inform the execution environment that a tool call needs to be made. |
+> `<\|python_tag\|>` | A special tag used in the model’s response to signify a tool call. |
+> `<\|finetune_right_pad_id\|>` | Used for padding text sequences in a batch to the same length. |
+> `<\|start_header_id\|>{role}<\|end_header_id\|>` | These tokens enclose the role for a particular message. The possible roles can be: system, user, assistant and ipython. |
+> `<\|end_of_text\|>` | This is equivalent to the EOS token. For multiturn-conversations it's usually unused, this token is expected to be generated only by the base models. |
 >
-> A multiturn-conversation with Meta Llama 3 follows this prompt template:
+> A multiturn-conversation with Meta Llama 3.1 that includes tool-calling follows this structure:
 > ```
 > <|begin_of_text|><|start_header_id|>system<|end_header_id|>
 >
 > {{ system_prompt }}<|eot_id|><|start_header_id|>user<|end_header_id|>
 >
 > {{ user_message_1 }}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
 >
-> {{ model_answer_1 }}<|eot_id|><|start_header_id|>user<|end_header_id|>
+> <|python_tag|>{{ model_tool_call_1 }}<|eom_id|><|start_header_id|>ipython<|end_header_id|>
 >
-> {{ user_message_2 }}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
+> {{ tool_response }}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
+>
+> {{model_response_based_on_tool_response}}<|eot_id|>
 > ```
 > Each message gets trailed by an `<|eot_id|>` token before a new header is started, signaling a role change.
 >
-> More details on the new tokenizer and prompt template can be found [here](https://llama.meta.com/docs/model-cards-and-prompt-formats/meta-llama-3#special-tokens-used-with-meta-llama-3).
+> More details on the new tokenizer and prompt template can be found [here](https://llama.meta.com/docs/model-cards-and-prompt-formats/llama3_1).
+
 >
 > [!NOTE]
 > The llama-recipes repository was recently refactored to promote a better developer experience of using the examples. Some files have been moved to new locations. The `src/` folder has NOT been modified, so the functionality of this repo and package is not impacted.
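
For reference, the updated template quoted in this hunk can be assembled programmatically. The sketch below is illustrative only and not part of this commit; the system prompt, `get_weather` call and its JSON response are hypothetical placeholders, used here just to show where `<|python_tag|>`, `<|eom_id|>` and `<|eot_id|>` fall in a tool-calling exchange.

```python
# Illustrative sketch: fill the Llama 3.1 tool-calling prompt template quoted
# in the README diff above. The tool call and tool response are made-up values.
def build_tool_call_prompt(system_prompt, user_message, model_tool_call, tool_response):
    return (
        "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
        f"{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n"
        f"{user_message}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
        # The model prefixes a tool call with <|python_tag|> and ends that turn with <|eom_id|>.
        f"<|python_tag|>{model_tool_call}<|eom_id|><|start_header_id|>ipython<|end_header_id|>\n\n"
        # The tool result comes back under the ipython role; the final answer ends with <|eot_id|>.
        f"{tool_response}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
    )

print(build_tool_call_prompt(
    "Environment: ipython",
    "What is the weather in Menlo Park?",
    'get_weather(city="Menlo Park")',
    '{"temperature_c": 21}',
))
```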

recipes/3p_integrations/lamini/text2sql_memory_tuning/meta_lamini.ipynb

Lines changed: 9 additions & 9 deletions
@@ -145,7 +145,7 @@
 "class Args:\n",
 " def __init__(self, \n",
 " max_examples=100, \n",
-" sql_model_name=\"meta-llama/Meta-Llama-3-8B-Instruct\", \n",
+" sql_model_name=\"meta-llama/Meta-Llama-3.1-8B-Instruct\", \n",
 " gold_file_name=\"gold-test-set.jsonl\",\n",
 " training_file_name=\"generated_queries.jsonl\",\n",
 " num_to_generate=10):\n",
@@ -197,7 +197,7 @@
 }
 ],
 "source": [
-"llm = lamini.Lamini(model_name=\"meta-llama/Meta-Llama-3-8B-Instruct\")\n",
+"llm = lamini.Lamini(model_name=\"meta-llama/Meta-Llama-3.1-8B-Instruct\")\n",
 "\n",
 "question = \"\"\"Who is the highest paid NBA player?\"\"\"\n",
 "system = f\"\"\"You are an NBA analyst with 15 years of experience writing complex SQL queries. Consider the nba_roster table with the following schema:\n",
@@ -418,7 +418,7 @@
 "class ScoreStage(GenerationNode):\n",
 " def __init__(self):\n",
 " super().__init__(\n",
-" model_name=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n",
+" model_name=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
 " max_new_tokens=150,\n",
 " )\n",
 "\n",
@@ -712,7 +712,7 @@
 "class ModelStage(GenerationNode):\n",
 " def __init__(self):\n",
 " super().__init__(\n",
-" model_name=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n",
+" model_name=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
 " max_new_tokens=300,\n",
 " )\n",
 "\n",
@@ -808,7 +808,7 @@
 "class QuestionStage(GenerationNode):\n",
 " def __init__(self):\n",
 " super().__init__(\n",
-" model_name=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n",
+" model_name=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
 " max_new_tokens=150,\n",
 " )\n",
 "\n",
@@ -1055,7 +1055,7 @@
 ],
 "source": [
 "args = Args()\n",
-"llm = lamini.Lamini(model_name=\"meta-llama/Meta-Llama-3-8B-Instruct\")\n",
+"llm = lamini.Lamini(model_name=\"meta-llama/Meta-Llama-3.1-8B-Instruct\")\n",
 "\n",
 "dataset = get_dataset(args, make_question)\n",
 "finetune_args = get_default_finetune_args()\n",
@@ -1601,7 +1601,7 @@
 ],
 "source": [
 "args = Args(training_file_name=\"archive/generated_queries_large_filtered_cleaned.jsonl\")\n",
-"llm = lamini.Lamini(model_name=\"meta-llama/Meta-Llama-3-8B-Instruct\")\n",
+"llm = lamini.Lamini(model_name=\"meta-llama/Meta-Llama-3.1-8B-Instruct\")\n",
 "\n",
 "dataset = get_dataset(args, make_question)\n",
 "finetune_args = get_default_finetune_args()\n",
@@ -1798,7 +1798,7 @@
 ],
 "source": [
 "args = Args(training_file_name=\"generated_queries_v2.jsonl\")\n",
-"llm = lamini.Lamini(model_name=\"meta-llama/Meta-Llama-3-8B-Instruct\")\n",
+"llm = lamini.Lamini(model_name=\"meta-llama/Meta-Llama-3.1-8B-Instruct\")\n",
 "\n",
 "dataset = get_dataset(args, make_question)\n",
 "finetune_args = get_default_finetune_args()\n",
@@ -1966,7 +1966,7 @@
 ],
 "source": [
 "args = Args(training_file_name=\"archive/generated_queries_v2_large_filtered_cleaned.jsonl\")\n",
-"llm = lamini.Lamini(model_name=\"meta-llama/Meta-Llama-3-8B-Instruct\")\n",
+"llm = lamini.Lamini(model_name=\"meta-llama/Meta-Llama-3.1-8B-Instruct\")\n",
 "\n",
 "dataset = get_dataset(args, make_question)\n",
 "finetune_args = get_default_finetune_args()\n",

recipes/3p_integrations/lamini/text2sql_memory_tuning/util/parse_arguments.py

Lines changed: 1 addition & 1 deletion
@@ -16,7 +16,7 @@ def parse_arguments():
     parser.add_argument(
         "--sql-model-name",
         type=str,
-        default="meta-llama/Meta-Llama-3-8B-Instruct",
+        default="meta-llama/Meta-Llama-3.1-8B-Instruct",
         help="The model to use for text2sql",
         required=False,
     )

recipes/3p_integrations/llama_on_prem.md

Lines changed: 9 additions & 9 deletions
@@ -8,7 +8,7 @@ We'll use the Amazon EC2 instance running Ubuntu with an A10G 24GB GPU as an exa

 The Colab notebook to connect via LangChain with Llama 3 hosted as the vLLM and TGI API services is [here](https://colab.research.google.com/drive/1rYWLdgTGIU1yCHmRpAOB2D-84fPzmOJg), also shown in the sections below.

-This tutorial assumes that you you have been granted access to the Meta Llama 3 on Hugging Face - you can open a Hugging Face Meta model page [here](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) to confirm that you see "Gated model You have been granted access to this model"; if you see "You need to agree to share your contact information to access this model", simply complete and submit the form in the page.
+This tutorial assumes that you you have been granted access to the Meta Llama 3 on Hugging Face - you can open a Hugging Face Meta model page [here](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct) to confirm that you see "Gated model You have been granted access to this model"; if you see "You need to agree to share your contact information to access this model", simply complete and submit the form in the page.

 You'll also need your Hugging Face access token which you can get at your Settings page [here](https://huggingface.co/settings/tokens).

@@ -33,7 +33,7 @@ There are two ways to deploy Llama 3 via vLLM, as a general API server or an Ope
 Run the command below to deploy vLLM as a general Llama 3 service:

 ```
-python -m vllm.entrypoints.api_server --host 0.0.0.0 --port 5000 --model meta-llama/Meta-Llama-3-8B-Instruct
+python -m vllm.entrypoints.api_server --host 0.0.0.0 --port 5000 --model meta-llama/Meta-Llama-3.1-8B-Instruct
 ```

 Then on another terminal you can run:
@@ -68,13 +68,13 @@ Also, if you have multiple GPUs, you can add the `--tensor-parallel-size` argume
 git clone https://github.com/vllm-project/vllm
 cd vllm/vllm/entrypoints
 conda activate llama3
-python api_server.py --host 0.0.0.0 --port 5000 --model meta-llama/Meta-Llama-3-8B-Instruct --tensor-parallel-size 4
+python api_server.py --host 0.0.0.0 --port 5000 --model meta-llama/Meta-Llama-3.1-8B-Instruct --tensor-parallel-size 4
 ```

 With multiple GPUs, you can also run replica of models as long as your model size can fit into targeted GPU memory. For example, if you have two A10G with 24 GB memory, you can run two Llama 3 8B models at the same time. This can be done by launching two api servers each targeting specific CUDA cores on different ports:
-`CUDA_VISIBLE_DEVICES=0 python api_server.py --host 0.0.0.0 --port 5000 --model meta-llama/Meta-Llama-3-8B-Instruct`
+`CUDA_VISIBLE_DEVICES=0 python api_server.py --host 0.0.0.0 --port 5000 --model meta-llama/Meta-Llama-3.1-8B-Instruct`
 and
-`CUDA_VISIBLE_DEVICES=1 python api_server.py --host 0.0.0.0 --port 5001 --model meta-llama/Meta-Llama-3-8B-Instruct`
+`CUDA_VISIBLE_DEVICES=1 python api_server.py --host 0.0.0.0 --port 5001 --model meta-llama/Meta-Llama-3.1-8B-Instruct`
 The benefit would be that you can balance incoming requests to both models, reaching higher batch size processing for a trade-off of generation latency.


@@ -83,14 +83,14 @@ The benefit would be that you can balance incoming requests to both models, reac
 You can also deploy the vLLM hosted Llama 3 as an OpenAI-Compatible service to easily replace code using OpenAI API. First, run the command below:

 ```
-python -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 5000 --model meta-llama/Meta-Llama-3-8B-Instruct
+python -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 5000 --model meta-llama/Meta-Llama-3.1-8B-Instruct
 ```

 Then on another terminal, run:

 ```
 curl http://localhost:5000/v1/completions -H "Content-Type: application/json" -d '{
-    "model": "meta-llama/Meta-Llama-3-8B-Instruct",
+    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
     "prompt": "Who wrote the book Innovators dilemma?",
     "max_tokens": 300,
     "temperature": 0
@@ -118,7 +118,7 @@ from langchain.llms import VLLMOpenAI
 llm = VLLMOpenAI(
     openai_api_key="EMPTY",
     openai_api_base="http://<vllm_server_ip_address>:5000/v1",
-    model_name="meta-llama/Meta-Llama-3-8B-Instruct",
+    model_name="meta-llama/Meta-Llama-3.1-8B-Instruct",
 )

 print(llm("Who wrote the book godfather?"))
@@ -136,7 +136,7 @@ You can now use the Llama 3 instance `llm` created this way in any of the demo a
 The easiest way to deploy Llama 3 with TGI is using its official docker image. First, replace `<your_hugging_face_access_token>` and set the three required shell variables (you may replace the `model` value above with another Llama 3 model):

 ```
-model=meta-llama/Meta-Llama-3-8B-Instruct
+model=meta-llama/Meta-Llama-3.1-8B-Instruct
 volume=$PWD/data
 token=<your_hugging_face_access_token>
 ```
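
The curl example changed in this file translates directly into Python. A minimal sketch, assuming the OpenAI-compatible vLLM server started above is already listening on localhost:5000 and the `requests` package is installed:

```python
# Minimal sketch: query the OpenAI-compatible vLLM endpoint shown above.
# Assumes the server is running locally on port 5000 and `requests` is installed.
import requests

response = requests.post(
    "http://localhost:5000/v1/completions",
    headers={"Content-Type": "application/json"},
    json={
        "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
        "prompt": "Who wrote the book Innovators dilemma?",
        "max_tokens": 300,
        "temperature": 0,
    },
    timeout=120,
)
# The OpenAI-style completions response carries the generated text in choices[0].
print(response.json()["choices"][0]["text"])
```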

recipes/quickstart/Running_Llama3_Anywhere/Running_Llama_on_HF_transformers.ipynb

Lines changed: 2 additions & 2 deletions
@@ -92,7 +92,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"Then, we will set the model variable to a specific model we’d like to use. In this demo, we will use the 8b chat model `meta-llama/Meta-Llama-3-8B-Instruct`. Using Meta models from Hugging Face requires you to\n",
+"Then, we will set the model variable to a specific model we’d like to use. In this demo, we will use the 8b chat model `meta-llama/Meta-Llama-3.1-8B-Instruct`. Using Meta models from Hugging Face requires you to\n",
 "\n",
 "1. Accept Terms of Service for Meta Llama 3 on Meta [website](https://llama.meta.com/llama-downloads).\n",
 "2. Use the same email address from Step (1) to login into Hugging Face.\n",
@@ -125,7 +125,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"model = \"meta-llama/Meta-Llama-3-8B-Instruct\"\n",
+"model = \"meta-llama/Meta-Llama-3.1-8B-Instruct\"\n",
 "tokenizer = AutoTokenizer.from_pretrained(model)"
 ]
 },
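
After the model-name bump, the notebook loads the checkpoint the usual transformers way. A minimal sketch of that flow, assuming you have accepted the license for the 3.1 model on Hugging Face, are logged in (for example via `huggingface-cli login`), and have a GPU with enough memory:

```python
# Minimal sketch: load the updated model id with transformers and generate.
# Assumes gated access has been granted on Hugging Face and a suitable GPU is available.
import torch
from transformers import AutoTokenizer, pipeline

model = "meta-llama/Meta-Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model)

generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,  # halve memory use versus fp32
    device_map="auto",           # place weights on the available GPU(s)
)

output = generator("I have tomatoes, basil and cheese at home. What can I cook?", max_new_tokens=100)
print(output[0]["generated_text"])
```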

recipes/quickstart/finetuning/quickstart_peft_finetuning.ipynb

Lines changed: 1 addition & 1 deletion
@@ -90,7 +90,7 @@
 "from llama_recipes.configs import train_config as TRAIN_CONFIG\n",
 "\n",
 "train_config = TRAIN_CONFIG()\n",
-"train_config.model_name = \"meta-llama/Meta-Llama-3-8B\"\n",
+"train_config.model_name = \"meta-llama/Meta-Llama-3.1-8B\"\n",
 "train_config.num_epochs = 1\n",
 "train_config.run_validation = False\n",
 "train_config.gradient_accumulation_steps = 4\n",

recipes/responsible_ai/README.md

Lines changed: 2 additions & 2 deletions
@@ -1,8 +1,8 @@
 # Meta Llama Guard

-Meta Llama Guard and Meta Llama Guard 2 are new models that provide input and output guardrails for LLM inference. For more details, please visit the main [repository](https://github.com/facebookresearch/PurpleLlama/tree/main/Llama-Guard2).
+Meta Llama Guard models provide input and output guardrails for LLM inference. For more details, please visit the main [repository](https://github.com/meta-llama/PurpleLlama/).

-**Note** Please find the right model on HF side [here](https://huggingface.co/meta-llama/Meta-Llama-Guard-2-8B).
+**Note** Please find the right model on HF side [here](https://huggingface.co/meta-llama/Llama-Guard-3-8B).

 ### Running locally
 The [llama_guard](llama_guard) folder contains the inference script to run Meta Llama Guard locally. Add test prompts directly to the [inference script](llama_guard/inference.py) before running it.

recipes/responsible_ai/llama_guard/README.md

Lines changed: 6 additions & 3 deletions
@@ -1,6 +1,6 @@
 # Meta Llama Guard demo
 <!-- markdown-link-check-disable -->
-Meta Llama Guard is a language model that provides input and output guardrails for LLM inference. For more details and model cards, please visit the main repository for each model, [Meta Llama Guard](https://github.com/meta-llama/PurpleLlama/tree/main/Llama-Guard) and Meta [Llama Guard 2](https://github.com/meta-llama/PurpleLlama/tree/main/Llama-Guard2).
+Meta Llama Guard is a language model that provides input and output guardrails for LLM inference. For more details and model cards, please visit the [PurpleLlama](https://github.com/meta-llama/PurpleLlama) repository.

 This folder contains an example file to run inference with a locally hosted model, either using the Hugging Face Hub or a local path.

@@ -55,9 +55,9 @@ This is the output:

 To run it with a local model, you can use the `model_id` param in the inference script:

-`python recipes/responsible_ai/llama_guard/inference.py --model_id=/home/ubuntu/models/llama3/llama_guard_2-hf/ --llama_guard_version=LLAMA_GUARD_2`
+`python recipes/responsible_ai/llama_guard/inference.py --model_id=/home/ubuntu/models/llama3/Llama-Guard-3-8B/ --llama_guard_version=LLAMA_GUARD_3`

-Note: Make sure to also add the llama_guard_version if when it does not match the default, the script allows you to run the prompt format from Meta Llama Guard 1 on Meta Llama Guard 2
+Note: Make sure to also add the llama_guard_version; by default it uses LLAMA_GUARD_3

 ## Inference Safety Checker
 When running the regular inference script with prompts, Meta Llama Guard will be used as a safety checker on the user prompt and the model output. If both are safe, the result will be shown, else a message with the error will be shown, with the word unsafe and a comma separated list of categories infringed. Meta Llama Guard is always loaded quantized using Hugging Face Transformers library with bitsandbytes.
@@ -67,3 +67,6 @@ In this case, the default categories are applied by the tokenizer, using the `ap
 Use this command for testing with a quantized Llama model, modifying the values accordingly:

 `python examples/inference.py --model_name <path_to_regular_llama_model> --prompt_file <path_to_prompt_file> --quantization 8bit --enable_llamaguard_content_safety`
+
+## Llama Guard 3 Finetuning & Customization
+The safety categories in Llama Guard 3 can be tuned for specific application needs. Existing categories can be removed and new categories can be added to the taxonomy. The [Llama Guard Customization](./llama_guard_customization_via_prompting_and_fine_tuning.ipynb) notebook walks through the process.

recipes/responsible_ai/llama_guard/inference.py

Lines changed: 2 additions & 2 deletions
@@ -14,8 +14,8 @@ class AgentType(Enum):
     USER = "User"

 def main(
-    model_id: str = "meta-llama/LlamaGuard-7b",
-    llama_guard_version: LlamaGuardVersion = LlamaGuardVersion.LLAMA_GUARD_1
+    model_id: str = "meta-llama/Llama-Guard-3-8B",
+    llama_guard_version: str = "LLAMA_GUARD_3"
 ):
     """
     Entry point for Llama Guard inference sample script.
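
With the defaults now pointing at Llama Guard 3, the moderation call pattern looks roughly like the sketch below. This is not the repository's `inference.py`, just the common transformers usage for the new default `model_id`; it assumes gated access to `meta-llama/Llama-Guard-3-8B`, that the model ships a chat template for moderation, and that a GPU with enough memory is available.

```python
# Minimal sketch: run Llama Guard 3 as a conversation moderator via transformers.
# Assumes gated access to meta-llama/Llama-Guard-3-8B and a suitable GPU.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "meta-llama/Llama-Guard-3-8B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="auto")

def moderate(chat):
    # The model's chat template turns the conversation into the safety-classification prompt.
    input_ids = tokenizer.apply_chat_template(chat, return_tensors="pt").to(model.device)
    output = model.generate(input_ids=input_ids, max_new_tokens=100, pad_token_id=0)
    prompt_len = input_ids.shape[-1]
    # The generated text starts with "safe" or "unsafe" plus any violated category codes.
    return tokenizer.decode(output[0][prompt_len:], skip_special_tokens=True)

print(moderate([{"role": "user", "content": "I want to build a birdhouse. Any tips?"}]))
```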
