{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Finetune HF Llama 3.1 70B and Deploy on AWS Bedrock\n",
    "\n",
    "This notebook walks through the following steps:\n",
    "\n",
    "1. Import and convert [Llama 3.1 70B](https://huggingface.co/meta-llama/Llama-3.1-70B) from the Hugging Face Transformers format to the .nemo file format\n",
    "\n",
    "   Note: you will need to create a Hugging Face account and request access to the model\n",
    "\n",
    "2. Run Supervised Fine-Tuning (SFT) with the NeMo framework on the [NVIDIA Daring-Anteater dataset](https://huggingface.co/datasets/nvidia/Daring-Anteater), a comprehensive dataset for instruction tuning\n",
    "\n",
    "3. Move your finetuned model to AWS S3 for use with AWS Bedrock Custom Model Import"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Convert Hugging Face Model to NeMo"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "!pip install ipywidgets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import huggingface_hub\n",
    "\n",
    "# Set your Hugging Face access token\n",
    "huggingface_hub.login(\"<HF_TOKEN>\")\n",
    "os.makedirs(\"/demo-workspace/Meta-Llama-3.1-70B\", exist_ok=True)\n",
    "huggingface_hub.snapshot_download(\n",
    "    repo_id=\"meta-llama/Llama-3.1-70B\",\n",
    "    repo_type=\"model\",\n",
    "    local_dir=\"/demo-workspace/Meta-Llama-3.1-70B\",\n",
    ")"
   ]
  },
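  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Before converting, it is worth a quick sanity check that the snapshot finished downloading. The cell below is an optional sketch (not part of the original workflow): it assumes the standard Hugging Face repository layout and simply confirms that `config.json` exists and counts the `.safetensors` shards."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pathlib import Path\n",
    "\n",
    "# Optional sanity check: verify the snapshot looks complete before conversion\n",
    "model_dir = Path(\"/demo-workspace/Meta-Llama-3.1-70B\")\n",
    "assert (model_dir / \"config.json\").exists(), \"config.json missing; download may be incomplete\"\n",
    "shards = sorted(model_dir.glob(\"*.safetensors\"))\n",
    "print(f\"Found {len(shards)} safetensors shards in {model_dir}\")"
   ]
  },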
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "%%bash\n",
    "# Clear the temporary weights directory left over from any previous conversion\n",
    "rm -rf model_weights\n",
    "\n",
    "# Converter script shipped with NeMo\n",
    "python /opt/NeMo/scripts/checkpoint_converters/convert_llama_hf_to_nemo.py \\\n",
    "    --precision bf16 \\\n",
    "    --input_name_or_path=/demo-workspace/Meta-Llama-3.1-70B \\\n",
    "    --output_path=/demo-workspace/Meta-Llama-3.1-70B.nemo \\\n",
    "    --llama31 True"
   ]
  },
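  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Converting a 70B checkpoint takes a while. As an optional convenience (again, not part of the original workflow), the cell below checks that the `.nemo` archive was written and reports its size."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# Confirm the converted checkpoint exists and report its on-disk size\n",
    "nemo_path = \"/demo-workspace/Meta-Llama-3.1-70B.nemo\"\n",
    "size_gb = os.path.getsize(nemo_path) / 1e9\n",
    "print(f\"{nemo_path}: {size_gb:.1f} GB\")"
   ]
  },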
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Import and Configure Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%bash\n",
    "\n",
    "mkdir -p /demo-workspace/datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import load_dataset\n",
    "import json\n",
    "\n",
    "dataset = load_dataset(\"nvidia/Daring-Anteater\")\n",
    "\n",
    "# Write an 85/15 train/validation split as JSONL files in the format NeMo expects\n",
    "for split, shard in dataset.items():\n",
    "    length = len(shard)\n",
    "    train_limit = length * 0.85\n",
    "    with open(\"/demo-workspace/datasets/daring-anteater-train.jsonl\", \"w\") as train:\n",
    "        with open(\"/demo-workspace/datasets/daring-anteater-val.jsonl\", \"w\") as val:\n",
    "            for count, line in enumerate(shard):\n",
    "                # Keep only the fields the SFT data loader needs\n",
    "                desired_data = {\n",
    "                    \"system\": line[\"system\"],\n",
    "                    \"conversations\": line[\"conversations\"],\n",
    "                    \"mask\": line[\"mask\"],\n",
    "                    \"type\": \"TEXT_TO_VALUE\",\n",
    "                }\n",
    "                if count < train_limit:\n",
    "                    json.dump(desired_data, train)\n",
    "                    train.write(\"\\n\")\n",
    "                else:\n",
    "                    json.dump(desired_data, val)\n",
    "                    val.write(\"\\n\")"
   ]
  },
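  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To verify the files were written correctly, the optional sketch below prints the number of examples in each split and the keys of the first record."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "# Spot-check the generated JSONL files\n",
    "for name in (\"train\", \"val\"):\n",
    "    path = f\"/demo-workspace/datasets/daring-anteater-{name}.jsonl\"\n",
    "    with open(path) as f:\n",
    "        records = [json.loads(line) for line in f]\n",
    "    print(f\"{path}: {len(records)} examples, first record keys: {list(records[0])}\")"
   ]
  },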
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Finetuning"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%bash\n",
    "\n",
    "chmod +x /demo-workspace/sft-finetune-llama3.1-70b.sh\n",
    "ls -l /demo-workspace/sft-finetune-llama3.1-70b.sh"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import nemo_run as run\n",
    "\n",
    "\n",
    "def dgxc_executor(nodes: int = 1, devices: int = 1) -> run.DGXCloudExecutor:\n",
    "    pvcs = [\n",
    "        {\n",
    "            \"name\": \"workspace\",  # Default name to identify the PVC\n",
    "            \"path\": \"/demo-workspace\",  # Directory where the PVC will be mounted in pods\n",
    "            \"existingPvc\": True,  # The PVC already exists\n",
    "            \"claimName\": \"llama-3-1-70b-pvc-project-ax4ia\",  # Replace with the name of the PVC to use\n",
    "        }\n",
    "    ]\n",
    "\n",
    "    return run.DGXCloudExecutor(\n",
    "        base_url=\"https://tme-aws.nv.run.ai/api/v1\",  # Base URL to send API requests to\n",
    "        app_id=\"aws-app\",  # Name of the application\n",
    "        app_secret=\"<APP_SECRET>\",  # Application secret token\n",
    "        project_name=\"aws-demo-project\",  # Name of the project within Run:ai\n",
    "        nodes=nodes,  # Number of nodes to run on\n",
    "        gpus_per_node=devices,  # Number of GPUs per node to use\n",
    "        container_image=\"nvcr.io/nvidia/nemo:25.02\",  # Which container to deploy\n",
    "        pvcs=pvcs,  # Attach the PVC(s) to the pod\n",
    "        launcher=\"torchrun\",  # Use torchrun to launch the processes\n",
    "        env_vars={\n",
    "            \"PYTHONPATH\": \"/demo-workspace/nemo-run:$PYTHONPATH\",  # Add the NeMo-Run directory to the PYTHONPATH\n",
    "            \"HF_TOKEN\": \"<HF_TOKEN>\",  # Add your Hugging Face API token here\n",
    "            \"FI_EFA_USE_HUGE_PAGE\": \"0\",\n",
    "            \"TORCH_HOME\": \"/demo-workspace/.cache\",\n",
    "            \"NEMORUN_HOME\": \"/demo-workspace/nemo-run\",\n",
    "            \"OMP_NUM_THREADS\": \"1\",\n",
    "        },\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "executor = dgxc_executor(nodes=4, devices=8)\n",
    "run.config.set_nemorun_home(\"/demo-workspace/nemo-run\")\n",
    "\n",
    "with run.Experiment(\"sft-finetuning\") as exp:\n",
    "    exp.add(run.Script(\"/demo-workspace/sft-finetune-llama3.1-70b.sh\"), executor=executor)\n",
    "\n",
    "    # Launch the experiment on the cluster\n",
    "    exp.run(sequential=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Import Model to AWS S3\n",
    "\n",
    "To prepare the model for use with Bedrock, we must first convert our finetuned model weights back to Hugging Face safetensors. The model and the original Llama 3.1 tokenizer files will then be copied to your S3 bucket."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "%%bash\n",
    "\n",
    "python /opt/NeMo/scripts/checkpoint_converters/convert_llama_nemo_to_hf.py \\\n",
    "    --input_name_or_path /demo-workspace/llama3.1-70b-daring-anteater-sft/checkpoints/megatron_gpt_sft.nemo \\\n",
    "    --output_path /demo-workspace/llama-output-weights.bin \\\n",
    "    --hf_input_path /demo-workspace/Meta-Llama-3.1-70B \\\n",
    "    --hf_output_path /demo-workspace/sft-llama-3.1-hf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "vscode": {
     "languageId": "plaintext"
    }
   },
   "outputs": [],
   "source": [
    "%%bash\n",
    "\n",
    "export BUCKET_NAME=hf-llama3-1-70b\n",
    "\n",
    "export AWS_ACCESS_KEY_ID=<AWS_ACCESS_KEY_ID>\n",
    "export AWS_SECRET_ACCESS_KEY=<AWS_SECRET_ACCESS_KEY>\n",
    "\n",
    "# Upload the converted weights, then the tokenizer files Bedrock expects\n",
    "./s5cmd cp /demo-workspace/sft-llama-3.1-hf/ s3://$BUCKET_NAME/sft-llama-3.1-hf/\n",
    "\n",
    "./s5cmd cp /demo-workspace/Meta-Llama-3.1-70B/tokenizer.json s3://$BUCKET_NAME/sft-llama-3.1-hf/\n",
    "./s5cmd cp /demo-workspace/Meta-Llama-3.1-70B/tokenizer_config.json s3://$BUCKET_NAME/sft-llama-3.1-hf/\n",
    "./s5cmd cp /demo-workspace/Meta-Llama-3.1-70B/original/tokenizer.model s3://$BUCKET_NAME/sft-llama-3.1-hf/"
   ]
  },
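  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As an optional check (a sketch that assumes `boto3` is installed and picks up the same AWS credentials), you can list the uploaded objects to confirm the transfer:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import boto3\n",
    "\n",
    "# List the uploaded objects to confirm everything landed in the bucket\n",
    "s3 = boto3.client(\"s3\")\n",
    "resp = s3.list_objects_v2(Bucket=\"hf-llama3-1-70b\", Prefix=\"sft-llama-3.1-hf/\")\n",
    "for obj in resp.get(\"Contents\", []):\n",
    "    print(f\"{obj['Key']}  {obj['Size'] / 1e9:.2f} GB\")"
   ]
  },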
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To run the model with Bedrock, go to the Custom Model Import feature and load the model from your S3 bucket. Once the import completes, the model can be used directly for production inference."
   ]
  },
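  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Alternatively, the import job can be started programmatically with the `bedrock` boto3 client. The cell below is a hedged sketch rather than a definitive recipe: the job and model names are arbitrary, the region is assumed to be `us-east-1`, and the role ARN is a placeholder for an IAM role that Bedrock can assume and that has read access to the bucket."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import boto3\n",
    "\n",
    "# Start a Bedrock Custom Model Import job from the uploaded S3 prefix.\n",
    "# The role ARN below is a placeholder; substitute a role with S3 read access\n",
    "# that Bedrock is allowed to assume.\n",
    "bedrock = boto3.client(\"bedrock\", region_name=\"us-east-1\")\n",
    "job = bedrock.create_model_import_job(\n",
    "    jobName=\"sft-llama-3-1-70b-import\",\n",
    "    importedModelName=\"sft-llama-3-1-70b\",\n",
    "    roleArn=\"arn:aws:iam::<ACCOUNT_ID>:role/BedrockModelImportRole\",\n",
    "    modelDataSource={\"s3DataSource\": {\"s3Uri\": \"s3://hf-llama3-1-70b/sft-llama-3.1-hf/\"}},\n",
    ")\n",
    "print(job[\"jobArn\"])"
   ]
  }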
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}