aws
diff --git a/‎ build_and_train_models/sm-distributed_model_parallel_v2/gpt-neox/sm-fsdp-tp_finetuning_gpt-neox.ipynb
Lines changed: 4 additions & 4 deletions b/‎ build_and_train_models/sm-distributed_model_parallel_v2/gpt-neox/sm-fsdp-tp_finetuning_gpt-neox.ipynb
Lines changed: 4 additions & 4 deletions
diff --git a/‎ build_and_train_models/sm-distributed_model_parallel_v2/gpt-neox/sm-fsdp-tp_train_gpt-neox.ipynb
Lines changed: 9 additions & 9 deletions b/‎ build_and_train_models/sm-distributed_model_parallel_v2/gpt-neox/sm-fsdp-tp_train_gpt-neox.ipynb
Lines changed: 9 additions & 9 deletions
diff --git a/‎ build_and_train_models/sm-distributed_model_parallel_v2/llama_v2/sm-fsdp-tp-cp_train_llama_v2.ipynb renamed to ‎ build_and_train_models/sm-distributed_model_parallel_v2/llama_v2_v3/sm-fsdp-tp-cp_train_llama_v2_v3.ipynb
Lines changed: 50 additions & 30 deletions b/‎ build_and_train_models/sm-distributed_model_parallel_v2/llama_v2/sm-fsdp-tp-cp_train_llama_v2.ipynb renamed to ‎ build_and_train_models/sm-distributed_model_parallel_v2/llama_v2_v3/sm-fsdp-tp-cp_train_llama_v2_v3.ipynb
Lines changed: 50 additions & 30 deletions
@@ -80,7 +80,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "%pip install --upgrade \"sagemaker>=2.224\"\n",
+    "%pip install --upgrade \"sagemaker>=2.233\"\n",
     "%pip install sagemaker-experiments"
    ]
   },
@@ -711,9 +711,9 @@
    "outputs": [],
    "source": [
     "if use_fsx:\n",
-    "    hyperparameters[\n",
-    "        \"hf_pretrained_model_name_or_dir\"\n",
-    "    ] = PRETRAINED_MODEL  # f\"{SM_TRAIN_DIR}{PRETRAINED_DIR}\"\n",
+    "    hyperparameters[\"hf_pretrained_model_name_or_dir\"] = (\n",
+    "        PRETRAINED_MODEL  # f\"{SM_TRAIN_DIR}{PRETRAINED_DIR}\"\n",
+    "    )\n",
     "else:\n",
     "    hyperparameters[\"hf_pretrained_model_name_or_dir\"] = PRETRAINED_MODEL"
    ]
 
@@ -74,7 +74,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "%pip install --upgrade \"sagemaker>=2.224\"\n",
+    "%pip install --upgrade \"sagemaker>=2.233\"\n",
     "%pip install sagemaker-experiments"
    ]
   },
@@ -675,12 +675,12 @@
     "    # If you want to resume training, set checkpoint_dir to the same path as a previous job.\n",
     "    SM_TRAIN_DIR = \"/opt/ml/input/data/train\"\n",
     "    hyperparameters[\"checkpoint_dir\"] = f\"{SM_TRAIN_DIR}/smp-v2/{model_type}/checkpointdir\"\n",
-    "    hyperparameters[\n",
-    "        \"training_dir\"\n",
-    "    ] = f\"{SM_TRAIN_DIR}/datasets/pytorch-gpt2-data/pytorch_gpt2/train_synthetic\"\n",
-    "    hyperparameters[\n",
-    "        \"test_dir\"\n",
-    "    ] = f\"{SM_TRAIN_DIR}/datasets/pytorch-gpt2-data/pytorch_gpt2/val_synthetic\"\n",
+    "    hyperparameters[\"training_dir\"] = (\n",
+    "        f\"{SM_TRAIN_DIR}/datasets/pytorch-gpt2-data/pytorch_gpt2/train_synthetic\"\n",
+    "    )\n",
+    "    hyperparameters[\"test_dir\"] = (\n",
+    "        f\"{SM_TRAIN_DIR}/datasets/pytorch-gpt2-data/pytorch_gpt2/val_synthetic\"\n",
+    "    )\n",
     "\n",
     "# The checkpoint path (hyperparameters['checkpoint_dir'] or checkpoint_s3_uri) is not unique per job.\n",
     "# You need to modify as needed for different runs.\n",
@@ -874,7 +874,7 @@
     "        },\n",
     "    },\n",
     "    py_version=\"py311\",\n",
-    "    framework_version=\"2.3.1\",\n",
+    "    framework_version=\"2.4.1\",\n",
     "    # image_uri=$IMAGE,  # Either provide `framework_version` or `image_uri`\n",
     "    output_path=s3_output_bucket,\n",
     "    max_run=86400,\n",
@@ -956,7 +956,7 @@
     "            },\n",
     "        },\n",
     "        py_version=\"py311\",\n",
-    "        framework_version=\"2.3.1\",\n",
+    "        framework_version=\"2.4.1\",\n",
     "        # image_uri=$IMAGE,  # Either provide `framework_version` or `image_uri`\n",
     "        output_path=s3_output_bucket,\n",
     "        max_run=86400,\n",
 
@@ -6,16 +6,16 @@
     "tags": []
    },
    "source": [
-    "# Enable Long Context Length Llama-v2 (or GPT-NeoX) training with Context Parallelism.\n",
+    "# Enable Long Context Length Llama-v2/v3 (or GPT-NeoX) training with Context Parallelism.\n",
     "---\n",
     "\n",
     "This notebook's CI test result for us-west-2 is as follows. CI test results in other regions can be found at the end of the notebook.\n",
     "\n",
-    "![This us-west-2 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/us-west-2/build_and_train_models|sm-distributed_model_parallel_v2|llama_v2|sm-fsdp-tp-cp_train_llama_v2.ipynb)\n",
+    "![This us-west-2 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/us-west-2/build_and_train_models|sm-distributed_model_parallel_v2|llama_v2_v3|sm-fsdp-tp-cp_train_llama_v2_v3.ipynb)\n",
     "\n",
     "---\n",
     "\n",
-    "In this notebook, you will learn how to enable long context length distributed training of the Hugging Face Transformers Llama-v2 and GPT-NeoX models.\n",
+    "In this notebook, you will learn how to enable long context length distributed training of the Hugging Face Transformers Llama-v2/v3 and GPT-NeoX models.\n",
     "\n",
     "You can either launch this notebook from an Amazon SageMaker notebook instance which handles all credentials automatically,\n",
     "or by running it locally and setting credentials manually.\n",
@@ -74,7 +74,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "%pip install --upgrade \"sagemaker>=2.224\"\n",
+    "%pip install --upgrade \"sagemaker>=2.233\"\n",
     "%pip install sagemaker-experiments"
    ]
   },
@@ -187,7 +187,7 @@
    "source": [
     "### Choose Model\n",
     "\n",
-    "Choose to train either the `GPT-NeoX` or `Llama-v2` model."
+    "Choose to train either the `GPT-NeoX`, `Llama-v2`, or `Llama-v3` model."
    ]
   },
   {
@@ -196,7 +196,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "model_type = \"llama_v2\"  # [\"gpt_neox\", \"llama_v2\"]"
+    "model_type = \"llama_v2\"  # [\"gpt_neox\", \"llama_v2\", \"llama_v3\"]"
    ]
   },
   {
@@ -477,7 +477,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "s3_output_bucket = f\"s3://sagemaker-{region}-{account}/smp-fsdp-tp/{model_type}-outputdir/\""
+    "s3_output_bucket = f\"s3://sagemaker-{region}-{account}/smp-fsdp-tp/{model_type}-outputdir/\""
    ]
   },
   {
@@ -640,12 +640,12 @@
     "    # If you want to resume training, set checkpoint_dir to the same path as a previous job.\n",
     "    SM_TRAIN_DIR = \"/opt/ml/input/data/train\"\n",
     "    hyperparameters[\"checkpoint_dir\"] = f\"{SM_TRAIN_DIR}/smp-v2/{model_type}/checkpointdir\"\n",
-    "    hyperparameters[\n",
-    "        \"training_dir\"\n",
-    "    ] = f\"{SM_TRAIN_DIR}/datasets/pytorch-gpt2-data/pytorch_gpt2/train_synthetic\"\n",
-    "    hyperparameters[\n",
-    "        \"test_dir\"\n",
-    "    ] = f\"{SM_TRAIN_DIR}/datasets/pytorch-gpt2-data/pytorch_gpt2/val_synthetic\"\n",
+    "    hyperparameters[\"training_dir\"] = (\n",
+    "        f\"{SM_TRAIN_DIR}/datasets/pytorch-gpt2-data/pytorch_gpt2/train_synthetic\"\n",
+    "    )\n",
+    "    hyperparameters[\"test_dir\"] = (\n",
+    "        f\"{SM_TRAIN_DIR}/datasets/pytorch-gpt2-data/pytorch_gpt2/val_synthetic\"\n",
+    "    )\n",
     "\n",
     "# The checkpoint path (hyperparameters['checkpoint_dir'] or checkpoint_s3_uri) is not unique per job.\n",
     "# You need to modify as needed for different runs.\n",
@@ -698,6 +698,26 @@
     "            \"num_layers\": 80,\n",
     "        },\n",
     "    },\n",
+    "    \"llama_v3\": {\n",
+    "        8: {\n",
+    "            \"hidden_width\": 4096,\n",
+    "            \"llama_intermediate_size\": 14336,\n",
+    "            \"max_context_width\": 2048,\n",
+    "            \"num_heads\": 32,\n",
+    "            \"num_layers\": 32,\n",
+    "            \"rotary_emb_base\": 500000,\n",
+    "            \"vocab_size\": 128256,\n",
+    "        },\n",
+    "        70: {\n",
+    "            \"hidden_width\": 8192,\n",
+    "            \"llama_intermediate_size\": 28672,\n",
+    "            \"max_context_width\": 2048,\n",
+    "            \"num_heads\": 64,\n",
+    "            \"num_layers\": 80,\n",
+    "            \"rotary_emb_base\": 500000,\n",
+    "            \"vocab_size\": 128256,\n",
+    "        },\n",
+    "    },\n",
     "}\n",
     "\n",
     "model_params = model_configs.get(model_type, {}).get(model_size)\n",
@@ -840,7 +860,7 @@
     "        },\n",
     "    },\n",
     "    py_version=\"py311\",\n",
-    "    framework_version=\"2.3.1\",\n",
+    "    framework_version=\"2.4.1\",\n",
     "    # image_uri=$IMAGE,  # Either provide `framework_version` or `image_uri`\n",
     "    output_path=s3_output_bucket,\n",
     "    max_run=86400,\n",
@@ -923,7 +943,7 @@
     "            },\n",
     "        },\n",
     "        py_version=\"py311\",\n",
-    "        framework_version=\"2.3.1\",\n",
+    "        framework_version=\"2.4.1\",\n",
     "        # image_uri=$IMAGE,  # Either provide `framework_version` or `image_uri`\n",
     "        output_path=s3_output_bucket,\n",
     "        max_run=86400,\n",
@@ -976,35 +996,35 @@
     "This notebook was tested in multiple regions. The test results are as follows, except for us-west-2 which is shown at the top of the notebook.\n",
     "\n",
     "\n",
-    "![This us-east-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/us-east-1/build_and_train_models|sm-distributed_model_parallel_v2|llama_v2|sm-fsdp-tp-cp_train_llama_v2.ipynb)\n",
+    "![This us-east-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/us-east-1/build_and_train_models|sm-distributed_model_parallel_v2|llama_v2_v3|sm-fsdp-tp-cp_train_llama_v2_v3.ipynb)\n",
     "\n",
-    "![This us-east-2 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/us-east-2/build_and_train_models|sm-distributed_model_parallel_v2|llama_v2|sm-fsdp-tp-cp_train_llama_v2.ipynb)\n",
+    "![This us-east-2 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/us-east-2/build_and_train_models|sm-distributed_model_parallel_v2|llama_v2_v3|sm-fsdp-tp-cp_train_llama_v2_v3.ipynb)\n",
     "\n",
-    "![This us-west-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/us-west-1/build_and_train_models|sm-distributed_model_parallel_v2|llama_v2|sm-fsdp-tp-cp_train_llama_v2.ipynb)\n",
+    "![This us-west-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/us-west-1/build_and_train_models|sm-distributed_model_parallel_v2|llama_v2_v3|sm-fsdp-tp-cp_train_llama_v2_v3.ipynb)\n",
     "\n",
-    "![This ca-central-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ca-central-1/build_and_train_models|sm-distributed_model_parallel_v2|llama_v2|sm-fsdp-tp-cp_train_llama_v2.ipynb)\n",
+    "![This ca-central-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ca-central-1/build_and_train_models|sm-distributed_model_parallel_v2|llama_v2_v3|sm-fsdp-tp-cp_train_llama_v2_v3.ipynb)\n",
     "\n",
-    "![This sa-east-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/sa-east-1/build_and_train_models|sm-distributed_model_parallel_v2|llama_v2|sm-fsdp-tp-cp_train_llama_v2.ipynb)\n",
+    "![This sa-east-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/sa-east-1/build_and_train_models|sm-distributed_model_parallel_v2|llama_v2_v3|sm-fsdp-tp-cp_train_llama_v2_v3.ipynb)\n",
     "\n",
-    "![This eu-west-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/eu-west-1/build_and_train_models|sm-distributed_model_parallel_v2|llama_v2|sm-fsdp-tp-cp_train_llama_v2.ipynb)\n",
+    "![This eu-west-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/eu-west-1/build_and_train_models|sm-distributed_model_parallel_v2|llama_v2_v3|sm-fsdp-tp-cp_train_llama_v2_v3.ipynb)\n",
     "\n",
-    "![This eu-west-2 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/eu-west-2/build_and_train_models|sm-distributed_model_parallel_v2|llama_v2|sm-fsdp-tp-cp_train_llama_v2.ipynb)\n",
+    "![This eu-west-2 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/eu-west-2/build_and_train_models|sm-distributed_model_parallel_v2|llama_v2_v3|sm-fsdp-tp-cp_train_llama_v2_v3.ipynb)\n",
     "\n",
-    "![This eu-west-3 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/eu-west-3/build_and_train_models|sm-distributed_model_parallel_v2|llama_v2|sm-fsdp-tp-cp_train_llama_v2.ipynb)\n",
+    "![This eu-west-3 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/eu-west-3/build_and_train_models|sm-distributed_model_parallel_v2|llama_v2_v3|sm-fsdp-tp-cp_train_llama_v2_v3.ipynb)\n",
     "\n",
-    "![This eu-central-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/eu-central-1/build_and_train_models|sm-distributed_model_parallel_v2|llama_v2|sm-fsdp-tp-cp_train_llama_v2.ipynb)\n",
+    "![This eu-central-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/eu-central-1/build_and_train_models|sm-distributed_model_parallel_v2|llama_v2_v3|sm-fsdp-tp-cp_train_llama_v2_v3.ipynb)\n",
     "\n",
-    "![This eu-north-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/eu-north-1/build_and_train_models|sm-distributed_model_parallel_v2|llama_v2|sm-fsdp-tp-cp_train_llama_v2.ipynb)\n",
+    "![This eu-north-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/eu-north-1/build_and_train_models|sm-distributed_model_parallel_v2|llama_v2_v3|sm-fsdp-tp-cp_train_llama_v2_v3.ipynb)\n",
     "\n",
-    "![This ap-southeast-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ap-southeast-1/build_and_train_models|sm-distributed_model_parallel_v2|llama_v2|sm-fsdp-tp-cp_train_llama_v2.ipynb)\n",
+    "![This ap-southeast-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ap-southeast-1/build_and_train_models|sm-distributed_model_parallel_v2|llama_v2_v3|sm-fsdp-tp-cp_train_llama_v2_v3.ipynb)\n",
     "\n",
-    "![This ap-southeast-2 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ap-southeast-2/build_and_train_models|sm-distributed_model_parallel_v2|llama_v2|sm-fsdp-tp-cp_train_llama_v2.ipynb)\n",
+    "![This ap-southeast-2 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ap-southeast-2/build_and_train_models|sm-distributed_model_parallel_v2|llama_v2_v3|sm-fsdp-tp-cp_train_llama_v2_v3.ipynb)\n",
     "\n",
-    "![This ap-northeast-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ap-northeast-1/build_and_train_models|sm-distributed_model_parallel_v2|llama_v2|sm-fsdp-tp-cp_train_llama_v2.ipynb)\n",
+    "![This ap-northeast-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ap-northeast-1/build_and_train_models|sm-distributed_model_parallel_v2|llama_v2_v3|sm-fsdp-tp-cp_train_llama_v2_v3.ipynb)\n",
     "\n",
-    "![This ap-northeast-2 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ap-northeast-2/build_and_train_models|sm-distributed_model_parallel_v2|llama_v2|sm-fsdp-tp-cp_train_llama_v2.ipynb)\n",
+    "![This ap-northeast-2 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ap-northeast-2/build_and_train_models|sm-distributed_model_parallel_v2|llama_v2_v3|sm-fsdp-tp-cp_train_llama_v2_v3.ipynb)\n",
     "\n",
-    "![This ap-south-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ap-south-1/build_and_train_models|sm-distributed_model_parallel_v2|llama_v2|sm-fsdp-tp-cp_train_llama_v2.ipynb)\n"
+    "![This ap-south-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ap-south-1/build_and_train_models|sm-distributed_model_parallel_v2|llama_v2_v3|sm-fsdp-tp-cp_train_llama_v2_v3.ipynb)\n"
    ]
   },
   {