
Commit 8c23941

Update Hyper Parameters Llama 3.2 Fine-tuning (#4764)
* llama 3.2 fine-tuning
* update hyperparameters
* update hyperparameters
* update hyperparameters
* update hyperparameters
1 parent 3ab0937 commit 8c23941

File tree

2 files changed (+5, -11 lines)


generative_ai/sm-jumpstart_foundation_llama_3_2_3b_finetuning.ipynb

Lines changed: 1 addition & 1 deletion
@@ -695,7 +695,7 @@
  "- **instruction_tuned** - Whether to instruction-train the model or not. At most one of `instruction_tuned` and `chat_dataset` can be `True`. Must be `True` or `False`. Default is `False`.\n",
  "- **chat_dataset** - If `True`, dataset is assumed to be in chat format. At most one of `instruction_tuned` and `chat_dataset` can be `True`. Default is `False`.\n",
  "- **add_input_output_demarcation_key** - For an instruction tuned dataset, if this is `True`, a demarcation key (\"### Response:\\n\") is added between the prompt and completion before training. Default is `True`.\n",
- "- **per_device_train_batch_size** - The batch size per GPU core/CPU for training. Default is 1.\n",
+ "- **per_device_train_batch_size** - The batch size per GPU core/CPU for training. Default is 4.\n",
  "- **per_device_eval_batch_size** - The batch size per GPU core/CPU for evaluation. Default is 1.\n",
  "- **max_train_samples** - For debugging purposes or quicker training, truncate the number of training examples to this value. Value -1 means using all of the training samples. Must be a positive integer or -1. Default is -1.\n",
  "- **max_val_samples** - For debugging purposes or quicker training, truncate the number of validation examples to this value. Value -1 means using all of the validation samples. Must be a positive integer or -1. Default is -1.\n",

generative_ai/sm-jumpstart_foundation_llama_3_finetuning.ipynb

Lines changed: 4 additions & 10 deletions
@@ -108,7 +108,7 @@
  "\n",
  "---\n",
  "\n",
- "First we will deploy the Llama-2 model as a SageMaker endpoint. To train/deploy 8B and 70B models, please change model_id to \"meta-textgeneration-llama-3-8b\" and \"meta-textgeneration-llama-3-70b\" respectively.\n",
+ "First we will deploy the Llama-3 model as a SageMaker endpoint. To train/deploy 8B and 70B models, please change model_id to \"meta-textgeneration-llama-3-8b\" and \"meta-textgeneration-llama-3-70b\" respectively.\n",
  "\n",
  "---"
  ]
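The corrected sentence deploys Llama-3 rather than Llama-2. A minimal deployment sketch, assuming the 8B model_id named in the text (accept_eula=True acknowledges the Meta license at deploy time):

    from sagemaker.jumpstart.model import JumpStartModel

    # Swap the model_id for "meta-textgeneration-llama-3-70b" to deploy the 70B variant.
    pretrained_model = JumpStartModel(model_id="meta-textgeneration-llama-3-8b")
    pretrained_predictor = pretrained_model.deploy(accept_eula=True)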
@@ -193,9 +193,7 @@
  " },\n",
  "}\n",
  "try:\n",
- "    response = pretrained_predictor.predict(\n",
- "        payload, custom_attributes=\"accept_eula=false\"\n",
- "    )\n",
+ "    response = pretrained_predictor.predict(payload, custom_attributes=\"accept_eula=false\")\n",
  "    print_response(payload, response)\n",
  "except Exception as e:\n",
  "    print(e)"
@@ -249,9 +247,7 @@
  "dolly_dataset = load_dataset(\"databricks/databricks-dolly-15k\", split=\"train\")\n",
  "\n",
  "# To train for question answering/information extraction, you can replace the assertion in next line to example[\"category\"] == \"closed_qa\"/\"information_extraction\".\n",
- "summarization_dataset = dolly_dataset.filter(\n",
- "    lambda example: example[\"category\"] == \"summarization\"\n",
- ")\n",
+ "summarization_dataset = dolly_dataset.filter(lambda example: example[\"category\"] == \"summarization\")\n",
  "summarization_dataset = summarization_dataset.remove_columns(\"category\")\n",
  "\n",
  "# We split the dataset into two where test data is used to evaluate at the end.\n",
@@ -376,9 +372,7 @@
  "    instance_type=\"ml.g5.12xlarge\",  # For Llama-3-70b, add instance_type = \"ml.g5.48xlarge\"\n",
  ")\n",
  "# By default, instruction tuning is set to false. Thus, to use instruction tuning dataset you use\n",
- "estimator.set_hyperparameters(\n",
- "    instruction_tuned=\"True\", epoch=\"5\", max_input_length=\"1024\"\n",
- ")\n",
+ "estimator.set_hyperparameters(instruction_tuned=\"True\", epoch=\"5\", max_input_length=\"1024\")\n",
  "estimator.fit({\"training\": train_data_location})"
  ]
  },
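After set_hyperparameters, estimator.fit launches the fine-tuning job against the uploaded training channel. A sketch of the step that typically follows in these notebooks, hosting the fine-tuned weights (the default instance type and the EULA attribute value are assumptions):

    # Host the fine-tuned model behind a new endpoint once training completes.
    finetuned_predictor = estimator.deploy()  # uses the model's recommended instance type
    response = finetuned_predictor.predict(payload, custom_attributes="accept_eula=false")
    print_response(payload, response)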
