
Commit a33d774

Merge pull request #182 from tnixon/a10_v100_config
A10 & v100 config
2 parents 5021d94 + cd67192 commit a33d774

File tree

6 files changed: +171 -51 lines changed


README.md

Lines changed: 25 additions & 18 deletions
@@ -94,40 +94,47 @@ Otherwise, follow the steps above. The 12B param model may not function well in
### Training on Other Instances

A100 instance types are not available in all cloud regions, or can be hard to provision. Training is possible on other GPU instance types,
-for smaller Dolly model sizes, and with small modifications to reduce memory usage.
-These modifications are not optimal, but are simple to make.
+for smaller Dolly model sizes, and with small modifications to reduce memory usage. These modifications are not optimal, but are simple to make.
+
+Select your GPU family type from the `gpu_family` widget, enter the number of GPUs available in the `num_gpus` widget, and then run the rest of the code.
+A number of different options will be set for you to train the model for one of the following GPU types:
+- A100 (default)
+- A10
+- V100
+
+Details of the different configurations are below.
+
+#### A100 GPUs
+
+A100 GPUs are preferred for training all model sizes, and are the only GPUs that can train the 12B param model in a reasonable amount of time.
+As such, this is the default configuration, as set in the `a100_config.json` deepspeed config file.

#### A10 GPUs

Training the 12B param model is not recommended on A10s.

-To train the 6.9B param model on A10 instances (ex: `g5.24xlarge`, 4 x A10 24GB; `Standard_NV72ads_A10_v5`, 2 x A10), make the following changes:
+To train the 6.9B param model on A10 instances (ex: `g5.24xlarge`, 4 x A10 24GB; `Standard_NV72ads_A10_v5`, 2 x A10),
+simply select `a10` from the `gpu_family` widget and enter the number of GPUs available in the `num_gpus` widget, then run the rest of the code.
+This will use the `a10_config.json` deepspeed config file, which makes the following changes:

-- Set `per-device-train-batch-size` and `per-device-eval-batch-size` to 3 in the `train_dolly.py` invocation of `deepspeed`
-- Modify the deepspeed config file `ds_z3_bf16_config.json` to configure optimizer offload. Within the `"zero_optimization"` section, add:
+- `per-device-train-batch-size` and `per-device-eval-batch-size` are set to 3 in the `train_dolly.py` invocation of `deepspeed`
+- Within the `"zero_optimization"` section of the deepspeed config, we have added:
```
  "offload_optimizer": {
    "device": "cpu",
    "pin_memory": true
  },
```
-- Set the `num_gpus` widget in `train_dolly` to the number of GPUs in your instance, such as 2 or 4, before running
-
-To train the 2.8B param model:
-
-- Instead, only set `per-device-train-batch-size` and `per-device-eval-batch-size` to 3 in the `train_dolly.py` invocation of `deepspeed`

#### V100 GPUs

-To run on V100 instances with 32GB of GPU memory (ex: `p3dn.24xlarge` or `Standard_ND40rs_v2`), follow instructions above, and add:
+To run on V100 instances with 32GB of GPU memory (ex: `p3dn.24xlarge` or `Standard_ND40rs_v2`),
+simply select `v100` from the `gpu_family` widget and enter the number of GPUs available in the `num_gpus` widget, and then run the rest of the code.
+This will use the `v100_config.json` deepspeed config file, which makes the following changes:

-- Modify `training/trainer.py` to disable `bf16` and enable `fp16` in `TrainingArguments`:
-```
-...
-fp16=True,
-bf16=False,
-...
-```
+- It makes the changes described above for A10s
+- It enables fp16 floating point format
+- It sets the `per-device-train-batch-size` and `per-device-eval-batch-size` to 3

You may be able to slightly increase the batch size with 32GB instances, compared to what works above for 24GB A10s.

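For reference, the widget choice boils down to picking one of the three config files and a per-device batch size before launching `deepspeed`. Below is a minimal sketch of that mapping, mirroring the logic this commit adds to `train_dolly.py`; the `training_settings` helper and its `repo_root` argument are illustrative and not part of the repo.

```python
import os

def training_settings(gpu_family: str, repo_root: str = "."):
    """Map a GPU family ("a100", "a10", "v100") to its deepspeed config and batch size."""
    # Each family has a matching JSON file under config/.
    deepspeed_config = os.path.join(repo_root, "config", f"{gpu_family}_config.json")
    # Larger per-device batches on GPUs with more memory; 3 is the fallback (used for V100).
    batch_size = {"a100": 6, "a10": 4}.get(gpu_family, 3)
    return deepspeed_config, batch_size

print(training_settings("a10"))   # ('./config/a10_config.json', 4)
print(training_settings("v100"))  # ('./config/v100_config.json', 3)
```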
config/a10_config.json

Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
{
  "bf16": {
    "enabled": "auto"
  },
  "optimizer": {
    "type": "AdamW",
    "params": {
      "lr": "auto",
      "betas": "auto",
      "eps": "auto",
      "weight_decay": "auto"
    }
  },
  "scheduler": {
    "type": "WarmupLR",
    "params": {
      "warmup_min_lr": "auto",
      "warmup_max_lr": "auto",
      "warmup_num_steps": "auto"
    }
  },
  "zero_optimization": {
    "stage": 3,
    "overlap_comm": true,
    "contiguous_gradients": true,
    "sub_group_size": 1e9,
    "reduce_bucket_size": "auto",
    "stage3_prefetch_bucket_size": "auto",
    "stage3_param_persistence_threshold": "auto",
    "stage3_max_live_parameters": 1e9,
    "stage3_max_reuse_distance": 1e9,
    "stage3_gather_16bit_weights_on_model_save": true,
    "offload_optimizer": {
      "device": "cpu",
      "pin_memory": true
    }
  },
  "gradient_accumulation_steps": "auto",
  "gradient_clipping": "auto",
  "steps_per_print": 2000,
  "train_batch_size": "auto",
  "train_micro_batch_size_per_gpu": "auto",
  "wall_clock_breakdown": false
}

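The memory savings on A10s come from ZeRO stage 3 combined with offloading optimizer state to pinned CPU memory. As a quick sanity check, the committed file can be inspected with the standard library; a minimal sketch, assuming it is run from the repo root so the relative path resolves:

```python
import json

# Load the A10 deepspeed config committed above.
with open("config/a10_config.json") as f:
    cfg = json.load(f)

zero = cfg["zero_optimization"]
print(zero["stage"])              # 3 -> parameters, gradients, and optimizer state are partitioned
print(zero["offload_optimizer"])  # {'device': 'cpu', 'pin_memory': True} -> optimizer state lives in CPU RAM
print(cfg["bf16"])                # {'enabled': 'auto'} -> precision is left to the trainer arguments
```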
config/v100_config.json

Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
{
  "fp16": {
    "enabled": true
  },
  "optimizer": {
    "type": "AdamW",
    "params": {
      "lr": "auto",
      "betas": "auto",
      "eps": "auto",
      "weight_decay": "auto"
    }
  },
  "scheduler": {
    "type": "WarmupLR",
    "params": {
      "warmup_min_lr": "auto",
      "warmup_max_lr": "auto",
      "warmup_num_steps": "auto"
    }
  },
  "zero_optimization": {
    "stage": 3,
    "overlap_comm": true,
    "contiguous_gradients": true,
    "sub_group_size": 1e9,
    "reduce_bucket_size": "auto",
    "stage3_prefetch_bucket_size": "auto",
    "stage3_param_persistence_threshold": "auto",
    "stage3_max_live_parameters": 1e9,
    "stage3_max_reuse_distance": 1e9,
    "stage3_gather_16bit_weights_on_model_save": true,
    "offload_optimizer": {
      "device": "cpu",
      "pin_memory": true
    }
  },
  "gradient_accumulation_steps": "auto",
  "gradient_clipping": "auto",
  "steps_per_print": 2000,
  "train_batch_size": "auto",
  "train_micro_batch_size_per_gpu": "auto",
  "wall_clock_breakdown": false
}

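Compared with `a10_config.json`, the only change is the precision block: V100s predate bfloat16 support, so fp16 is enabled explicitly instead of leaving bf16 on auto. A small sketch that makes the difference explicit, again assuming paths relative to the repo root:

```python
import json

with open("config/a10_config.json") as f:
    a10 = json.load(f)
with open("config/v100_config.json") as f:
    v100 = json.load(f)

# The only top-level keys that differ are the precision sections.
print(set(a10) - set(v100))  # {'bf16'}
print(set(v100) - set(a10))  # {'fp16'}

# Everything the two configs share (optimizer, scheduler, ZeRO-3 + CPU offload) is identical.
print(all(a10[k] == v100[k] for k in set(a10) & set(v100)))  # True
```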
train_dolly.py

Lines changed: 53 additions & 31 deletions
@@ -50,10 +50,10 @@

# COMMAND ----------

-#!wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/libcusparse-dev-11-7_11.7.3.50-1_amd64.deb -O /tmp/libcusparse-dev-11-7_11.7.3.50-1_amd64.deb && \
-# wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/libcublas-dev-11-7_11.10.1.25-1_amd64.deb -O /tmp/libcublas-dev-11-7_11.10.1.25-1_amd64.deb && \
-# wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/libcusolver-dev-11-7_11.4.0.1-1_amd64.deb -O /tmp/libcusolver-dev-11-7_11.4.0.1-1_amd64.deb && \
-# wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/libcurand-dev-11-7_10.2.10.91-1_amd64.deb -O /tmp/libcurand-dev-11-7_10.2.10.91-1_amd64.deb && \
+#!wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/libcusparse-dev-11-7_11.7.3.50-1_amd64.deb -O /tmp/libcusparse-dev-11-7_11.7.3.50-1_amd64.deb && \
+# wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/libcublas-dev-11-7_11.10.1.25-1_amd64.deb -O /tmp/libcublas-dev-11-7_11.10.1.25-1_amd64.deb && \
+# wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/libcusolver-dev-11-7_11.4.0.1-1_amd64.deb -O /tmp/libcusolver-dev-11-7_11.4.0.1-1_amd64.deb && \
+# wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/libcurand-dev-11-7_10.2.10.91-1_amd64.deb -O /tmp/libcurand-dev-11-7_10.2.10.91-1_amd64.deb && \
# dpkg -i /tmp/libcusparse-dev-11-7_11.7.3.50-1_amd64.deb && \
# dpkg -i /tmp/libcublas-dev-11-7_11.10.1.25-1_amd64.deb && \
# dpkg -i /tmp/libcusolver-dev-11-7_11.4.0.1-1_amd64.deb && \
@@ -91,6 +91,7 @@
dbutils.widgets.text("local_training_root", "", "local_training_root")
dbutils.widgets.text("dbfs_output_root", "", "dbfs_output_root")
dbutils.widgets.text("experiment_id", "", "experiment_id")
+dbutils.widgets.combobox("gpu_family", "a100", ["v100", "a10", "a100"])

# COMMAND ----------

@@ -112,9 +113,6 @@

checkpoint_dir_name = f"{model_name}__{timestamp}"

-root_path = os.getcwd()
-deepspeed_config = os.path.join(root_path, "config/ds_z3_bf16_config.json")
-
dolly_training_dir_name = "dolly_training"

# Use the local training root path if it was provided. Otherwise try to find a sensible default.
@@ -136,19 +134,32 @@

local_output_dir = os.path.join(local_training_root, checkpoint_dir_name)
dbfs_output_dir = os.path.join(dbfs_output_root, checkpoint_dir_name)
+tensorboard_display_dir = f"{local_output_dir}/runs"
+
+print(f"Local Output Dir: {local_output_dir}")
+print(f"DBFS Output Dir: {dbfs_output_dir}")
+print(f"Tensorboard Display Dir: {tensorboard_display_dir}")
+
+# pick an appropriate config file
+gpu_family = dbutils.widgets.get("gpu_family")
+config_file_name = f"{gpu_family}_config.json"
+deepspeed_config = os.path.join(os.getcwd(), "config", config_file_name)
+print(f"Deepspeed config file: {deepspeed_config}")
+
+# configure the batch_size
+batch_size = 3
+if gpu_family == "a10":
+    batch_size = 4
+elif gpu_family == "a100":
+    batch_size = 6

+# configure num_gpus, if specified
num_gpus_flag = ""
num_gpus = dbutils.widgets.get("num_gpus")
if num_gpus:
    num_gpus = int(num_gpus)
    num_gpus_flag = f"--num_gpus={num_gpus}"

-tensorboard_display_dir = f"{local_output_dir}/runs"
-
-print(f"Local Output Dir: {local_output_dir}")
-print(f"DBFS Output Dir: {dbfs_output_dir}")
-print(f"Tensorboard Display Dir: {tensorboard_display_dir}")
-
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# COMMAND ----------
@@ -158,28 +169,28 @@

# COMMAND ----------

-# MAGIC !deepspeed {num_gpus_flag} \
-# MAGIC --module training.trainer \
-# MAGIC --input-model {input_model} \
-# MAGIC --deepspeed {deepspeed_config} \
-# MAGIC --epochs 2 \
-# MAGIC --local-output-dir {local_output_dir} \
-# MAGIC --dbfs-output-dir {dbfs_output_dir} \
-# MAGIC --per-device-train-batch-size 6 \
-# MAGIC --per-device-eval-batch-size 6 \
-# MAGIC --logging-steps 10 \
-# MAGIC --save-steps 200 \
-# MAGIC --save-total-limit 20 \
-# MAGIC --eval-steps 50 \
-# MAGIC --warmup-steps 50 \
-# MAGIC --test-size 200 \
-# MAGIC --lr 5e-6
+!deepspeed {num_gpus_flag} \
+    --module training.trainer \
+    --input-model {input_model} \
+    --deepspeed {deepspeed_config} \
+    --epochs 2 \
+    --local-output-dir {local_output_dir} \
+    --dbfs-output-dir {dbfs_output_dir} \
+    --per-device-train-batch-size {batch_size} \
+    --per-device-eval-batch-size {batch_size} \
+    --logging-steps 10 \
+    --save-steps 200 \
+    --save-total-limit 20 \
+    --eval-steps 50 \
+    --warmup-steps 50 \
+    --test-size 200 \
+    --lr 5e-6

# COMMAND ----------

from training.generate import generate_response, load_model_tokenizer_for_generate

-model, tokenizer = load_model_tokenizer_for_generate(local_output_dir)
+model, tokenizer = load_model_tokenizer_for_generate(dbfs_output_dir)

# COMMAND ----------

@@ -192,8 +203,19 @@
    "Give me a list of 5 science fiction books I should read next.",
]

+# set some additional pipeline args
+pipeline_kwargs = {'torch_dtype': "auto"}
+if gpu_family == "v100":
+    pipeline_kwargs['torch_dtype'] = "float16"
+elif gpu_family == "a10" or gpu_family == "a100":
+    pipeline_kwargs['torch_dtype'] = "bfloat16"
+
# Use the model to generate responses for each of the instructions above.
for instruction in instructions:
-    response = generate_response(instruction, model=model, tokenizer=tokenizer)
+    response = generate_response(instruction, model=model, tokenizer=tokenizer, **pipeline_kwargs)
    if response:
        print(f"Instruction: {instruction}\n\n{response}\n\n-----------\n")
+
+# COMMAND ----------
+

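The `pipeline_kwargs` added above keep inference precision aligned with training precision: float16 on V100s, bfloat16 on A10s and A100s. Below is a minimal sketch of the same selection expressed as torch dtypes; the `generation_dtype` helper is illustrative, and it assumes, as the kwarg name suggests, that `generate_response` forwards `torch_dtype` to a Hugging Face pipeline.

```python
import torch

def generation_dtype(gpu_family):
    """Pick an inference dtype that matches how the model was trained (illustrative helper)."""
    if gpu_family == "v100":
        return torch.float16   # V100s have no bfloat16 support
    if gpu_family in ("a10", "a100"):
        return torch.bfloat16  # Ampere-class GPUs train and serve in bf16
    return "auto"              # let the library decide for anything else

print(generation_dtype("v100"))  # torch.float16
print(generation_dtype("a100"))  # torch.bfloat16
```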
training/trainer.py

Lines changed: 5 additions & 2 deletions
@@ -232,14 +232,17 @@ def train(
        tokenizer=tokenizer, mlm=False, return_tensors="pt", pad_to_multiple_of=8
    )

+    # enable fp16 if not bf16
+    fp16 = not bf16
+
    if not dbfs_output_dir:
        logger.warn("Will NOT save to DBFS")

    training_args = TrainingArguments(
        output_dir=local_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=per_device_eval_batch_size,
-        fp16=False,
+        fp16=fp16,
        bf16=bf16,
        learning_rate=lr,
        num_train_epochs=epochs,
@@ -316,7 +319,7 @@ def train(
    default=True,
    help="Provided by deepspeed to identify which instance this process is when performing multi-GPU training.",
)
-@click.option("--bf16", type=bool, default=True, help="Whether to use bf16 (preferred on A100's).")
+@click.option("--bf16", type=bool, default=None, help="Whether to use bf16 (preferred on A100's).")
def main(**kwargs):
    train(**kwargs)

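The net effect of the trainer change is that exactly one half-precision mode is requested: fp16 is simply the complement of whatever `--bf16` resolves to. A minimal sketch (not the project's code) of the values that end up in `TrainingArguments` for each flag value:

```python
def precision_args(bf16):
    # Mirrors the new logic in training/trainer.py: enable fp16 whenever bf16 is off or unset.
    fp16 = not bf16
    return {"fp16": fp16, "bf16": bf16}

print(precision_args(True))   # --bf16 true (A100/A10): {'fp16': False, 'bf16': True}
print(precision_args(False))  # --bf16 false (V100):    {'fp16': True, 'bf16': False}
print(precision_args(None))   # new default (unset):    {'fp16': True, 'bf16': None}
```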