
Commit 2a4d8b0

Your Name committed:

Fixed FSDP2 QATTrainer: restore modelopt state before loading weights; cleaned up QATTrainer.
Added QAT example tests for the various backends; minor fixes.

1 parent 1ef1d72 commit 2a4d8b0

14 files changed, +296 -314 lines changed

examples/llm_qat/accelerate_config/deepspeed.yaml

Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
+compute_environment: LOCAL_MACHINE
+debug: false
+deepspeed_config:
+  gradient_clipping: 1.0
+  offload_optimizer_device: none
+  offload_param_device: none
+  zero3_init_flag: true
+  zero3_save_16bit_model: true
+  zero_stage: 3
+distributed_type: DEEPSPEED
+downcast_bf16: 'no'
+enable_cpu_affinity: false
+machine_rank: 0
+main_training_function: main
+mixed_precision: bf16
+num_machines: 1
+num_processes: gpu
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false

examples/llm_qat/convert_sharded_ckpt.py

Lines changed: 0 additions & 55 deletions
This file was deleted.

examples/llm_qat/launch.sh

Lines changed: 52 additions & 23 deletions
@@ -100,14 +100,18 @@ while [ $# -gt 0 ]; do
     if [[ "$1" != *=* ]]; then shift; fi
     FSDP_TRANSFORMER_LAYER_CLS_TO_WRAP="${1#*=}"
     ;;
-  --use_fsdp2*)
-    if [[ "$1" != *=* ]]; then shift; fi
-    USE_FSDP2="${1#*=}"
-    ;;
   --max_seq_length*)
     if [[ "$1" != *=* ]]; then shift; fi
     MAX_SEQ_LENGTH="${1#*=}"
     ;;
+  --backend*)
+    if [[ "$1" != *=* ]]; then shift; fi
+    BACKEND="${1#*=}"
+    ;;
+  --use_fsdp2*)
+    if [[ "$1" != *=* ]]; then shift; fi
+    USE_FSDP2="${1#*=}"
+    ;;
   *)
     >&2 printf "Error: Invalid argument ${1#*=}\n"
     exit 1
@@ -142,6 +146,7 @@ COMPRESS=${COMPRESS:-"False"}
 DISTILL=${DISTILL:-"False"}
 TEACHER_MODEL=${TEACHER_MODEL:-$MODEL}
 FSDP_TRANSFORMER_LAYER_CLS_TO_WRAP=${FSDP_TRANSFORMER_LAYER_CLS_TO_WRAP:-"LlamaDecoderLayer"}
+BACKEND=${BACKEND:-"fsdp1"}

 if [ -z $QUANT_CFG ]; then
   QUANT_ARGS=""
@@ -154,31 +159,56 @@ if [ ! -z $MAX_STEPS ]; then
   OPTIONAL_ARGS="$OPTIONAL_ARGS --max_steps $MAX_STEPS"
 fi

-CONFIG_FILE="fsdp1.yaml"
-FSDP_ARGS="--fsdp_transformer_layer_cls_to_wrap $FSDP_TRANSFORMER_LAYER_CLS_TO_WRAP"
-GRADIENT_CHECKPOINTING_ARGS="--gradient_checkpointing True"
-
+# Set backend based on --backend parameter, with backward compatibility for --use_fsdp2
 if [[ "${USE_FSDP2,,}" == "true" ]]; then
-  echo "Using FSDP2 instead of FSDP1. FSDP2 is not mature yet! Please use it with latest torch and transformers."
-  CONFIG_FILE="fsdp2.yaml"
-  GRADIENT_CHECKPOINTING_ARGS=""
+  echo "Warning: --use_fsdp2 is deprecated. Use --backend=fsdp2 instead."
+  BACKEND="fsdp2"
+fi
+
+# if compress is true, set backend to ddp
+if [[ "${COMPRESS,,}" == "true" ]]; then
+  BACKEND="ddp"
 fi

+# Configure backend-specific settings
+case "${BACKEND,,}" in
+  "fsdp1"|"fsdp")
+    CONFIG_FILE="fsdp1.yaml"
+    FSDP_ARGS="--fsdp_transformer_layer_cls_to_wrap $FSDP_TRANSFORMER_LAYER_CLS_TO_WRAP"
+    GRADIENT_CHECKPOINTING_ARGS="--gradient_checkpointing True"
+    ;;
+  "fsdp2")
+    echo "Using FSDP2 instead of FSDP1. FSDP2 is not mature yet! Please use it with latest torch and transformers."
+    CONFIG_FILE="fsdp2.yaml"
+    FSDP_ARGS="--fsdp_transformer_layer_cls_to_wrap $FSDP_TRANSFORMER_LAYER_CLS_TO_WRAP"
+    GRADIENT_CHECKPOINTING_ARGS=""
+    ;;
+  "ddp")
+    CONFIG_FILE="ddp.yaml"
+    FSDP_ARGS=""
+    GRADIENT_CHECKPOINTING_ARGS="--gradient_checkpointing True"
+    ;;
+  "deepspeed")
+    CONFIG_FILE="deepspeed.yaml"
+    FSDP_ARGS=""
+    GRADIENT_CHECKPOINTING_ARGS="--gradient_checkpointing True"
+    ;;
+  *)
+    echo "Error: Invalid backend '$BACKEND'. Supported backends: fsdp1, fsdp2, ddp, deepspeed"
+    exit 1
+    ;;
+esac
+
+# TODO: Remove this after simple distillation is supported
 DISTILLATION_ARGS=""
 if [[ "${DISTILL,,}" == "true" ]]; then
   DISTILLATION_ARGS="--distill $DISTILL --teacher_model $TEACHER_MODEL"
-  # Distillation does not work with memory efficient loading
-  FSDP_ARGS="$FSDP_ARGS --fsdp_cpu_ram_efficient_loading False"
+  # Distillation does not work with memory efficient loading for FSDP
+  if [[ "${BACKEND,,}" == "fsdp1" || "${BACKEND,,}" == "fsdp2" ]]; then
+    FSDP_ARGS="$FSDP_ARGS --fsdp_cpu_ram_efficient_loading False"
+  fi
 fi

-# real quantization does not work with FSDP, only works with FSDP2
-if [[ "${COMPRESS,,}" == "true" && "${USE_FSDP2,,}" != "true" ]]; then
-  echo "Compression is not supported with FSDP. Disabling FSDP and using DDP."
-  FSDP_ARGS=""
-  CONFIG_FILE="ddp.yaml"
-fi
-
-
 CMD="accelerate launch --config-file accelerate_config/$CONFIG_FILE $FSDP_ARGS \
   main.py \
   --model_name_or_path $MODEL \
@@ -214,5 +244,4 @@ CMD="accelerate launch --config-file accelerate_config/$CONFIG_FILE $FSDP_ARGS \

 start_time=$(date +%s)
 sh -c "$CMD"
-echo "Total time taken: $(( $(date +%s) - $start_time )) seconds"
-python convert_sharded_ckpt.py --hf_model_path $MODEL --sharded_ckpt_path $OUTPUT_DIR --output_path $OUTPUT_DIR
+echo "Total time taken: $(( $(date +%s) - $start_time )) seconds"

examples/llm_qat/main.py

Lines changed: 11 additions & 19 deletions
@@ -38,18 +38,15 @@
 from transformers.trainer_utils import get_last_checkpoint
 from utils import (
     get_lora_config,
+    get_metrics_with_perplexity,
     make_supervised_data_module,
     monkey_patch_training_step_to_fix_memory_leak,
 )

 import modelopt.torch.opt as mto
 import modelopt.torch.quantization as mtq
 from modelopt.torch.distill.plugins.huggingface import LMLogitsLoss
-from modelopt.torch.quantization.plugins.transformers_trainer import (
-    QADTrainer,
-    QATTrainer,
-    get_metrics_with_perplexity,
-)
+from modelopt.torch.quantization.plugins.transformers_trainer import QADTrainer, QATTrainer
 from modelopt.torch.utils import print_rank_0

 # Enable automatic save/load of modelopt state huggingface checkpointing
@@ -263,22 +260,17 @@ def train():

     if training_args.do_train:
         trainer.train(resume_from_checkpoint=checkpoint)
+        print_rank_0("Training completed.")

     if training_args.do_eval:
-        if not training_args.do_train:
-            # trainer.evaluate() will not prepare the model properly, especially for FSDP2,
-            # so we use the ``eval_on_start`` flag to evaluate the model and skip the training.
-            trainer.train(resume_from_checkpoint=checkpoint, eval_only=True)
-        else:
-            metrics = trainer.evaluate()
-            metrics = get_metrics_with_perplexity(metrics)
-            print_rank_0(f"Evaluation results: \n{metrics}")
-
-    if training_args.do_train or quant_args.quant_cfg is not None:
-        print_rank_0("Saving the model...")
-        trainer.save_state()
-        kwargs = {"export_student": True} if training_args.distill else {}
-        trainer.save_model(training_args.output_dir, **kwargs)
+        metrics = trainer.evaluate()
+        metrics = get_metrics_with_perplexity(metrics)
+        print_rank_0(f"Evaluation results: \n{metrics}")
+
+    print_rank_0("Saving the model...")
+    trainer.save_state()
+    kwargs = {"export_student": True} if training_args.distill else {}
+    trainer.save_model(training_args.output_dir, **kwargs)


 if __name__ == "__main__":

examples/llm_qat/simple_qat_train.py

Lines changed: 2 additions & 2 deletions
@@ -87,7 +87,7 @@ def parse_args() -> argparse.Namespace:
     parser.add_argument(
         "--quant-cfg",
         type=str,
-        default=mtq.NVFP4_DEFAULT_CFG,
+        default="NVFP4_DEFAULT_CFG",
         choices=mtq.config.choices,
         help="Quantization configuration",
     )
@@ -121,7 +121,7 @@ def calibrate(m: nn.Module):
             m(batch["input_ids"].to(device))

     # Quantize the model
-    model = mtq.quantize(model, args.quant_cfg, calibrate)
+    model = mtq.quantize(model, getattr(mtq, args.quant_cfg), calibrate)

     # Initialize optimizer
     optimizer = AdamW(model.parameters(), lr=args.lr)
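For context on this change: the argparse default now has to be a string so it can be compared against the string choices, and the name is resolved to the actual config object only at quantize time. A minimal sketch of that pattern, assuming `mtq.config.choices` enumerates config names as strings (which the string default implies) and that modelopt is installed:

import argparse

import modelopt.torch.quantization as mtq

parser = argparse.ArgumentParser()
# The default must be a string so it lines up with the string choices.
parser.add_argument("--quant-cfg", type=str, default="NVFP4_DEFAULT_CFG", choices=mtq.config.choices)
args = parser.parse_args([])

# Resolve the configuration name to the actual config object only when it is needed.
quant_cfg = getattr(mtq, args.quant_cfg)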

examples/llm_qat/utils.py

Lines changed: 6 additions & 0 deletions
@@ -167,3 +167,9 @@ def new_func(original_f_name, trainer, *args, **kwargs):
     setattr(
         trainer, f_name, types.MethodType(partial(new_func, "_original_" + f_name), trainer)
     )
+
+
+def get_metrics_with_perplexity(metrics):
+    """Add perplexity to the metrics."""
+    metrics = {"perplexity": float(torch.exp(torch.tensor(metrics["eval_loss"]))), **metrics}
+    return metrics
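A quick illustration of the helper with a hypothetical metrics dict (not from the source): an `eval_loss` of 2.0 maps to a perplexity of e^2 ≈ 7.39.

import torch


def get_metrics_with_perplexity(metrics):
    """Add perplexity to the metrics."""
    metrics = {"perplexity": float(torch.exp(torch.tensor(metrics["eval_loss"]))), **metrics}
    return metrics


# Hypothetical evaluation output, used only to illustrate perplexity = exp(eval_loss).
print(get_metrics_with_perplexity({"eval_loss": 2.0}))
# -> {'perplexity': 7.389..., 'eval_loss': 2.0}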

modelopt/torch/opt/conversion.py

Lines changed: 1 addition & 1 deletion
@@ -380,7 +380,7 @@ def apply_mode(
         return model.init_modellike() if isinstance(model, ModelLikeModule) else model

     # check if the model is in a wrapper
-    model = unwrap_model(model, raise_error=True)
+    model = unwrap_model(model, force_unwrap=True)

     # standardize mode to ModeConfigList
     mode_and_config = get_mode_config(mode)

modelopt/torch/opt/plugins/peft.py

Lines changed: 3 additions & 8 deletions
@@ -57,14 +57,9 @@ def _new_save_pretrained_peft(self, save_directory, *args, **kwargs):
     # So we need to save the quantizer state_dict separately

     # TODO: Move this to modelopt.torch.quantization.plugins.peft
-    from modelopt.torch.quantization.nn import TensorQuantizer
-
-    # We should not call self/model.state_dict() here. HF Trainer calls model.save_pretrained() only from process 0
-    # With FSDP, model.state_dict() will hang if it is not called from all processes
-    quantizer_state_dict = {}
-    for name, module in self.named_modules():
-        if isinstance(module, TensorQuantizer):
-            quantizer_state_dict[get_unwrapped_name(name)] = module.state_dict()
+    from modelopt.torch.quantization.utils import get_quantizer_state_dict
+
+    quantizer_state_dict = get_quantizer_state_dict(self)
     if len(quantizer_state_dict) > 0:
         torch.save(quantizer_state_dict, _get_quantizer_state_save_path(save_directory))
     return outputs
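The deleted inline loop hints at what the new `get_quantizer_state_dict` helper does; below is a minimal sketch reconstructed from that loop. The real implementation in `modelopt.torch.quantization.utils` may differ, and the import path of `get_unwrapped_name` is an assumption.

import torch.nn as nn

from modelopt.torch.quantization.nn import TensorQuantizer
from modelopt.torch.utils import get_unwrapped_name  # assumed import path


def get_quantizer_state_dict(model: nn.Module) -> dict:
    """Collect the state_dict of every TensorQuantizer, keyed by its unwrapped module name."""
    quantizer_state_dict = {}
    for name, module in model.named_modules():
        if isinstance(module, TensorQuantizer):
            quantizer_state_dict[get_unwrapped_name(name)] = module.state_dict()
    return quantizer_state_dict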

modelopt/torch/quantization/nn/modules/quant_module.py

Lines changed: 4 additions & 2 deletions
@@ -158,8 +158,10 @@ class QuantLinearConvBase(QuantInputBase):
     def quantize_weight(self):
         """Context in which `self.weight` is quantized."""
         self._enable_weight_quantization = True
-        yield
-        self._enable_weight_quantization = False
+        try:
+            yield
+        finally:
+            self._enable_weight_quantization = False

     @staticmethod
     def _get_quantized_weight(module: "QuantLinearConvBase", weight: torch.Tensor) -> torch.Tensor:
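Why the try/finally matters: with a generator-based context manager, an exception raised inside the `with` block propagates out of the `yield`, so without `finally` the flag would stay set. A standalone toy sketch (not the modelopt class) illustrating the pattern:

from contextlib import contextmanager


class Toy:
    """Minimal stand-in for a module with a weight-quantization flag."""

    def __init__(self):
        self._enable_weight_quantization = False

    @contextmanager
    def quantize_weight(self):
        self._enable_weight_quantization = True
        try:
            yield
        finally:
            # Runs even if the body of the `with` block raises.
            self._enable_weight_quantization = False


m = Toy()
try:
    with m.quantize_weight():
        raise RuntimeError("forward pass failed")
except RuntimeError:
    pass

assert m._enable_weight_quantization is False  # flag restored despite the error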
