add file; only d2t when training

h-guo18 · h-guo18 · commit 52dbdec5c6af · 2025-09-05T19:08:39.000Z
Signed-off-by: h-guo18 &lt;67671475+h-guo18@users.noreply.github.com&gt;
diff --git a/examples/speculative_decoding/launch_train.sh b/examples/speculative_decoding/launch_train.sh
@@ -0,0 +1,157 @@
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -eo pipefail
+
+while [ $# -gt 0 ]; do
+  case "$1" in
+    --training_seq_len*)
+      if [[ "$1" != *=* ]]; then shift; fi
+      TRAINING_SEQ_LEN="${1#*=}"
+      ;;
+    --model*)
+      if [[ "$1" != *=* ]]; then shift; fi
+      MODEL="${1#*=}"
+      ;;
+    --data*)
+      if [[ "$1" != *=* ]]; then shift; fi
+      DATA="${1#*=}"
+      ;;
+    --mode*)
+      if [[ "$1" != *=* ]]; then shift; fi
+      MODE="${1#*=}"
+      ;;
+    --output_dir*)
+      if [[ "$1" != *=* ]]; then shift; fi
+      OUTPUT_DIR="${1#*=}"
+      ;;
+    --num_epochs*)
+      if [[ "$1" != *=* ]]; then shift; fi
+      NUM_EPOCHS="${1#*=}"
+      ;;
+    --save_steps*)
+      if [[ "$1" != *=* ]]; then shift; fi
+      SAVE_STEPS="${1#*=}"
+      ;;
+    --lr*)
+      if [[ "$1" != *=* ]]; then shift; fi
+      LR="${1#*=}"
+      ;;
+    --train_bs*)
+      if [[ "$1" != *=* ]]; then shift; fi
+      TRAIN_BS="${1#*=}"
+      ;;
+    --medusa_num_heads*)
+      if [[ "$1" != *=* ]]; then shift; fi
+      MEDUSA_NUM_HEADS="${1#*=}"
+      ;;
+    --medusa_num_layers*)
+      if [[ "$1" != *=* ]]; then shift; fi
+      MEDUSA_NUM_LAYERS="${1#*=}"
+      ;;
+    --eagle_config*)
+      if [[ "$1" != *=* ]]; then shift; fi
+      EAGLE_CONFIG="${1#*=}"
+      ;;
+    --fsdp_transformer_layer_cls_to_wrap*)
+      if [[ "$1" != *=* ]]; then shift; fi
+      FSDP_TRANSFORMER_LAYER_CLS_TO_WRAP="${1#*=}"
+      ;;
+    --num_gpu*)
+      if [[ "$1" != *=* ]]; then shift; fi
+      NUM_GPU="${1#*=}"
+      ;;
+    *)
+      >&2 printf "Error: Invalid argument ${1#*=}\n"
+      exit 1
+      ;;
+  esac
+  shift
+done
+
+set -x
+
+# Get the default value for save_steps based on the available number of GPUs
+GPU_COUNT=$(python -c "import torch; print(torch.cuda.device_count())")
+# Calculate save_steps
+DEFAULT_SAVE_STEPS=$((8192 / GPU_COUNT))
+
+MODEL=${MODEL:-"TinyLlama/TinyLlama-1.1B-Chat-v1.0"}
+MODE=${MODE:-"eagle3"}
+# Set default OUTPUT_DIR to ckpts/{modelname}, where {modelname} is the last part of the model path
+MODEL_BASENAME=$(basename "$MODEL")
+OUTPUT_DIR=${OUTPUT_DIR:-"ckpts/${MODEL_BASENAME}-$(date +%Y%m%d_%H%M)"}
+NUM_EPOCHS=${NUM_EPOCHS:-1}
+SAVE_STEPS=${SAVE_STEPS:-$DEFAULT_SAVE_STEPS}
+LR=${LR:-"1e-4"}
+TRAIN_BS=${TRAIN_BS:-4}
+MEDUSA_NUM_HEADS=${MEDUSA_NUM_HEADS:-1}
+MEDUSA_NUM_LAYERS=${MEDUSA_NUM_LAYERS:-1}
+REDRAFTER_TOKENS=${REDRAFTER_TOKENS:-1}
+REDRAFTER_NUM_LAYERS=${REDRAFTER_NUM_LAYERS:-1}
+FSDP_TRANSFORMER_LAYER_CLS_TO_WRAP=${FSDP_TRANSFORMER_LAYER_CLS_TO_WRAP:-"LlamaDecoderLayer"}
+NUM_GPU=${NUM_GPU:-1}
+TRAINING_SEQ_LEN=${TRAINING_SEQ_LEN:-512}
+
+if [[ "$MODE" == "medusa" ]]; then
+  SPECULATIVE_ARGS="--medusa_num_heads $MEDUSA_NUM_HEADS --medusa_num_layers $MEDUSA_NUM_LAYERS"
+elif [[ "$MODE" == "eagle1" || "$MODE" == "eagle3" ]]; then
+  if [[ -n "$EAGLE_CONFIG" ]]; then
+    SPECULATIVE_ARGS="--eagle_config $EAGLE_CONFIG"
+  else
+    SPECULATIVE_ARGS=""
+  fi
+else
+  echo "Only medusa, eagle1, eagle3 supported for now!"
+  exit 1
+fi
+
+if [[ "$NUM_GPU" == 1 ]]; then
+  MULTI_GPU=""
+else
+  MULTI_GPU="--multi_gpu"
+fi
+
+# Disable tokenizers parallelism to avoid warning
+export TOKENIZERS_PARALLELISM=False
+CMD="accelerate launch $MULTI_GPU --mixed_precision bf16 main.py \
+    --mode $MODE \
+    --model_name_or_path $MODEL \
+    --training_seq_len $TRAINING_SEQ_LEN \
+    --dataloader_drop_last True \
+    --bf16 True \
+    --output_dir $OUTPUT_DIR \
+    --num_train_epochs $NUM_EPOCHS \
+    --per_device_train_batch_size $TRAIN_BS \
+    --per_device_eval_batch_size $TRAIN_BS \
+    --gradient_accumulation_steps 1 \
+    --do_eval False \
+    --eval_accumulation_steps 1 \
+    --save_strategy steps \
+    --save_steps $SAVE_STEPS \
+    --learning_rate $LR \
+    --weight_decay 0.0 \
+    --warmup_steps 100 \
+    --lr_scheduler_type linear \
+    --logging_steps 100 \
+    --tf32 True \
+    --data_path $DATA \
+    $SPECULATIVE_ARGS
+"
+
+start_time=$(date +%s)
+sh -c "$CMD"
+echo "Total time taken: $(( $(date +%s) - $start_time )) seconds"
diff --git a/modelopt/torch/speculative/plugins/transformers.py b/modelopt/torch/speculative/plugins/transformers.py
@@ -713,7 +713,7 @@ def _base_model_forward(
                 base_model_loss = loss_fct(loss_logits, labels)
 
         # Map the base model logits to the draft vocab
-        if self.eagle_config.draft_vocab_size != self.eagle_config.vocab_size:
+        if self.eagle_config.draft_vocab_size != self.eagle_config.vocab_size and self.training:
             base_model_logits = self._map_logits_to_draft_vocab(base_model_logits)
 
         return base_model_hidden_states, base_model_logits, base_model_loss, past_key_values