From f4cadcca3f180c8888b098fbfa4b2f4b52fa88a2 Mon Sep 17 00:00:00 2001 From: Ye Yu Date: Mon, 15 Sep 2025 12:06:23 -0700 Subject: [PATCH 01/27] remove detachedeagle and handle all offline mode in the eagle model Signed-off-by: Ye Yu --- .../torch/speculative/eagle/conversion.py | 11 +- .../speculative/plugins/megatron_eagle.py | 496 ++---------------- 2 files changed, 62 insertions(+), 445 deletions(-) diff --git a/modelopt/torch/speculative/eagle/conversion.py b/modelopt/torch/speculative/eagle/conversion.py index a033386d4..ecc0c37d0 100644 --- a/modelopt/torch/speculative/eagle/conversion.py +++ b/modelopt/torch/speculative/eagle/conversion.py @@ -24,7 +24,6 @@ from ..config import EagleConfig EagleDMRegistry = _DMRegistryCls(prefix="Eagle") # global instance for the registry -OfflineEagleDMRegistry = _DMRegistryCls(prefix="DetachedEagle") # global instance for the registry def convert_to_eagle_model(model: nn.Module, config: EagleConfig) -> ConvertReturnType: @@ -32,16 +31,14 @@ def convert_to_eagle_model(model: nn.Module, config: EagleConfig) -> ConvertRetu # initialize the true module if necessary model = model.init_modellike() if isinstance(model, ModelLikeModule) else model - registry = OfflineEagleDMRegistry if config.eagle_offline else EagleDMRegistry - original_cls = type(model) - if original_cls not in registry: - for cls in registry._registry: + if original_cls not in EagleDMRegistry: + for cls in EagleDMRegistry._registry: if issubclass(original_cls, cls): - registry.register({original_cls: "base_model_class"})(registry[cls]) + EagleDMRegistry.register({original_cls: "base_model_class"})(EagleDMRegistry[cls]) break - eagle_model = registry.convert(model) + eagle_model = EagleDMRegistry.convert(model) eagle_model.modify( eagle_offline=config.eagle_offline, eagle_hidden_state_distillation=config.eagle_hidden_state_distillation, diff --git a/modelopt/torch/speculative/plugins/megatron_eagle.py b/modelopt/torch/speculative/plugins/megatron_eagle.py index c2447367e..320c17fe6 100644 --- a/modelopt/torch/speculative/plugins/megatron_eagle.py +++ b/modelopt/torch/speculative/plugins/megatron_eagle.py @@ -52,7 +52,7 @@ from megatron.core.utils import make_tp_sharded_tensor_for_checkpoint from packaging.version import Version -from ..eagle.conversion import EagleDMRegistry, OfflineEagleDMRegistry +from ..eagle.conversion import EagleDMRegistry from ..eagle.eagle_model import EagleModel from ..utils import ( AcceptanceRateValidation, @@ -745,6 +745,10 @@ def modify( eagle_architecture_config=eagle_architecture_config, ) + # sequence_parallel is not used in offline eagle + if self.eagle_offline: + self.config.sequence_parallel = False + self.eagle_config = dict_to_config( eagle_architecture_config, self.config.use_cpu_initialization, @@ -760,6 +764,7 @@ def modify( # Use default aux_hidden_state layers if use_aux_hidden_state is True # but no layer id is given + # layer ids are not used in detached eagle, but we need to set this to have correct fc_input_size_multiplier if ( self.eagle_config.use_aux_hidden_state and len(self.eagle_config.eagle_aux_hidden_state_layer_ids) == 0 @@ -810,6 +815,8 @@ def modify( self.eagle_config.eagle_aux_hidden_state_layer_ids ) eagle_config.use_mtp_layernorm = self.eagle_config.use_mtp_layernorm + eagle_config.draft_vocab_size = self.eagle_config.draft_vocab_size + eagle_config.has_lm_head = self.eagle_config.has_lm_head self.eagle_module = EagleModule( eagle_config, self.rotary_pos_emb, @@ -843,18 +850,19 @@ def _get_eagle_input_hidden_states(self, 
hidden_states: torch.Tensor, apply_fc: hidden_states: last hidden_states apply_fc: whether to apply EAGLE3 fc """ - if len(self._aux_hidden_states) == 0: - return hidden_states + if not self.eagle_offline: + if len(self._aux_hidden_states) == 0: + return hidden_states - # [s / TP, b, len(self._aux_hidden_states) * h] - aux_hidden_states = torch.cat(self._aux_hidden_states, dim=-1) - self._aux_hidden_states.clear() + # [s / TP, b, len(self._aux_hidden_states) * h] + hidden_states = torch.cat(self._aux_hidden_states, dim=-1) + self._aux_hidden_states.clear() if apply_fc: # [s / TP, b, 3h] -> [s / TP, b, h] - return self.eagle_module.fc(aux_hidden_states)[0] + return self.eagle_module.fc(hidden_states)[0] else: - return aux_hidden_states + return hidden_states def _get_eagle_module_inputs( self, @@ -1065,13 +1073,15 @@ def _compute_eagle_loss(self, logits, labels, eagle_logits): """Compute the total loss for EAGLE. logits: [s, b, vocab // TP] - labels: [b, s] + labels: [b, s] or [b, s-1] for offline mode eagle_logits: [s, b, vocab // TP] """ # Compute lm loss (classification loss) or KLDivergence if self.eagle_self_logit_distillation: mapping = self.eagle_module.d2t if hasattr(self.eagle_module, "d2t") else None token_loss = self.kld(eagle_logits[:-1, :, :], logits[1:, :, :], mapping) + elif labels.shape[1] < eagle_logits.shape[0]: + token_loss = self.compute_language_model_loss(labels[:, 1:], eagle_logits[:-2, :, :]) else: token_loss = self.compute_language_model_loss(labels[:, 1:], eagle_logits[:-1, :, :]) @@ -1183,34 +1193,44 @@ def forward( return_eagle_inputs: bool = False, **kwargs, ) -> torch.Tensor: - if input_ids is not None and (position_ids is None or attention_mask is None): + if position_ids is None or attention_mask is None: attention_mask, position_ids = get_default_attention_mask_and_position_ids(input_ids) - # When return_eagle_inputs is True, return decoder_input_for_eagle. - # When LLM, decoder_input_for_eagle is just the text embeddings. However, when VLM - # decoder_input_for_eagle will also contain projected image/video embeddings. - hidden_states, decoder_input_for_eagle = self._base_model_forward( - input_ids, - position_ids, - attention_mask, - decoder_input, - inference_params, - packed_seq_params, - extra_block_kwargs, - return_eagle_inputs=return_eagle_inputs, - ) + if self.eagle_offline: + # aux_hidden_states and hidden_states are provided for offline eagle + # _base_model_forward is skipped + aux_hidden_states = kwargs.get("aux_hidden_states") + hidden_states = kwargs.get("hidden_states") + else: + # When return_eagle_inputs is True, return decoder_input_for_eagle. + # For LLM, decoder_input_for_eagle is just the text embeddings. However, for VLM + # decoder_input_for_eagle will also contain projected image/video embeddings. + hidden_states, decoder_input_for_eagle = self._base_model_forward( + input_ids, + position_ids, + attention_mask, + decoder_input, + inference_params, + packed_seq_params, + extra_block_kwargs, + return_eagle_inputs=return_eagle_inputs, + ) - # Typically, this is only the case when PP > 1. - if not self.post_process: - return hidden_states + # Typically, this is only the case when PP > 1. 
+ if not self.post_process: + return hidden_states output_weight = None if self.share_embeddings_and_output_weights: output_weight = self.shared_embedding_or_output_weight() logits_sbh, _ = self.output_layer(hidden_states, weight=output_weight) + if self.eagle_offline: + eagle_module_input_hidden_states = self._get_eagle_input_hidden_states( + aux_hidden_states, apply_fc=True + ) # If EAGLE-3, aux_hidden_states are gathered by the forward_hook - if return_eagle_inputs: + elif return_eagle_inputs: eagle_module_input_hidden_states = self._get_eagle_input_hidden_states( hidden_states, apply_fc=False ) @@ -1232,7 +1252,7 @@ def forward( hidden_states, apply_fc=True ) - # Either inference or calibration mode, we want to make sure all weights have been exercised. + # In calibration mode, we want to make sure all weights have been exercised. # This makes sure all quantized weights have amax calibrated if inference_params is None or self.calibration_mode: eagle_inputs_0 = self._get_eagle_module_inputs( @@ -1262,7 +1282,9 @@ def forward( if self.eagle_config.parallel_draft_step > 1: for i in range(self.eagle_config.parallel_draft_step): - eagle_logits = eagle_logits_0[i * labels.shape[1] : (i + 1) * labels.shape[1]] + eagle_logits = eagle_logits_0[ + i * logits_sbh.shape[0] : (i + 1) * logits_sbh.shape[0] + ] loss_ = self._compute_eagle_loss(logits_sbh, labels, eagle_logits) loss_ = loss_[:, i:] loss[:, i + 1 :] += 1.0 * loss_ @@ -1275,7 +1297,7 @@ def forward( acc = [] with torch.no_grad(): gathered_logits = gather_from_tensor_model_parallel_region( - eagle_logits_0[:-1, :, :] + eagle_logits_0[:-2, :, :] if self.eagle_offline else eagle_logits_0[:-1, :, :] ) eagle_top1 = gathered_logits.transpose(0, 1).argmax(dim=-1) if self.eagle_config.draft_vocab_size != self.eagle_config.vocab_size: @@ -1305,7 +1327,7 @@ def forward( packed_seq_params=packed_seq_params, **(extra_block_kwargs or {}), ) - eagle_logits_1 = eagle_logits_2x[labels.shape[1] :, :, :] + eagle_logits_1 = eagle_logits_2x[logits_sbh.shape[0] :, :, :] loss_1 = self._compute_eagle_loss(logits_sbh, labels, eagle_logits_1) # [b, s - 2] @@ -1316,7 +1338,7 @@ def forward( acc = [] with torch.no_grad(): gathered_logits = gather_from_tensor_model_parallel_region( - eagle_logits_1[1:-1, :, :] + eagle_logits_1[1:-2, :, :] if self.eagle_offline else eagle_logits_1[1:-1, :, :] ) eagle_top1 = gathered_logits.transpose(0, 1).argmax(dim=-1) if self.eagle_config.draft_vocab_size != self.eagle_config.vocab_size: @@ -1347,7 +1369,7 @@ def forward( **(extra_block_kwargs or {}), ) - eagle_logits_2 = eagle_logits_3x[-labels.shape[1] :, :, :] + eagle_logits_2 = eagle_logits_3x[-logits_sbh.shape[0] :, :, :] loss_2 = self._compute_eagle_loss(logits_sbh, labels, eagle_logits_2) # [b, s - 3] @@ -1358,7 +1380,7 @@ def forward( acc = [] with torch.no_grad(): gathered_logits = gather_from_tensor_model_parallel_region( - eagle_logits_2[2:-1, :, :] + eagle_logits_2[2:-2, :, :] if self.eagle_offline else eagle_logits_2[2:-1, :, :] ) eagle_top1 = gathered_logits.transpose(0, 1).argmax(dim=-1) if self.eagle_config.draft_vocab_size != self.eagle_config.vocab_size: @@ -1389,7 +1411,7 @@ def forward( **(extra_block_kwargs or {}), ) - eagle_logits_3 = eagle_logits_4x[-labels.shape[1] :, :, :] + eagle_logits_3 = eagle_logits_4x[-logits_sbh.shape[0] :, :, :] loss_3 = self._compute_eagle_loss(logits_sbh, labels, eagle_logits_3) # [b, s - 4] @@ -1400,7 +1422,7 @@ def forward( acc = [] with torch.no_grad(): gathered_logits = gather_from_tensor_model_parallel_region( - 
eagle_logits_3[3:-1, :, :] + eagle_logits_3[3:-2, :, :] if self.eagle_offline else eagle_logits_3[3:-1, :, :] ) eagle_top1 = gathered_logits.transpose(0, 1).argmax(dim=-1) if self.eagle_config.draft_vocab_size != self.eagle_config.vocab_size: @@ -1713,408 +1735,6 @@ def pseudo_speculative_generate( return base_token, draft_tokens -@OfflineEagleDMRegistry.register({GPTModel: "megatron.core.models.gpt.GPTModel"}) -class _DetachedEagleGPTModel(_DynamicEagleGPTModel): - """A wrapper for detached Eagle module.""" - - def modify( - self, - eagle_offline, - eagle_hidden_state_distillation, - eagle_self_logit_distillation, - eagle_freeze_base_model, - eagle_report_acc, - eagle_reuse_base_decoder, - eagle_loss_decay_factor, - eagle_architecture_config, - ): - super(_DynamicEagleGPTModel, self).modify( - eagle_offline=eagle_offline, - eagle_hidden_state_distillation=eagle_hidden_state_distillation, - eagle_self_logit_distillation=eagle_self_logit_distillation, - eagle_freeze_base_model=eagle_freeze_base_model, - eagle_report_acc=eagle_report_acc, - eagle_reuse_base_decoder=eagle_reuse_base_decoder, - eagle_loss_decay_factor=eagle_loss_decay_factor, - eagle_architecture_config=eagle_architecture_config, - ) - - # Freeze all parameters - if self.eagle_freeze_base_model: - for name, param in self.named_parameters(): - param.requires_grad = False - - self.eagle_config = dict_to_config( - eagle_architecture_config, - self.config.use_cpu_initialization, - self.config.fp16, - self.config.bf16, - ) - - assert not eagle_reuse_base_decoder, ( - "_DetachedEagleGPTModel does not have a base model so eagle_reuse_base_decoder must be False!" - ) - - if self.eagle_config.draft_vocab_size != self.eagle_config.vocab_size: - assert eagle_self_logit_distillation, ( - "Only logit distillation is supported when draft_vocab_size != vocab_size!" - ) - - # Use default aux_hidden_state layers if use_aux_hidden_state is True - # but no layer id is given - # layer ids are not used in detached eagle, but we need to set this to have correct fc_input_size_multiplier - if ( - self.eagle_config.use_aux_hidden_state - and len(self.eagle_config.eagle_aux_hidden_state_layer_ids) == 0 - ): - self._set_default_aux_hidden_state_layers() - - # Only the last PP stage has the additional projection and decoder layer. - # This is to simplify the export. 
- if self.post_process: - rotary_pos_emb = RotaryEmbedding( - kv_channels=self.eagle_config.kv_channels, - rotary_percent=self.eagle_config.rotary_percent, - rotary_interleaved=False, - seq_len_interpolation_factor=None, - rotary_base=self.eagle_config.rotary_base, - rope_scaling=self.eagle_config.rope_scaling, - rope_scaling_factor=self.eagle_config.rope_scaling_factor, - use_cpu_initialization=self.eagle_config.use_cpu_initialization, - ) - - self.eagle_module = EagleModule( - self.eagle_config, - rotary_pos_emb, - bias=False, - ) - - # Eagle loss functions - self.kld = logits_kld_loss - - def _get_eagle_input_hidden_states(self, hidden_states: torch.Tensor, apply_fc: bool = True): - if apply_fc: - # [s / TP, b, 3h] -> [s / TP, b, h] - return self.eagle_module.fc(hidden_states)[0] - else: - return hidden_states - - def _get_detached_eagle_module_inputs( - self, - input_ids: torch.Tensor, - hidden_states: torch.Tensor, - attention_mask: torch.Tensor, - position_ids: torch.Tensor, - features: torch.Tensor | None = None, - ): - """Getting EAGLE module inputs.""" - b = hidden_states.shape[1] - h = hidden_states.shape[2] - - # [b, 1] - id_padding = torch.zeros((b, 1), dtype=input_ids.dtype, device=input_ids.device) - padded_input_ids = torch.cat((input_ids[:, 1:], id_padding), dim=-1) - - rotary_pos_emb = self.eagle_module.rotary_pos_emb(padded_input_ids.shape[-1]) - - attn_mask = attention_mask.clone().detach() - attn_mask[:, :, :-1, :-1] = attention_mask[:, :, 1:, 1:] - attn_mask[:, :, -1, :] = True - attn_mask[:, :, :, -1] = True - - eagle_inputs = {} - - assert self.eagle_config.parallel_draft_step == 1, ( - "Detached Eagle module does not support parallel draft yet!" - ) - if features is None: - eagle_inputs["input_ids"] = padded_input_ids - eagle_inputs["hidden_states"] = hidden_states - eagle_inputs["attention_mask"] = attn_mask - eagle_inputs["position_ids"] = position_ids - eagle_inputs["rotary_pos_emb"] = rotary_pos_emb - elif features.shape[0] == hidden_states.shape[0]: - eagle_inputs["input_ids"] = torch.cat( - (padded_input_ids, padded_input_ids), - dim=-1, - ) - eagle_inputs["hidden_states"] = torch.cat( - ( - hidden_states, - torch.zeros((1, b, h), dtype=hidden_states.dtype, device=hidden_states.device), - features[:-1, :, :], - ), - dim=0, - ) - eagle_inputs["attention_mask"] = set_multi_step_attention_mask(attn_mask, 2) - eagle_inputs["position_ids"] = torch.cat((position_ids, position_ids), dim=-1) - - if rotary_pos_emb is not None: - eagle_inputs["rotary_pos_emb"] = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=0) - else: - # [TODO] (yeyu): there will be problem here with MLA - eagle_inputs["rotary_pos_emb"] = None - elif features.shape[0] == hidden_states.shape[0] * 2: - eagle_inputs["input_ids"] = torch.cat( - (padded_input_ids, padded_input_ids, padded_input_ids), - dim=-1, - ) - eagle_inputs["hidden_states"] = torch.cat( - ( - hidden_states, - torch.zeros((1, b, h), dtype=hidden_states.dtype, device=hidden_states.device), - features[:-1, :, :], - ), - dim=0, - ) - - eagle_inputs["attention_mask"] = set_multi_step_attention_mask(attn_mask, 3) - eagle_inputs["position_ids"] = torch.cat( - (position_ids, position_ids, position_ids), dim=-1 - ) - - if rotary_pos_emb is not None: - eagle_inputs["rotary_pos_emb"] = torch.cat( - (rotary_pos_emb, rotary_pos_emb, rotary_pos_emb), - dim=0, - ) - else: - # [TODO] (yeyu): there will be problem here with MLA - eagle_inputs["rotary_pos_emb"] = None - else: - eagle_inputs["input_ids"] = torch.cat( - (padded_input_ids, 
padded_input_ids, padded_input_ids, padded_input_ids), - dim=-1, - ) - eagle_inputs["hidden_states"] = torch.cat( - ( - hidden_states, - torch.zeros((1, b, h), dtype=hidden_states.dtype, device=hidden_states.device), - features[:-1, :, :], - ), - dim=0, - ) - - eagle_inputs["attention_mask"] = set_multi_step_attention_mask(attn_mask, 4) - eagle_inputs["position_ids"] = torch.cat( - (position_ids, position_ids, position_ids, position_ids), dim=-1 - ) - - if rotary_pos_emb is not None: - eagle_inputs["rotary_pos_emb"] = torch.cat( - (rotary_pos_emb, rotary_pos_emb, rotary_pos_emb, rotary_pos_emb), - dim=0, - ) - else: - # [TODO] (yeyu): there will be problem here with MLA - eagle_inputs["rotary_pos_emb"] = None - - eagle_inputs["embedding"] = self.embedding( - input_ids=eagle_inputs["input_ids"], - position_ids=eagle_inputs["position_ids"], - ) - - return eagle_inputs - - def forward( - self, - input_ids: torch.Tensor = None, - position_ids: torch.Tensor = None, - attention_mask: torch.Tensor = None, - decoder_input: torch.Tensor = None, - labels: torch.Tensor = None, - inference_params: InferenceParams = None, - packed_seq_params: PackedSeqParams = None, - extra_block_kwargs: dict | None = None, - return_eagle_inputs: bool = False, # Not used in Detached Eagle - **kwargs, - ) -> torch.Tensor: - assert "aux_hidden_states" in kwargs, ( - "aux_hidden_states is required as input to _DetachedEagleGPTModel" - ) - assert "hidden_states" in kwargs, ( - "hidden_states is required as input to _DetachedEagleGPTModel" - ) - aux_hidden_states = kwargs.get("aux_hidden_states") - hidden_states = kwargs.get("hidden_states") - - # Note: labels is 1 token shorter than logits in detached mode - - if position_ids is None or attention_mask is None: - attention_mask, position_ids = get_default_attention_mask_and_position_ids(input_ids) - - eagle_module_input_hidden_states = self._get_eagle_input_hidden_states(aux_hidden_states) - output_weight = None - if self.share_embeddings_and_output_weights: - output_weight = self.shared_embedding_or_output_weight() - logits_sbh, _ = self.output_layer(hidden_states, weight=output_weight) - - eagle_inputs_0 = self._get_detached_eagle_module_inputs( - input_ids=input_ids, - hidden_states=eagle_module_input_hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - ) - - _, eagle_logits_0, eagle_hidden_states_0_pre_norm = self._eagle_forward( - eagle_inputs_0, - None, - inference_params=inference_params, - packed_seq_params=packed_seq_params, - **(extra_block_kwargs or {}), - ) - - loss = torch.zeros(input_ids.shape).to(input_ids.device) - - loss_0 = self._compute_eagle_loss(logits_sbh, labels, eagle_logits_0) - loss[:, 1:] += self.eagle_loss_decay_factor * loss_0 - - if self.eagle_report_acc and not self.training: - acc = [] - with torch.no_grad(): - gathered_logits = gather_from_tensor_model_parallel_region( - eagle_logits_0[:-2, :, :] - ) - eagle_top1 = gathered_logits.transpose(0, 1).argmax(dim=-1) - if self.eagle_config.draft_vocab_size != self.eagle_config.vocab_size: - eagle_top1 += self.eagle_module.d2t[eagle_top1] - top1_p = torch.eq(labels[:, 1:], eagle_top1).sum() / eagle_top1.numel() - acc.append(top1_p) - - if get_tensor_model_parallel_rank() == 0: - print( - f"{torch.distributed.get_rank():3}/{torch.distributed.get_world_size():3} EAGLE 1st Top-1: {acc}", - flush=True, - ) - - # Second round of EAGLE loss - eagle_inputs_1 = self._get_detached_eagle_module_inputs( - input_ids=input_ids, - hidden_states=eagle_module_input_hidden_states, - 
attention_mask=attention_mask, - position_ids=position_ids, - features=eagle_hidden_states_0_pre_norm, - ) - - _, eagle_logits_2x, eagle_hidden_states_2x_pre_norm = self._eagle_forward( - eagle_inputs_1, - None, - inference_params=inference_params, - packed_seq_params=packed_seq_params, - **(extra_block_kwargs or {}), - ) - eagle_logits_1 = eagle_logits_2x[logits_sbh.shape[0] :, :, :] - - loss_1 = self._compute_eagle_loss(logits_sbh, labels, eagle_logits_1) - # [b, s - 2] - loss_1 = loss_1[:, 1:] - loss[:, 2:] += self.eagle_loss_decay_factor**2 * loss_1 - - if self.eagle_report_acc and not self.training: - acc = [] - with torch.no_grad(): - gathered_logits = gather_from_tensor_model_parallel_region( - eagle_logits_1[1:-2, :, :] - ) - eagle_top1 = gathered_logits.transpose(0, 1).argmax(dim=-1) - if self.eagle_config.draft_vocab_size != self.eagle_config.vocab_size: - eagle_top1 += self.eagle_module.d2t[eagle_top1] - top1_p = torch.eq(labels[:, 2:], eagle_top1).sum() / eagle_top1.numel() - acc.append(top1_p) - - if get_tensor_model_parallel_rank() == 0: - print( - f"{torch.distributed.get_rank():3}/{torch.distributed.get_world_size():3} EAGLE 2nd Top-1: {acc}", - flush=True, - ) - - # Third EAGLE loss - eagle_inputs_2 = self._get_detached_eagle_module_inputs( - input_ids=input_ids, - hidden_states=eagle_module_input_hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - features=eagle_hidden_states_2x_pre_norm, - ) - - _, eagle_logits_3x, eagle_hidden_states_3x_pre_norm = self._eagle_forward( - eagle_inputs_2, - None, - inference_params=inference_params, - packed_seq_params=packed_seq_params, - **(extra_block_kwargs or {}), - ) - - eagle_logits_2 = eagle_logits_3x[-logits_sbh.shape[0] :, :, :] - - loss_2 = self._compute_eagle_loss(logits_sbh, labels, eagle_logits_2) - # [b, s - 3] - loss_2 = loss_2[:, 2:] - loss[:, 3:] += self.eagle_loss_decay_factor**3 * loss_2 - - if self.eagle_report_acc and not self.training: - acc = [] - with torch.no_grad(): - gathered_logits = gather_from_tensor_model_parallel_region( - eagle_logits_2[2:-2, :, :] - ) - eagle_top1 = gathered_logits.transpose(0, 1).argmax(dim=-1) - if self.eagle_config.draft_vocab_size != self.eagle_config.vocab_size: - eagle_top1 += self.eagle_module.d2t[eagle_top1] - top1_p = torch.eq(labels[:, 3:], eagle_top1).sum() / eagle_top1.numel() - acc.append(top1_p) - - if get_tensor_model_parallel_rank() == 0: - print( - f"{torch.distributed.get_rank():3}/{torch.distributed.get_world_size():3} EAGLE 3rd Top-1: {acc}", - flush=True, - ) - - # Forth EAGLE loss - eagle_inputs_3 = self._get_detached_eagle_module_inputs( - input_ids=input_ids, - hidden_states=eagle_module_input_hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - features=eagle_hidden_states_3x_pre_norm, - ) - - _, eagle_logits_4x, eagle_hidden_states_4x_pre_norm = self._eagle_forward( - eagle_inputs_3, - None, - inference_params=inference_params, - packed_seq_params=packed_seq_params, - **(extra_block_kwargs or {}), - ) - - eagle_logits_3 = eagle_logits_4x[-logits_sbh.shape[0] :, :, :] - - loss_3 = self._compute_eagle_loss(logits_sbh, labels, eagle_logits_3) - # [b, s - 4] - loss_3 = loss_3[:, 3:] - loss[:, 4:] += self.eagle_loss_decay_factor**4 * loss_3 - - if self.eagle_report_acc and not self.training: - acc = [] - with torch.no_grad(): - gathered_logits = gather_from_tensor_model_parallel_region( - eagle_logits_3[3:-2, :, :] - ) - eagle_top1 = gathered_logits.transpose(0, 1).argmax(dim=-1) - if 
self.eagle_config.draft_vocab_size != self.eagle_config.vocab_size: - eagle_top1 += self.eagle_module.d2t[eagle_top1] - top1_p = torch.eq(labels[:, 4:], eagle_top1).sum() / eagle_top1.numel() - acc.append(top1_p) - - if get_tensor_model_parallel_rank() == 0: - print( - f"{torch.distributed.get_rank():3}/{torch.distributed.get_world_size():3} EAGLE 4th Top-1: {acc}", - flush=True, - ) - - return loss - - class MegatronARValidation(AcceptanceRateValidation): """This is the subclass for megatron model AR validation.""" From dd2cef655ff3e4b53a1f8f3a1124bd4aa1dbae11 Mon Sep 17 00:00:00 2001 From: Ye Yu Date: Mon, 15 Sep 2025 12:39:44 -0700 Subject: [PATCH 02/27] apply coderabbit suggestion Signed-off-by: Ye Yu --- modelopt/torch/speculative/plugins/megatron_eagle.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/modelopt/torch/speculative/plugins/megatron_eagle.py b/modelopt/torch/speculative/plugins/megatron_eagle.py index 320c17fe6..1deb61100 100644 --- a/modelopt/torch/speculative/plugins/megatron_eagle.py +++ b/modelopt/torch/speculative/plugins/megatron_eagle.py @@ -1199,8 +1199,15 @@ def forward( if self.eagle_offline: # aux_hidden_states and hidden_states are provided for offline eagle # _base_model_forward is skipped + if return_eagle_inputs: + raise ValueError("return_eagle_inputs is unsupported in EAGLE offline mode.") aux_hidden_states = kwargs.get("aux_hidden_states") hidden_states = kwargs.get("hidden_states") + if aux_hidden_states is None or hidden_states is None: + raise ValueError( + "EAGLE offline mode requires kwargs: aux_hidden_states=[s,b,k*h], " + "hidden_states=[s,b,h]." + ) else: # When return_eagle_inputs is True, return decoder_input_for_eagle. # For LLM, decoder_input_for_eagle is just the text embeddings. However, for VLM From 6cf677eded419be7aa6212a217a259f0a10d06c2 Mon Sep 17 00:00:00 2001 From: Ye Yu Date: Mon, 15 Sep 2025 12:43:54 -0700 Subject: [PATCH 03/27] remove OfflineEagleDMRegistry Signed-off-by: Ye Yu --- modelopt/torch/speculative/plugins/transformers.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/modelopt/torch/speculative/plugins/transformers.py b/modelopt/torch/speculative/plugins/transformers.py index e1da326d1..27f2011e1 100644 --- a/modelopt/torch/speculative/plugins/transformers.py +++ b/modelopt/torch/speculative/plugins/transformers.py @@ -46,7 +46,7 @@ from transformers.trainer_pt_utils import LabelSmoother from transformers.utils import ModelOutput -from ..eagle.conversion import EagleDMRegistry, OfflineEagleDMRegistry +from ..eagle.conversion import EagleDMRegistry from ..eagle.eagle_model import EagleModel from ..eagle.utils import RMSNorm, expand_mask, make_causal_mask from ..medusa.conversion import MedusaDMRegistry @@ -1141,13 +1141,6 @@ def pseudo_speculative_generate( return base_token, draft_tokens -@OfflineEagleDMRegistry.register({PreTrainedModel: "hf.PreTrainedModel"}) -class DetachedHFEagleModel(HFEagleModel): - """A wrapper for detached Eagle module.""" - - # TODO: Implement DetachedHFEagleModel class for offline eagle. 
- - class HFARValidation(AcceptanceRateValidation): """This is the subclass for HF model AR validation.""" From e959ae1e1f207e7af9b4b2314e8d6e8ff050e30e Mon Sep 17 00:00:00 2001 From: Ye Yu Date: Mon, 15 Sep 2025 13:05:59 -0700 Subject: [PATCH 04/27] apply suggestion to cover eagle1 case Signed-off-by: Ye Yu --- modelopt/torch/speculative/plugins/megatron_eagle.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modelopt/torch/speculative/plugins/megatron_eagle.py b/modelopt/torch/speculative/plugins/megatron_eagle.py index 1deb61100..a9bd45aff 100644 --- a/modelopt/torch/speculative/plugins/megatron_eagle.py +++ b/modelopt/torch/speculative/plugins/megatron_eagle.py @@ -844,7 +844,7 @@ def modify( self.kld = logits_kld_loss def _get_eagle_input_hidden_states(self, hidden_states: torch.Tensor, apply_fc: bool = True): - """When _aux_hidden_states is not empty, then this is EAGLE-3. + """When _aux_hidden_states is not empty for online, then this is EAGLE-3. Args: hidden_states: last hidden_states @@ -1234,7 +1234,7 @@ def forward( if self.eagle_offline: eagle_module_input_hidden_states = self._get_eagle_input_hidden_states( - aux_hidden_states, apply_fc=True + aux_hidden_states, apply_fc=self.eagle_config.use_aux_hidden_state ) # If EAGLE-3, aux_hidden_states are gathered by the forward_hook elif return_eagle_inputs: From 9ba9935b3f161336685fffe81cce3cfaed31aee6 Mon Sep 17 00:00:00 2001 From: Ye Yu Date: Mon, 15 Sep 2025 13:58:57 -0700 Subject: [PATCH 05/27] debug Signed-off-by: Ye Yu --- modelopt/torch/speculative/plugins/megatron_eagle.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/modelopt/torch/speculative/plugins/megatron_eagle.py b/modelopt/torch/speculative/plugins/megatron_eagle.py index a9bd45aff..826c6188e 100644 --- a/modelopt/torch/speculative/plugins/megatron_eagle.py +++ b/modelopt/torch/speculative/plugins/megatron_eagle.py @@ -1284,7 +1284,9 @@ def forward( # If eagle_freeze_base_model is set to True, # the base model is frozen . - loss = self.compute_language_model_loss(labels, logits_sbh) + loss = self.compute_language_model_loss( + labels, logits_sbh[:-1] if self.eagle_offline else logits_sbh + ) loss = 0.0 * loss if self.eagle_config.parallel_draft_step > 1: From 0f8493a56d8ff3ec3a180574db806a1a7dd92b40 Mon Sep 17 00:00:00 2001 From: Ye Yu Date: Mon, 15 Sep 2025 14:02:16 -0700 Subject: [PATCH 06/27] debug Signed-off-by: Ye Yu --- modelopt/torch/speculative/plugins/megatron_eagle.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/modelopt/torch/speculative/plugins/megatron_eagle.py b/modelopt/torch/speculative/plugins/megatron_eagle.py index 826c6188e..63664123a 100644 --- a/modelopt/torch/speculative/plugins/megatron_eagle.py +++ b/modelopt/torch/speculative/plugins/megatron_eagle.py @@ -1284,9 +1284,10 @@ def forward( # If eagle_freeze_base_model is set to True, # the base model is frozen . 
- loss = self.compute_language_model_loss( - labels, logits_sbh[:-1] if self.eagle_offline else logits_sbh - ) + if self.eagle_offline: + loss = torch.zeros(input_ids.shape).to(input_ids.device) + else: + loss = self.compute_language_model_loss(labels, logits_sbh) loss = 0.0 * loss if self.eagle_config.parallel_draft_step > 1: From 0670b8b001e80c28c3b676fb278e142a5769441f Mon Sep 17 00:00:00 2001 From: Ye Yu Date: Tue, 16 Sep 2025 10:14:53 -0700 Subject: [PATCH 07/27] minor Signed-off-by: Ye Yu --- modelopt/torch/speculative/plugins/megatron_eagle.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modelopt/torch/speculative/plugins/megatron_eagle.py b/modelopt/torch/speculative/plugins/megatron_eagle.py index 63664123a..4062be8b6 100644 --- a/modelopt/torch/speculative/plugins/megatron_eagle.py +++ b/modelopt/torch/speculative/plugins/megatron_eagle.py @@ -764,7 +764,7 @@ def modify( # Use default aux_hidden_state layers if use_aux_hidden_state is True # but no layer id is given - # layer ids are not used in detached eagle, but we need to set this to have correct fc_input_size_multiplier + # layer ids are not used in offline eagle, but we need to set this to have correct fc_input_size_multiplier if ( self.eagle_config.use_aux_hidden_state and len(self.eagle_config.eagle_aux_hidden_state_layer_ids) == 0 From 86bc66e6027d3a157bc7f163d1e2fead757d534b Mon Sep 17 00:00:00 2001 From: Ye Yu Date: Tue, 16 Sep 2025 11:15:13 -0700 Subject: [PATCH 08/27] fix the bug in megatron import Signed-off-by: Ye Yu --- modelopt/torch/export/plugins/megatron_importer.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/modelopt/torch/export/plugins/megatron_importer.py b/modelopt/torch/export/plugins/megatron_importer.py index 696af6323..31d748726 100644 --- a/modelopt/torch/export/plugins/megatron_importer.py +++ b/modelopt/torch/export/plugins/megatron_importer.py @@ -512,7 +512,10 @@ def _import_state_dict(self): self.rules["k_layernorm"](attention.k_layernorm, layer_id) self.rules["linear_qkv"](attention.linear_qkv, layer_id) self.rules["linear_proj"](attention.linear_proj, layer_id) - if hasattr(attention.core_attention, "softmax_offset"): + if ( + hasattr(attention.core_attention, "softmax_offset") + and attention.core_attention.softmax_offset is not None + ): self.rules["softmax_offset"]( attention.core_attention.softmax_offset, layer_id ) From a26a5a63f7f2bec12d810ae2447d1bdaba2cc310 Mon Sep 17 00:00:00 2001 From: kinjalpatel27 <31936134+kinjalpatel27@users.noreply.github.com> Date: Mon, 15 Sep 2025 23:36:50 -0700 Subject: [PATCH 09/27] Added support for qwen3-next quantization and export (#323) Signed-off-by: Kinjal Patel Signed-off-by: Ye Yu --- modelopt/torch/export/layer_utils.py | 16 ++++++++++++++-- .../torch/quantization/plugins/huggingface.py | 10 ++++++++++ 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/modelopt/torch/export/layer_utils.py b/modelopt/torch/export/layer_utils.py index c35491283..e35ee070f 100755 --- a/modelopt/torch/export/layer_utils.py +++ b/modelopt/torch/export/layer_utils.py @@ -90,7 +90,12 @@ def get_experts_list(module: torch.nn.Module, model_type: str): linear_names = ["w1", "w2", "w3"] elif any( qwen_variant in model_type - for qwen_variant in ["qwenmoeforcausallm", "qwen2moeforcausallm", "qwen3moeforcausallm"] + for qwen_variant in [ + "qwenmoeforcausallm", + "qwen2moeforcausallm", + "qwen3moeforcausallm", + "qwen3nextforcausallm", + ] ): linear_names = ["gate_proj", "down_proj", "up_proj"] else: @@ -333,6 +338,7 @@ 
def is_moe(module: nn.Module) -> bool: "DeepseekMoE".lower(), "Qwen2MoeSparseMoeBlock".lower(), "Qwen3MoeSparseMoeBlock".lower(), + "Qwen3NextSparseMoeBlock".lower(), ] ) @@ -987,7 +993,13 @@ def module_match_name_list(module, name_list): return any(name.lower() in type(module).__name__.lower() for name in name_list) if module_match_name_list( - module, ["Qwen2MoeSparseMoeBlock", "Qwen3MoeSparseMoeBlock", "DeepseekMoE"] + module, + [ + "Qwen2MoeSparseMoeBlock", + "Qwen3MoeSparseMoeBlock", + "Qwen3NextSparseMoeBlock", + "DeepseekMoE", + ], ): return ["gate_proj", "down_proj", "up_proj"] elif module_match_name_list(module, ["MixtralMoeSparseMoeBlock"]): diff --git a/modelopt/torch/quantization/plugins/huggingface.py b/modelopt/torch/quantization/plugins/huggingface.py index da2a18c08..061e71dba 100644 --- a/modelopt/torch/quantization/plugins/huggingface.py +++ b/modelopt/torch/quantization/plugins/huggingface.py @@ -559,6 +559,16 @@ def top_k(self, value): except ImportError: pass +try: + from transformers.models.qwen3_next.modeling_qwen3_next import Qwen3NextSparseMoeBlock + + if Qwen3NextSparseMoeBlock not in QuantModuleRegistry: + QuantModuleRegistry.register({Qwen3NextSparseMoeBlock: "hf.Qwen3NextSparseMoeBlock"})( + _QuantMoeSparseMoe + ) +except ImportError: + pass + class _QuantGptOssExperts(_QuantFunctionalMixin): """Quantized wrapper for `transformers.GptOssExperts`. From 742429d4ae194837feaeaf7d17ac1c8434423e2b Mon Sep 17 00:00:00 2001 From: Asha Anoosheh Date: Tue, 16 Sep 2025 12:56:23 +0200 Subject: [PATCH 10/27] Update distill Megatron plugin (#319) Signed-off-by: Asha Anoosheh Signed-off-by: Ye Yu --- modelopt/torch/distill/distillation_model.py | 9 +- modelopt/torch/distill/plugins/megatron.py | 413 +++++++++++++------ 2 files changed, 280 insertions(+), 142 deletions(-) diff --git a/modelopt/torch/distill/distillation_model.py b/modelopt/torch/distill/distillation_model.py index 339ff2c3e..930b68560 100644 --- a/modelopt/torch/distill/distillation_model.py +++ b/modelopt/torch/distill/distillation_model.py @@ -239,7 +239,7 @@ def compute_kd_loss( student_loss: torch.Tensor | None = None, loss_reduction_fn: Callable | None = None, skip_balancer: bool = False, - labels: torch.Tensor | None = None, + **loss_fn_kwargs, ) -> torch.Tensor | dict[str, torch.Tensor]: """Compute total loss for distillation backpropagation. @@ -248,8 +248,8 @@ def compute_kd_loss( loss_reduction_fn: Callable to be called on each loss tensor prior to balancing. Useful for loss-masking situations where the callable changes arguments each iteration. skip_balancer: Whether or not to use loss balancer to reduce the loss dict into a scalar. - labels: Labels to be passed to the loss function, if needed. This is necessary for losses that - require labels, such as MFTLoss. + **loss_fn_kwargs: Additional keyword arguments to be passed to the loss function, if needed. + This facilitates losses that require extras, such as labels for ``mtd.MFTLoss``. Returns: If reduce is True, the scalar total loss weighted between ``student_loss`` and the distillation losses. 
@@ -268,8 +268,7 @@ def compute_kd_loss( student_layer._intermediate_output = None teacher_layer._intermediate_output = None - extra_kwargs = {"labels": labels} if labels is not None else {} - loss = loss_fn(out_s, out_t, **extra_kwargs) # Student is pred, Teacher is target + loss = loss_fn(out_s, out_t, **loss_fn_kwargs) # Student is pred, Teacher is target if loss_reduction_fn is not None: # Needed in cases where a loss mask is used on non-scalar loss-fn outputs, prior to # reducing to a scalar loss value. diff --git a/modelopt/torch/distill/plugins/megatron.py b/modelopt/torch/distill/plugins/megatron.py index bde873004..6e712fcbb 100644 --- a/modelopt/torch/distill/plugins/megatron.py +++ b/modelopt/torch/distill/plugins/megatron.py @@ -18,29 +18,67 @@ """Distillation loss function(s).""" import logging -import types +import re from abc import ABCMeta -from typing import Any +from collections.abc import Callable +from dataclasses import dataclass, field +from types import MethodType +from typing import TYPE_CHECKING import torch import torch.nn as nn import torch.nn.functional as F import yaml -from megatron.core.dist_checkpointing.mapping import ShardedStateDict -from megatron.core.parallel_state import get_tensor_model_parallel_group -from megatron.core.tensor_parallel import gather_from_sequence_parallel_region -from megatron.core.transformer import MegatronModule, TransformerConfig +from megatron.core import parallel_state +from megatron.core.pipeline_parallel.schedules import get_tensor_shapes +from megatron.core.transformer import MegatronModule, TransformerLayer +from megatron.core.utils import get_model_config from torch import Tensor from torch.nn.modules.loss import _Loss import modelopt.torch.distill as mtd +from modelopt.torch.distill.config import Criterion + +if TYPE_CHECKING: + from megatron.core.dist_checkpointing.mapping import ShardedStateDict + from megatron.core.transformer import TransformerConfig + logger = logging.getLogger(__name__) +@dataclass +class DistillationConfig: + """Knowledge-Distillation config. + + Args: + intermediate_layer_pairs: List of tuples of intermediate layer names. + logit_layers: Tuple of logit layer names. + skip_lm_loss: Whether to skip computing the standard language model loss (default: ``True``). + kd_loss_scale: Relative scaling factor for the distillation loss if ``skip_lm_loss`` is ``False``. + logit_kl_temperature: Temperature for the logit KL-divergence loss. + """ + + intermediate_layer_pairs: list[tuple[str, str]] = field(default_factory=list) + logit_layers: tuple[str, str] = ("output_layer", "output_layer") + skip_lm_loss: bool = True + kd_loss_scale: float = 1.0 + logit_kl_temperature: float = 1.0 + criterion: Criterion | None = None + loss_balancer: mtd.DistillationLossBalancer | None = None + + def __post_init__(self): + assert len(self.logit_layers) == 2, f"{self.logit_layers=}" + assert all(len(pair) == 2 for pair in self.intermediate_layer_pairs), ( + f"{self.intermediate_layer_pairs=}" + ) + assert self.kd_loss_scale > 0, f"{self.kd_loss_scale=}" + assert self.logit_kl_temperature > 0, f"{self.logit_kl_temperature=}" + + def load_distillation_config( - config_path: str | None, student_cfg: TransformerConfig, teacher_cfg: TransformerConfig -) -> dict[str, Any]: + config_path: str | None, student_cfg: "TransformerConfig", teacher_cfg: "TransformerConfig" +) -> DistillationConfig: """Read the distillation yaml config file specified by ``args.export_kd_cfg``. 
Args: @@ -51,43 +89,64 @@ def load_distillation_config( WARNING: Assumes intermediate hidden sizes are always that found in the model config's ``hidden_size`` attribute. """ - if not config_path: - logger.warning("Distillation config not provided. Using default.") - cfg = { - "logit_layers": ["output_layer", "output_layer"], - "intermediate_layer_pairs": [], - "skip_lm_loss": True, - "kd_loss_scale": 1.0, - } - else: + if config_path: with open(config_path) as f: cfg = yaml.safe_load(f) + cfg = DistillationConfig(**cfg) + else: + logger.warning("Distillation config not provided. Using default.") + cfg = DistillationConfig() - intermediate_pairs: list[str] = cfg["intermediate_layer_pairs"] - logit_pair: list[str] = cfg["logit_layers"] - skip_lm_loss: bool = cfg["skip_lm_loss"] - loss_scale: float = cfg["kd_loss_scale"] - - criterion = {tuple(logit_pair): LogitsKLLoss(student_cfg, teacher_cfg)} - for layer_names in intermediate_pairs: - if torch.distributed.get_rank() == 0: - print( - "Distillation: Adding intermediate loss between" - f" `{layer_names[0]}` of student (hidden size {student_cfg.hidden_size}) and" - f" `{layer_names[1]}` of teacher (hidden size {teacher_cfg.hidden_size})." + criterion = {} + if student_cfg.pipeline_model_parallel_size == 1 or parallel_state.is_pipeline_last_stage(): + criterion[tuple(cfg.logit_layers)] = LogitsKLLoss( + student_cfg, temperature=cfg.logit_kl_temperature + ) + # NOTE: Projection layer shared among intermediate layer pairs. + projection_layer = ProjectionLayer(student_cfg, teacher_cfg) + + for student_layer, teacher_layer in cfg.intermediate_layer_pairs: + if parallel_state.get_tensor_and_context_parallel_rank() == 0: + logger.info( + "Distillation: Adding intermediate loss between" + f" `{student_layer}` of student (hidden size {student_cfg.hidden_size}) and" + f" `{teacher_layer}` of teacher (hidden size {teacher_cfg.hidden_size})." 
+ ) + student_layer = _adjust_layer_index_for_pp(student_layer, student_cfg) + teacher_layer = _adjust_layer_index_for_pp(teacher_layer, teacher_cfg) + criterion[(student_layer, teacher_layer)] = HiddenStateCosineLoss( + student_cfg, projection_layer=projection_layer ) - criterion[tuple(layer_names)] = HiddenStateCosineLoss(student_cfg, teacher_cfg) loss_balancer = LogitsAndIntermediatesLossBalancer( - kd_loss_scale=loss_scale, skip_original_loss=skip_lm_loss + kd_loss_scale=cfg.kd_loss_scale, skip_original_loss=cfg.skip_lm_loss ) - cfg["criterion"] = criterion - cfg["loss_balancer"] = loss_balancer + cfg.criterion = criterion + cfg.loss_balancer = loss_balancer return cfg +def _adjust_layer_index_for_pp(submodule_name, model_cfg): + """Adjust any sequence-based layer indices found in a submodule name for Pipeline Parallelism.""" + match = re.search(r"(?<=\.)\d+(?=\.)", submodule_name) + if not match: + return submodule_name + + offset = TransformerLayer._get_layer_offset(model_cfg) + new_layer_idx = int(match.group(0)) - offset + if new_layer_idx < 0: + raise ValueError(f"Layer {submodule_name} does not fall on final PP rank.") + + new_submodule_name = submodule_name.replace(match.group(0), str(new_layer_idx)) + if parallel_state.get_tensor_and_context_parallel_rank() == 0: + logger.info( + f'Distillation: Renamed layer "{submodule_name}" on final PP rank to "{new_submodule_name}"' + ) + return new_submodule_name + + ######################################################## @@ -95,27 +154,17 @@ class BaseLoss(_Loss, metaclass=ABCMeta): """Abstract base class for Megatron distillation losses.""" def __init__( - self, - student_config: TransformerConfig, - teacher_config: TransformerConfig, - projection_layer: bool = False, + self, model_config: "TransformerConfig", projection_layer: nn.Module | None = None ): """Constructor. Args: - student_config: Student's MCore transformer config. - teacher_config: Teacher's MCore transformer config. - projection_layer: If True, create a linear layer to project student tensor to teacher's hidden dim. + model_config: MCore transformer config. + projection_layer: Module which projects student activations to teacher's hidden dim. """ super().__init__() - self._config = student_config - self._tensor_parallel = self._config.tensor_model_parallel_size > 1 - self._sequence_parallel = self._config.sequence_parallel - - if projection_layer: - self._projection = ProjectionLayer(student_config, teacher_config) - else: - self._projection = None + self._config = model_config + self._projection = projection_layer def pre_forward(self, predictions: Tensor, targets: Tensor) -> tuple[Tensor, Tensor]: """Performs projection of student tensor to match teacher's size if necessary.""" @@ -129,23 +178,16 @@ def pre_forward(self, predictions: Tensor, targets: Tensor) -> tuple[Tensor, Ten return predictions, targets - def post_forward(self, loss: Tensor, tp_reduce: bool = False) -> Tensor: + def post_forward( + self, loss: Tensor, tp_reduce: bool = False, is_sequence_parallel: bool = False + ) -> Tensor: """Reshapes tensor from [s, b] to [b, s] for upcoming loss masking.""" loss = loss.transpose(0, 1).contiguous() - return (loss, tp_reduce) + return (loss, tp_reduce, is_sequence_parallel) class MSELoss(BaseLoss): - """Calculates Mean Squared Error loss between two tensors without reducing the sequence dim.""" - - def __init__(self, student_config: TransformerConfig, teacher_config: TransformerConfig): - """Constructor. 
- - Args: - student_config: Student's MCore transformer config. - teacher_config: Teacher's MCore transformer config. - """ - super().__init__(student_config, teacher_config) + """Calculates MSE loss between two tensors without reducing the sequence dim.""" def forward(self, predictions: Tensor, targets: Tensor) -> Tensor: """Forward function. @@ -159,7 +201,6 @@ def forward(self, predictions: Tensor, targets: Tensor) -> Tensor: """ predictions, targets = self.pre_forward(predictions, targets) - # TP irrelevant since MSE loss gradients are per-input element. loss = F.mse_loss(predictions, targets, reduction="none") loss = loss.sum(dim=-1) @@ -169,22 +210,26 @@ def forward(self, predictions: Tensor, targets: Tensor) -> Tensor: class HiddenStateCosineLoss(BaseLoss): """Calculates Cosine loss between two tensors without reducing the sequence dim. - The tensors are assumed to be intermediate activations, so extra restrictions are in place. + The tensors are assumed to be intermediate activations, with full hidden dimension size. + We recommend only applying this loss to LayerNorm outputs, which have full hidden dim even when TP is used. """ - def __init__(self, student_config: TransformerConfig, teacher_config: TransformerConfig): + def __init__( + self, model_config: "TransformerConfig", projection_layer: nn.Module | None = None + ): """Constructor. Args: - student_config: Student's MCore transformer config. - teacher_config: Teacher's MCore transformer config. + model_config: MCore transformer config. + projection_layer: Module which projects student activations to teacher's hidden dim. """ - super().__init__(student_config, teacher_config, projection_layer=True) + super().__init__(model_config, projection_layer=projection_layer) - if self._tensor_parallel and not self._sequence_parallel: + if self._config.tensor_model_parallel_size > 1: logger.warning( "``HiddenStateCosineLoss`` only works with tensors with full hidden dim. Ensure the " - "tensor inputs meet this requirement or use `--sequence_parallel` if tensor parallel is enabled." + "tensor inputs meet this requirement. We recommend only applying this loss to LayerNorm outputs, " + "which have full hidden dim even when TP is used." ) def forward(self, predictions: Tensor, targets: Tensor) -> Tensor: @@ -207,33 +252,24 @@ def forward(self, predictions: Tensor, targets: Tensor) -> Tensor: ) loss = loss.view(*predictions.shape[:2]) - if self._sequence_parallel: - # Can efficiently gather size [s, b] tensor now for loss-masking purposes. - # TODO(aanoosheh) Reconsider for memory savings by splitting loss mask instead. - loss = gather_from_sequence_parallel_region(loss) - - return self.post_forward(loss) + # NOTE: Tensor sequence length is still split among TP ranks. + return self.post_forward(loss, is_sequence_parallel=self._config.sequence_parallel) class LogitsKLLoss(BaseLoss): """Calculates KL-Divergence loss between two logits tensors without reducing the sequence dim.""" def __init__( - self, - student_config: TransformerConfig, - teacher_config: TransformerConfig, - temperature: float = 1.0, - reverse: bool = False, + self, model_config: "TransformerConfig", temperature: float = 1.0, reverse: bool = False ): """Constructor. Args: - student_config: Student's MCore transformer config. - teacher_config: Teacher's MCore transformer config. + model_config: MCore transformer config. temperature: Divide tensors by this value prior to calculating loss. 
reverse: Whether to reverse the loss as KLD(teacher, student) instead of KLD(student, teacher) """ - super().__init__(student_config, teacher_config) + super().__init__(model_config) self._temperature = temperature self._reverse = reverse @@ -255,21 +291,21 @@ def forward(self, predictions: Tensor, targets: Tensor) -> Tensor: output_student = predictions.float() / self._temperature # Compute local softmax, and the reweight to compute global softmax. - if self._tensor_parallel: + if self._config.tensor_model_parallel_size > 1: # Maximum value along vocab dimension across all GPUs. teacher_logits_max, _ = torch.max(output_teacher, dim=-1) torch.distributed.all_reduce( teacher_logits_max, op=torch.distributed.ReduceOp.MAX, - group=get_tensor_model_parallel_group(), + group=parallel_state.get_tensor_model_parallel_group(), ) output_teacher = output_teacher - teacher_logits_max.unsqueeze(dim=-1) denom_teacher = torch.sum(torch.exp(output_teacher), dim=-1) - # We can't use `gather_from_tensor_model_parallel_region` here since it discards - # gradients from other ranks - we need to all_reduce the gradients as well. + # We can't use standard reduction function here since the computation + # that follows it isn't identical across TP ranks. denom_teacher = all_reduce_autograd( - denom_teacher, group=get_tensor_model_parallel_group() + denom_teacher, group=parallel_state.get_tensor_model_parallel_group() ) # Maximum value along vocab dimension across all GPUs. @@ -277,13 +313,13 @@ def forward(self, predictions: Tensor, targets: Tensor) -> Tensor: torch.distributed.all_reduce( student_logits_max, op=torch.distributed.ReduceOp.MAX, - group=get_tensor_model_parallel_group(), + group=parallel_state.get_tensor_model_parallel_group(), ) output_student = output_student - student_logits_max.unsqueeze(dim=-1).detach() denom_student = torch.sum(torch.exp(output_student), dim=-1) denom_student = all_reduce_autograd( - denom_student, group=get_tensor_model_parallel_group() + denom_student, group=parallel_state.get_tensor_model_parallel_group() ) slen, bsz, sharded_vocab_size = output_student.shape @@ -327,9 +363,6 @@ def forward(self, predictions: Tensor, targets: Tensor) -> Tensor: return self.post_forward(loss, tp_reduce=True) -######################################################## - - class LogitsAndIntermediatesLossBalancer(mtd.DistillationLossBalancer): """LossBalancer implementation for Logit and Intermediate losses. @@ -359,40 +392,38 @@ def forward(self, loss_dict: dict[str, Tensor]) -> Tensor: Aggregate total scalar loss. 
""" original_loss = loss_dict.pop(mtd.loss_balancers.STUDENT_LOSS_KEY) - for _key, _loss in loss_dict.items(): + for _key in loss_dict: if _key.startswith(LogitsKLLoss.__name__): - logits_loss = _loss # should only be one - intermediate_loss = sum(loss_dict.values()) + logits_key = _key # should only be one + logits_loss = loss_dict.pop(logits_key) + intermediate_loss = sum(loss_dict.values()) / max(len(loss_dict), 1) if intermediate_loss > 0: dynamic_scale = logits_loss.item() / intermediate_loss.item() - intermediate_loss *= dynamic_scale - kd_loss_scale = self._kd_loss_scale / 2.0 + intermediate_loss_scaled = intermediate_loss * dynamic_scale else: - kd_loss_scale = self._kd_loss_scale + intermediate_loss = logits_loss.new_tensor(intermediate_loss) + intermediate_loss_scaled = intermediate_loss if self._skip_original_loss: - kd_loss = logits_loss + intermediate_loss - total_loss = kd_loss + total_loss = logits_loss + intermediate_loss_scaled else: - kd_loss = (logits_loss + intermediate_loss) * kd_loss_scale - dynamic_scale = original_loss.item() / kd_loss.item() - total_loss = original_loss + kd_loss * dynamic_scale - - return total_loss - - -######################################################## + kd_loss = logits_loss + intermediate_loss_scaled + kd_loss *= original_loss.item() / kd_loss.item() + total_loss = original_loss + kd_loss * self._kd_loss_scale + + out_dict = { + "kd_loss": total_loss, + "logits_loss": logits_loss, + "intermediate_loss": intermediate_loss, + } + return out_dict class ProjectionLayer(MegatronModule): """Module to project student layer activations to teacher's size.""" - def __init__( - self, - student_config: TransformerConfig, - teacher_config: TransformerConfig, - ): + def __init__(self, student_config: "TransformerConfig", teacher_config: "TransformerConfig"): """Constructor. Args: @@ -405,6 +436,7 @@ def __init__( else: self._fit = nn.Linear(student_config.hidden_size, teacher_config.hidden_size) self.apply(self._init_weights) + # Attribute below needed to reduce gradients during backward properly. setattr(self._fit.weight, "sequence_parallel", self.config.sequence_parallel) setattr(self._fit.bias, "sequence_parallel", self.config.sequence_parallel) @@ -418,15 +450,10 @@ def forward(self, student_tensor: Tensor): def _init_weights(self, module): """Initialize the weights.""" - if isinstance(module, (nn.Linear, nn.Embedding)): - # Slightly different from the TF version which uses truncated_normal for initialization - # cf https://github.com/pytorch/pytorch/pull/5617 - module.weight.data.normal_(mean=0.0, std=0.01) - elif isinstance(module, nn.LayerNorm): - module.bias.data.zero_() - module.weight.data.fill_(1.0) - if isinstance(module, nn.Linear) and module.bias is not None: - module.bias.data.zero_() + if isinstance(module, nn.Linear): + self.config.init_method(module.weight.data) + if module.bias is not None: + module.bias.data.zero_() class _AllReduce(torch.autograd.Function): @@ -447,30 +474,142 @@ def backward(ctx, grad_output): def all_reduce_autograd( tensor, op=torch.distributed.ReduceOp.SUM, group=torch.distributed.group.WORLD ): - """AllReduce with autograd.""" + """Custom all-reduce function. + + Needed instead of other all-reduce functions available when the computation following + the all-reduce call differs per rank. In KL loss, this corresponds to the different numerators. 
+ """ return _AllReduce.apply(op, group, tensor) ######################################################## -def adjust_distillation_model_for_mcore(model: mtd.DistillationModel, distill_cfg: dict[str, Any]): +def adjust_distillation_model_for_mcore( + model: mtd.DistillationModel, distill_cfg: DistillationConfig +): """Extra modifications to ``mtd.DistillationModel`` required for Megatron-Core.""" - # HACK: Hide teacher during `sharded_state_dict` method. - def _sharded_state_dict(self, *args, **kwargs) -> ShardedStateDict: + # Hide teacher during `sharded_state_dict` method. + def _sharded_state_dict(self, *args, **kwargs) -> "ShardedStateDict": with self.hide_teacher_model(): - return self._sharded_state_dict(*args, **kwargs) + return type(self).sharded_state_dict(self, *args, **kwargs) + + model.sharded_state_dict = MethodType(_sharded_state_dict, model) + + # Skip `lm_loss` bypassing it when training if not needed for backprop. + def _compute_student_lm_loss(self, labels, logits) -> Tensor: + if distill_cfg.skip_lm_loss and self.training: + return torch.zeros_like(labels, dtype=logits.dtype) + return type(self).compute_language_model_loss(self, labels, logits) + + model.compute_language_model_loss = MethodType(_compute_student_lm_loss, model) + + # Skip `lm_loss` always for teacher. + def _compute_teacher_lm_loss(self, labels, logits) -> Tensor: + return torch.zeros_like(labels, dtype=logits.dtype) + + model.teacher_model.compute_language_model_loss = MethodType( + _compute_teacher_lm_loss, model.teacher_model + ) + + # HACK: Pipeline-parallel Distillation requires splitting input tensor into student and teacher parts. + def _set_student_input_tensor_shape(self, shapes: list[tuple[int]]): + self._tensor_split_idx = shapes[0][-1] + + def _set_input_tensor(self, input_tensors: list[Tensor]): + teacher_inputs = [ + t[..., self._tensor_split_idx :] if t is not None else t for t in input_tensors + ] + student_inputs = [ + t[..., : self._tensor_split_idx] if t is not None else t for t in input_tensors + ] + type(self).set_input_tensor(self.teacher_model, teacher_inputs) + type(self).set_input_tensor(self, student_inputs) + + model.set_student_input_tensor_shape = MethodType(_set_student_input_tensor_shape, model) + model.set_input_tensor = MethodType(_set_input_tensor, model) + + # HACK: Concatenate output tensors when PP>1 so they can be passed between ranks. + def _forward(self, *args, **kwargs): + if not self.training: + with self.only_student_forward(): + return type(self).forward(self, *args, **kwargs) + + with torch.no_grad(): + self._teacher_model.eval() + teacher_output = self._teacher_model(*args, **kwargs) + with self.only_student_forward(): + student_output = type(self).forward(self, *args, **kwargs) + + if not parallel_state.is_pipeline_last_stage(): + return torch.cat([student_output, teacher_output], dim=-1) + else: + return student_output + + model.forward = MethodType(_forward, model) + + +def get_tensor_shapes_adjust_fn_for_distillation( + model: torch.nn.Module | list[torch.nn.Module], + seq_length: int, + micro_batch_size: int, + decoder_seq_length: int | None = None, + forward_only: bool = False, +) -> Callable | None: + """Return the function to adjust tensor shapes for Distillation in Megatron-Core's forward pass. + + Currently only used during non-interleaved pipelining for Distillation. + Concatenates sizes of student and teacher output tensors for inter-process communication. 
+ """ + if ( + forward_only + or parallel_state.get_pipeline_model_parallel_world_size() == 1 + or parallel_state.get_virtual_pipeline_model_parallel_world_size() is not None + ): + return None + # Unwrap + if isinstance(model, list): + model = model[0] + while hasattr(model, "module"): + model = model.module + if not isinstance(model, mtd.DistillationModel): + return None + + def adjust_tensor_shapes( + recv_tensor_shapes: list[tuple[int, ...]], send_tensor_shapes: list[tuple[int, ...]] + ): + teacher_config = get_model_config(model.teacher_model) + tp_group = parallel_state.get_tensor_model_parallel_group() + cp_group = parallel_state.get_context_parallel_group() + + teacher_recv_tensor_shapes = get_tensor_shapes( + seq_length=seq_length, + micro_batch_size=micro_batch_size, + decoder_seq_length=decoder_seq_length, + config=teacher_config, + tp_group=tp_group, + cp_group=cp_group, + ) + teacher_send_tensor_shapes = get_tensor_shapes( + seq_length=seq_length, + micro_batch_size=micro_batch_size, + decoder_seq_length=decoder_seq_length, + config=teacher_config, + tp_group=tp_group, + cp_group=cp_group, + ) + model.set_student_input_tensor_shape(recv_tensor_shapes) - model._sharded_state_dict = model.sharded_state_dict - model.sharded_state_dict = types.MethodType(_sharded_state_dict, model) + for i, shape in enumerate(recv_tensor_shapes): + shape = list(shape) + shape[-1] += teacher_recv_tensor_shapes[0][-1] # type: ignore[index] + recv_tensor_shapes[i] = tuple(shape) + for i, shape in enumerate(send_tensor_shapes): + shape = list(shape) + shape[-1] += teacher_send_tensor_shapes[0][-1] # type: ignore[index] + send_tensor_shapes[i] = tuple(shape) - # HACK: Skip `lm_loss` bypassing it when training if not needed for backprop. - def _compute_language_model_loss(self, labels, logits) -> Tensor: - if self.training: - return torch.zeros_like(labels) - return self._compute_language_model_loss(labels, logits) + return recv_tensor_shapes, send_tensor_shapes - if distill_cfg["skip_lm_loss"]: - model._compute_language_model_loss = model.compute_language_model_loss - model.compute_language_model_loss = types.MethodType(_compute_language_model_loss, model) + return adjust_tensor_shapes From 62c1c99420ae0c363a8202a5dece12a91adf2de5 Mon Sep 17 00:00:00 2001 From: realAsma <86726418+realAsma@users.noreply.github.com> Date: Tue, 16 Sep 2025 12:45:44 -0700 Subject: [PATCH 11/27] Remove unused utilities for ModelOpt <0.29 MCore checkpoints (#322) Signed-off-by: realAsma Signed-off-by: Ye Yu --- modelopt/torch/quantization/conversion.py | 1 - .../quantization/nn/modules/quant_module.py | 18 -------- .../nn/modules/tensor_quantizer.py | 43 ------------------- modelopt/torch/quantization/plugins/custom.py | 18 -------- modelopt/torch/quantization/plugins/vllm.py | 12 ------ .../torch_quantization/checkpointing.py | 37 ---------------- .../torch_quantization/quantize_common.py | 5 --- .../torch/quantization/test_quantize_cuda.py | 5 --- 8 files changed, 139 deletions(-) delete mode 100644 tests/_test_utils/torch_quantization/checkpointing.py diff --git a/modelopt/torch/quantization/conversion.py b/modelopt/torch/quantization/conversion.py index 3c5208778..6a457f172 100644 --- a/modelopt/torch/quantization/conversion.py +++ b/modelopt/torch/quantization/conversion.py @@ -198,7 +198,6 @@ def _replace_quant_module(model: nn.Module, version=None, registry=QuantModuleRe # REPLACE on the parent (model), not on child quantized = registry.convert(child) setattr(model, name, quantized) - quantized.mopt_ckpt_versn = 
version # now recurse into whichever module is now at `model.name` _replace_quant_module(getattr(model, name), version=version, registry=registry) diff --git a/modelopt/torch/quantization/nn/modules/quant_module.py b/modelopt/torch/quantization/nn/modules/quant_module.py index 710307c07..93df3651b 100644 --- a/modelopt/torch/quantization/nn/modules/quant_module.py +++ b/modelopt/torch/quantization/nn/modules/quant_module.py @@ -37,24 +37,6 @@ class QuantModule(DynamicModule): """A base class for quantized modules.""" - @property - def mopt_ckpt_versn(self): - """Checkpoint version of the modelopt.""" - for module in self.modules(): - if isinstance(module, TensorQuantizer): - return module.mopt_ckpt_versn - return None - - @mopt_ckpt_versn.setter - def mopt_ckpt_versn(self, version: str): - """Set the checkpoint version for the TensorQuantizer states.""" - - def _set_ckpt_version(module): - if isinstance(module, TensorQuantizer): - module.mopt_ckpt_versn = version - - self.apply(_set_ckpt_version) - def modelopt_post_restore(self, prefix: str = ""): """Post-restore to correctly configure the TensorQuantizer states. diff --git a/modelopt/torch/quantization/nn/modules/tensor_quantizer.py b/modelopt/torch/quantization/nn/modules/tensor_quantizer.py index b1b2543ac..9846f3554 100644 --- a/modelopt/torch/quantization/nn/modules/tensor_quantizer.py +++ b/modelopt/torch/quantization/nn/modules/tensor_quantizer.py @@ -29,7 +29,6 @@ DTensor = None import torch.nn.functional as F -from packaging.version import Version from torch import nn from torch.onnx._globals import GLOBALS @@ -1023,48 +1022,6 @@ def extra_repr(self): s += " calib" if (self._if_calib) else "" return s - @property - def mopt_ckpt_versn(self): - """Version of the checkpoint if it is restored from a checkpoint.""" - return getattr(self, "_mopt_ckpt_versn", None) - - @mopt_ckpt_versn.setter - def mopt_ckpt_versn(self, version: str): - self._mopt_ckpt_versn = str(version) - - def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs): - """Special handling for loading older checkpoints. - - This implementation is for backward compatibility and can be deprecated in future versions. - - Args: - state_dict: A dict containing the state of the top level module - prefix: A string that prefixes all of this modules state in state_dict, e.g. 'model.conv1.' - """ - if self.mopt_ckpt_versn is None or Version(self.mopt_ckpt_versn) >= Version("0.29"): - # Warnings below are raised if users use partial state dictionary intentionally (eg:- HF ckpts) - # For ModelOpt >= 0.29, the buffers will be correctly created, So lets skip the warnings - return super()._load_from_state_dict(state_dict, prefix, *args, **kwargs) - - _attrs = ["_amax", "_pre_quant_scale", "_svdquant_lora_a", "_svdquant_lora_b"] - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - - for attr in _attrs: - has_dst = attr in self._buffers - has_src = prefix + attr in state_dict - - if not has_src and has_dst: - warnings.warn(f"{prefix[:-1]}: No {attr} in state_dict.") - elif has_src and not has_dst: - warnings.warn( - f"{prefix[:-1]}: No '{attr}' buffer to load {attr} into." - f" '{attr}` is created as a buffer for now. Please move the model to the correct device and " - "dtype after this by calling `model.to(device, dtype)`." 
- ) - self.register_buffer(attr, state_dict[prefix + attr].clone().detach().to(device)) - - super()._load_from_state_dict(state_dict, prefix, *args, **kwargs) - def _get_properties_for_modelopt_state(self): return ( self.__dict__.keys() diff --git a/modelopt/torch/quantization/plugins/custom.py b/modelopt/torch/quantization/plugins/custom.py index 6e89b6668..4227f3c49 100644 --- a/modelopt/torch/quantization/plugins/custom.py +++ b/modelopt/torch/quantization/plugins/custom.py @@ -21,9 +21,7 @@ from types import ModuleType import torch -from packaging.version import Version -from modelopt import __version__ from modelopt.torch.utils.distributed import ParallelState from ..nn import QuantModule, SequentialQuantizer, TensorQuantizer @@ -174,19 +172,3 @@ def _has_state(quantizer, name): max_calibrate(self.output_quantizer, lambda oq: oq(dummy_input), distributed_sync=False) # If there are any other states, lets move them to the correct device super().modelopt_post_restore(prefix=prefix) - - def is_version_less_than(self, version: str) -> bool: - self_version = ( - Version(self.mopt_ckpt_versn) - if self.mopt_ckpt_versn is not None - else Version(__version__) - ) - - # version in NeMo container is 0.0.0 if installed from source without history - if self_version < Version(version) and self_version != Version("0.0.0"): - warnings.warn( - f"Checkpoint version {self_version} is less than {version}. " - "Please re-save model to avoid this warning." - ) - return True - return False diff --git a/modelopt/torch/quantization/plugins/vllm.py b/modelopt/torch/quantization/plugins/vllm.py index fb606b0d5..f5c10c87e 100644 --- a/modelopt/torch/quantization/plugins/vllm.py +++ b/modelopt/torch/quantization/plugins/vllm.py @@ -185,15 +185,3 @@ def forward(self, hidden_states: torch.Tensor, router_logits: torch.Tensor): output = super().forward(hidden_states, router_logits) self.invoke_fused_moe_quantized = self._invoke_fused_moe_quantized return output - - @property - def mopt_ckpt_versn(self): - """Checkpoint version of the modelopt.""" - return None - - @mopt_ckpt_versn.setter - def mopt_ckpt_versn(self, version: str): - """Set the checkpoint version for the TensorQuantizer states.""" - # vLLM defined an apply method that overwrites nn.Module.apply - # To avoid conflicting, disable the apply call here - # self.apply(_set_ckpt_version) diff --git a/tests/_test_utils/torch_quantization/checkpointing.py b/tests/_test_utils/torch_quantization/checkpointing.py deleted file mode 100644 index ec29320e3..000000000 --- a/tests/_test_utils/torch_quantization/checkpointing.py +++ /dev/null @@ -1,37 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import copy - -from packaging.version import Version - - -# Deprecation > 0.29 -def format_modelopt_checkpoint_by_version(modelopt_state: dict, version: str): - if Version(version) >= Version("0.29"): - return modelopt_state - modelopt_state = copy.deepcopy(modelopt_state) - modelopt_state["modelopt_version"] = version - for mode, state in modelopt_state["modelopt_state_dict"]: - if "quantizer_state" not in state["metadata"]: - continue - for quantizer_state in state["metadata"]["quantizer_state"].values(): - quantizer_state["_mopt_ckpt_versn"] = version - pyt_states = quantizer_state.pop("_pytorch_state_metadata", None) - if pyt_states is None: - continue - for k in pyt_states["buffers"]: - quantizer_state["_has" + k] = True - return modelopt_state diff --git a/tests/_test_utils/torch_quantization/quantize_common.py b/tests/_test_utils/torch_quantization/quantize_common.py index 02795099d..505eac2b6 100644 --- a/tests/_test_utils/torch_quantization/quantize_common.py +++ b/tests/_test_utils/torch_quantization/quantize_common.py @@ -26,8 +26,6 @@ from modelopt.torch.quantization.utils import is_quantized_linear from modelopt.torch.utils import torch_to -from .checkpointing import format_modelopt_checkpoint_by_version - INT4_AWQ_FULL_CFG = mtq.INT4_AWQ_CFG.copy() INT4_AWQ_FULL_CFG["algorithm"] = "awq_full" @@ -84,9 +82,6 @@ def save_restore_test(model_cls, device, quant_config, compress=False, version=N state_dict = mto.modelopt_state(model_quant) - if version is not None: - state_dict = format_modelopt_checkpoint_by_version(state_dict, version) - mto.restore_from_modelopt_state(model_ref, state_dict) model_ref.load_state_dict(model_quant.state_dict()) assert torch.allclose(model_quant(calib_data[0]), model_ref(calib_data[0])) diff --git a/tests/gpu/torch/quantization/test_quantize_cuda.py b/tests/gpu/torch/quantization/test_quantize_cuda.py index 95cc02d37..176e6e10b 100644 --- a/tests/gpu/torch/quantization/test_quantize_cuda.py +++ b/tests/gpu/torch/quantization/test_quantize_cuda.py @@ -92,8 +92,3 @@ def test_quantize(model_cls, config): ) def test_save_restore(model_cls, quant_config): save_restore_test(model_cls, "cuda", quant_config) - - -@pytest.mark.parametrize("version", [None, "0.29", "0.28"]) -def test_save_restore_all_versions(version): - save_restore_test(SimpleLinear, "cuda", mtq.INT8_DEFAULT_CFG, version=version) From 6a3edecc28ae8a6ab8cca0b49b388b24e0cc9d05 Mon Sep 17 00:00:00 2001 From: Chenjie Luo <108829653+cjluo-nv@users.noreply.github.com> Date: Tue, 16 Sep 2025 15:42:36 -0700 Subject: [PATCH 12/27] Upgrade TensorRT-LLM docker to 1.1.0RC2 (#327) Signed-off-by: Chenjie Luo Signed-off-by: Ye Yu --- CHANGELOG.rst | 1 + docker/Dockerfile | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index a52771f25..fffda4832 100755 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -23,6 +23,7 @@ Model Optimizer Changelog (Linux) **Bug Fixes** - Fix attention head ranking logic for pruning Megatron Core GPT models. +- Upgrade TensorRT-LLM dependency to 1.1.0rc2. 
**New Features** diff --git a/docker/Dockerfile b/docker/Dockerfile index 7776af712..3fd96a09f 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,4 +1,4 @@ -FROM nvcr.io/nvidia/tensorrt-llm/release:1.0.0rc6 +FROM nvcr.io/nvidia/tensorrt-llm/release:1.1.0rc2.post2 ARG PIP_EXTRA_INDEX_URL="https://pypi.nvidia.com" ENV PIP_EXTRA_INDEX_URL=$PIP_EXTRA_INDEX_URL \ From 68abf3e6ca3479c5964090ae716b8f6d551f35af Mon Sep 17 00:00:00 2001 From: realAsma <86726418+realAsma@users.noreply.github.com> Date: Tue, 16 Sep 2025 18:43:44 -0700 Subject: [PATCH 13/27] [1/N] QATTrainer training workflow fixes and clean up; Added backend specific unitests; (#318) Signed-off-by: realAsma Signed-off-by: Ye Yu --- examples/llm_qat/README.md | 1 - .../llm_qat/accelerate_config/deepspeed.yaml | 23 ++ examples/llm_qat/accelerate_config/fsdp1.yaml | 2 +- examples/llm_qat/convert_sharded_ckpt.py | 55 --- examples/llm_qat/launch.sh | 181 ++++------ .../llama_factory/launch_llamafactory.sh | 1 - examples/llm_qat/main.py | 19 +- examples/llm_qat/simple_qat_train.py | 6 +- examples/llm_qat/utils.py | 7 + modelopt/torch/opt/conversion.py | 15 +- modelopt/torch/opt/dynamic.py | 3 +- modelopt/torch/opt/plugins/peft.py | 13 +- .../torch/quantization/calib/histogram.py | 3 +- modelopt/torch/quantization/conversion.py | 6 +- .../quantization/nn/modules/quant_module.py | 6 +- .../plugins/transformers_trainer.py | 320 ++++++++---------- modelopt/torch/quantization/utils.py | 25 +- modelopt/torch/utils/network.py | 30 +- tests/_test_utils/examples/run_command.py | 10 +- tests/examples/llm_qat/test_llm_qat.py | 29 +- 20 files changed, 345 insertions(+), 410 deletions(-) create mode 100644 examples/llm_qat/accelerate_config/deepspeed.yaml delete mode 100644 examples/llm_qat/convert_sharded_ckpt.py diff --git a/examples/llm_qat/README.md b/examples/llm_qat/README.md index 522c29919..801f7ee22 100644 --- a/examples/llm_qat/README.md +++ b/examples/llm_qat/README.md @@ -82,7 +82,6 @@ def forward_loop(model): # Quantize the model in-place; The model should be unwrapped from any distributed wrapper -# The model may be wrapped in a DataParallel or DistributedDataParallel after `mtq.quantize` model = mtq.quantize(model, mtq.INT8_DEFAULT_CFG, forward_loop) # Save the modelopt quantizer states diff --git a/examples/llm_qat/accelerate_config/deepspeed.yaml b/examples/llm_qat/accelerate_config/deepspeed.yaml new file mode 100644 index 000000000..913bb1572 --- /dev/null +++ b/examples/llm_qat/accelerate_config/deepspeed.yaml @@ -0,0 +1,23 @@ +compute_environment: LOCAL_MACHINE +debug: false +deepspeed_config: + gradient_clipping: 1.0 + offload_optimizer_device: none + offload_param_device: none + zero3_init_flag: true + zero3_save_16bit_model: true + zero_stage: 3 +distributed_type: DEEPSPEED +downcast_bf16: 'no' +enable_cpu_affinity: false +machine_rank: 0 +main_training_function: main +mixed_precision: bf16 +num_machines: 1 +num_processes: gpu +rdzv_backend: static +same_network: true +tpu_env: [] +tpu_use_cluster: false +tpu_use_sudo: false +use_cpu: false diff --git a/examples/llm_qat/accelerate_config/fsdp1.yaml b/examples/llm_qat/accelerate_config/fsdp1.yaml index fc80dd358..5e0f5e652 100644 --- a/examples/llm_qat/accelerate_config/fsdp1.yaml +++ b/examples/llm_qat/accelerate_config/fsdp1.yaml @@ -4,7 +4,7 @@ distributed_type: FSDP downcast_bf16: 'no' enable_cpu_affinity: false fsdp_config: - fsdp_activation_checkpointing: false + fsdp_activation_checkpointing: true fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP 
fsdp_backward_prefetch: BACKWARD_PRE fsdp_cpu_ram_efficient_loading: true diff --git a/examples/llm_qat/convert_sharded_ckpt.py b/examples/llm_qat/convert_sharded_ckpt.py deleted file mode 100644 index aa762709a..000000000 --- a/examples/llm_qat/convert_sharded_ckpt.py +++ /dev/null @@ -1,55 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import os - -from transformers import AutoModelForCausalLM - -import modelopt.torch.opt as mto -from modelopt.torch.quantization.plugins.transformers_trainer import ( - convert_sharded_model_to_hf_format, -) - -# Enable ModelOpt checkpointing for HuggingFace models -mto.enable_huggingface_checkpointing() - - -def main(): - parser = argparse.ArgumentParser(description="Convert sharded checkpoint to HuggingFace format") - parser.add_argument( - "--hf_model_path", type=str, required=True, help="Path to the original HuggingFace model" - ) - parser.add_argument( - "--sharded_ckpt_path", - type=str, - required=True, - help="Path to the sharded checkpoint directory", - ) - parser.add_argument( - "--output_path", type=str, default="", help="Output path to save the converted model" - ) - - args = parser.parse_args() - - model = AutoModelForCausalLM.from_pretrained(args.hf_model_path) - if os.path.exists(os.path.join(args.sharded_ckpt_path, "pytorch_model_fsdp_0")): - convert_sharded_model_to_hf_format( - model, args.sharded_ckpt_path, "modelopt_state_train.pth", args.output_path - ) - - -if __name__ == "__main__": - main() diff --git a/examples/llm_qat/launch.sh b/examples/llm_qat/launch.sh index 879db8fd8..5d9fc3a7b 100755 --- a/examples/llm_qat/launch.sh +++ b/examples/llm_qat/launch.sh @@ -18,96 +18,37 @@ set -eo pipefail export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True +# Helper function to parse a single argument value +parse_value() { + if [[ "$1" != *=* ]]; then shift; fi + echo "${1#*=}" +} + while [ $# -gt 0 ]; do case "$1" in - --model*) - if [[ "$1" != *=* ]]; then shift; fi - MODEL="${1#*=}" - ;; - --output_dir*) - if [[ "$1" != *=* ]]; then shift; fi - OUTPUT_DIR="${1#*=}" - ;; - --dataset*) - if [[ "$1" != *=* ]]; then shift; fi - DATASET="${1#*=}" - ;; - --train_size*) - if [[ "$1" != *=* ]]; then shift; fi - TRAIN_SIZE="${1#*=}" - ;; - --eval_size*) - if [[ "$1" != *=* ]]; then shift; fi - EVAL_SIZE="${1#*=}" - ;; - --num_epochs*) - if [[ "$1" != *=* ]]; then shift; fi - NUM_EPOCHS="${1#*=}" - ;; - --max_steps*) - if [[ "$1" != *=* ]]; then shift; fi - MAX_STEPS="${1#*=}" - ;; - --save_steps*) - if [[ "$1" != *=* ]]; then shift; fi - SAVE_STEPS="${1#*=}" - ;; - --accum_steps*) - if [[ "$1" != *=* ]]; then shift; fi - ACCUM_STEPS="${1#*=}" - ;; - --lr*) - if [[ "$1" != *=* ]]; then shift; fi - LR="${1#*=}" - ;; - --quant_cfg*) - if [[ "$1" != *=* ]]; then shift; fi - QUANT_CFG="${1#*=}" - ;; - --compress*) - if [[ "$1" != *=* ]]; then shift; fi - COMPRESS="${1#*=}" - ;; - 
--calib_size*) - if [[ "$1" != *=* ]]; then shift; fi - CALIB_SIZE="${1#*=}" - ;; - --train_bs*) - if [[ "$1" != *=* ]]; then shift; fi - TRAIN_BS="${1#*=}" - ;; - --eval_bs*) - if [[ "$1" != *=* ]]; then shift; fi - EVAL_BS="${1#*=}" - ;; - --do_train*) - if [[ "$1" != *=* ]]; then shift; fi - DO_TRAIN="${1#*=}" - ;; - --lora*) - if [[ "$1" != *=* ]]; then shift; fi - LORA="${1#*=}" - ;; - --teacher_model*) - if [[ "$1" != *=* ]]; then shift; fi - TEACHER_MODEL="${1#*=}" - ;; - --distill*) - if [[ "$1" != *=* ]]; then shift; fi - DISTILL="${1#*=}" - ;; - --fsdp_transformer_layer_cls_to_wrap*) - if [[ "$1" != *=* ]]; then shift; fi - FSDP_TRANSFORMER_LAYER_CLS_TO_WRAP="${1#*=}" - ;; - --use_fsdp2*) - if [[ "$1" != *=* ]]; then shift; fi - USE_FSDP2="${1#*=}" - ;; - --max_seq_length*) - if [[ "$1" != *=* ]]; then shift; fi - MAX_SEQ_LENGTH="${1#*=}" - ;; + --model*) MODEL=$(parse_value "$@"); [[ "$1" != *=* ]] && shift ;; + --output_dir*) OUTPUT_DIR=$(parse_value "$@"); [[ "$1" != *=* ]] && shift ;; + --dataset*) DATASET=$(parse_value "$@"); [[ "$1" != *=* ]] && shift ;; + --train_size*) TRAIN_SIZE=$(parse_value "$@"); [[ "$1" != *=* ]] && shift ;; + --eval_size*) EVAL_SIZE=$(parse_value "$@"); [[ "$1" != *=* ]] && shift ;; + --num_epochs*) NUM_EPOCHS=$(parse_value "$@"); [[ "$1" != *=* ]] && shift ;; + --max_steps*) MAX_STEPS=$(parse_value "$@"); [[ "$1" != *=* ]] && shift ;; + --save_steps*) SAVE_STEPS=$(parse_value "$@"); [[ "$1" != *=* ]] && shift ;; + --accum_steps*) ACCUM_STEPS=$(parse_value "$@"); [[ "$1" != *=* ]] && shift ;; + --lr*) LR=$(parse_value "$@"); [[ "$1" != *=* ]] && shift ;; + --quant_cfg*) QUANT_CFG=$(parse_value "$@"); [[ "$1" != *=* ]] && shift ;; + --compress*) COMPRESS=$(parse_value "$@"); [[ "$1" != *=* ]] && shift ;; + --calib_size*) CALIB_SIZE=$(parse_value "$@"); [[ "$1" != *=* ]] && shift ;; + --train_bs*) TRAIN_BS=$(parse_value "$@"); [[ "$1" != *=* ]] && shift ;; + --eval_bs*) EVAL_BS=$(parse_value "$@"); [[ "$1" != *=* ]] && shift ;; + --do_train*) DO_TRAIN=$(parse_value "$@"); [[ "$1" != *=* ]] && shift ;; + --lora*) LORA=$(parse_value "$@"); [[ "$1" != *=* ]] && shift ;; + --teacher_model*) TEACHER_MODEL=$(parse_value "$@"); [[ "$1" != *=* ]] && shift ;; + --distill*) DISTILL=$(parse_value "$@"); [[ "$1" != *=* ]] && shift ;; + --fsdp_transformer_layer_cls_to_wrap*) FSDP_TRANSFORMER_LAYER_CLS_TO_WRAP=$(parse_value "$@"); [[ "$1" != *=* ]] && shift ;; + --max_seq_length*) MAX_SEQ_LENGTH=$(parse_value "$@"); [[ "$1" != *=* ]] && shift ;; + --backend*) BACKEND=$(parse_value "$@"); [[ "$1" != *=* ]] && shift ;; + --use_fsdp2*) USE_FSDP2=$(parse_value "$@"); [[ "$1" != *=* ]] && shift ;; *) >&2 printf "Error: Invalid argument ${1#*=}\n" exit 1 @@ -142,6 +83,7 @@ COMPRESS=${COMPRESS:-"False"} DISTILL=${DISTILL:-"False"} TEACHER_MODEL=${TEACHER_MODEL:-$MODEL} FSDP_TRANSFORMER_LAYER_CLS_TO_WRAP=${FSDP_TRANSFORMER_LAYER_CLS_TO_WRAP:-"LlamaDecoderLayer"} +BACKEND=${BACKEND:-"fsdp1"} if [ -z $QUANT_CFG ]; then QUANT_ARGS="" @@ -154,31 +96,55 @@ if [ ! -z $MAX_STEPS ]; then OPTIONAL_ARGS="$OPTIONAL_ARGS --max_steps $MAX_STEPS" fi -CONFIG_FILE="fsdp1.yaml" -FSDP_ARGS="--fsdp_transformer_layer_cls_to_wrap $FSDP_TRANSFORMER_LAYER_CLS_TO_WRAP" -GRADIENT_CHECKPOINTING_ARGS="--gradient_checkpointing True" - +# Set backend based on --backend parameter, with backward compatibility for --use_fsdp2 if [[ "${USE_FSDP2,,}" == "true" ]]; then - echo "Using FSDP2 instead of FSDP1. FSDP2 is not mature yet! Please use it with latest torch and transformers." 
- CONFIG_FILE="fsdp2.yaml" - GRADIENT_CHECKPOINTING_ARGS="" + echo "Warning: --use_fsdp2 is deprecated. Use --backend=fsdp2 instead." + BACKEND="fsdp2" +fi + +# if compress is true, set backend to ddp +if [[ "${COMPRESS,,}" == "true" ]]; then + BACKEND="ddp" fi +# Configure backend-specific settings +GRADIENT_CHECKPOINTING_ARGS="" +case "${BACKEND,,}" in + "fsdp1"|"fsdp") + CONFIG_FILE="fsdp1.yaml" + FSDP_ARGS="--fsdp_transformer_layer_cls_to_wrap $FSDP_TRANSFORMER_LAYER_CLS_TO_WRAP" + ;; + "fsdp2") + echo "Using FSDP2 instead of FSDP1. FSDP2 is not mature yet! Please use it with latest torch and transformers." + CONFIG_FILE="fsdp2.yaml" + FSDP_ARGS="--fsdp_transformer_layer_cls_to_wrap $FSDP_TRANSFORMER_LAYER_CLS_TO_WRAP" + ;; + "ddp") + CONFIG_FILE="ddp.yaml" + FSDP_ARGS="" + GRADIENT_CHECKPOINTING_ARGS="--gradient_checkpointing True" + ;; + "deepspeed") + CONFIG_FILE="deepspeed.yaml" + FSDP_ARGS="" + GRADIENT_CHECKPOINTING_ARGS="--gradient_checkpointing True" + ;; + *) + echo "Error: Invalid backend '$BACKEND'. Supported backends: fsdp1, fsdp2, ddp, deepspeed" + exit 1 + ;; +esac + +# TODO: Remove this after simple distillation is supported DISTILLATION_ARGS="" if [[ "${DISTILL,,}" == "true" ]]; then DISTILLATION_ARGS="--distill $DISTILL --teacher_model $TEACHER_MODEL" - # Distillation does not work with memory efficient loading - FSDP_ARGS="$FSDP_ARGS --fsdp_cpu_ram_efficient_loading False" + # Distillation does not work with memory efficient loading for FSDP + if [[ "${BACKEND,,}" == "fsdp1" || "${BACKEND,,}" == "fsdp2" ]]; then + FSDP_ARGS="$FSDP_ARGS --fsdp_cpu_ram_efficient_loading False" + fi fi -# real quantization does not work with FSDP, only works with FSDP2 -if [[ "${COMPRESS,,}" == "true" && "${USE_FSDP2,,}" != "true" ]]; then - echo "Compression is not supported with FSDP. Disabling FSDP and using DDP." 
- FSDP_ARGS="" - CONFIG_FILE="ddp.yaml" -fi - - CMD="accelerate launch --config-file accelerate_config/$CONFIG_FILE $FSDP_ARGS \ main.py \ --model_name_or_path $MODEL \ @@ -209,10 +175,9 @@ CMD="accelerate launch --config-file accelerate_config/$CONFIG_FILE $FSDP_ARGS \ --report_to tensorboard \ --lora $LORA \ --compress $COMPRESS \ - $QUANT_ARGS $OPTIONAL_ARGS $GRADIENT_CHECKPOINTING_ARGS $DISTILLATION_ARGS + $GRADIENT_CHECKPOINTING_ARGS $QUANT_ARGS $OPTIONAL_ARGS $DISTILLATION_ARGS " start_time=$(date +%s) sh -c "$CMD" -echo "Total time taken: $(( $(date +%s) - $start_time )) seconds" -python convert_sharded_ckpt.py --hf_model_path $MODEL --sharded_ckpt_path $OUTPUT_DIR --output_path $OUTPUT_DIR +echo "Total time taken: $(( $(date +%s) - $start_time )) seconds" \ No newline at end of file diff --git a/examples/llm_qat/llama_factory/launch_llamafactory.sh b/examples/llm_qat/llama_factory/launch_llamafactory.sh index 49551802d..23e06f26a 100644 --- a/examples/llm_qat/llama_factory/launch_llamafactory.sh +++ b/examples/llm_qat/llama_factory/launch_llamafactory.sh @@ -256,4 +256,3 @@ else echo "Modified FSDP args: $FSDP_ARGS" accelerate launch --config_file $ACCELERATE_CONFIG $FSDP_ARGS $SCRIPT_DIR/llama_factory.py $CONFIG_FILE fi -python $SCRIPT_DIR/../convert_sharded_ckpt.py --hf_model_path $MODEL --sharded_ckpt_path $OUTPUT_DIR --output_path $OUTPUT_DIR diff --git a/examples/llm_qat/main.py b/examples/llm_qat/main.py index b711b9bb7..30f49a6a5 100644 --- a/examples/llm_qat/main.py +++ b/examples/llm_qat/main.py @@ -38,6 +38,7 @@ from transformers.trainer_utils import get_last_checkpoint from utils import ( get_lora_config, + get_metrics_with_perplexity, make_supervised_data_module, monkey_patch_training_step_to_fix_memory_leak, ) @@ -45,11 +46,7 @@ import modelopt.torch.opt as mto import modelopt.torch.quantization as mtq from modelopt.torch.distill.plugins.huggingface import LMLogitsLoss -from modelopt.torch.quantization.plugins.transformers_trainer import ( - QADTrainer, - QATTrainer, - get_metrics_with_perplexity, -) +from modelopt.torch.quantization.plugins.transformers_trainer import QADTrainer, QATTrainer from modelopt.torch.utils import print_rank_0 # Enable automatic save/load of modelopt state huggingface checkpointing @@ -263,16 +260,12 @@ def train(): if training_args.do_train: trainer.train(resume_from_checkpoint=checkpoint) + print_rank_0("Training completed.") if training_args.do_eval: - if not training_args.do_train: - # trainer.evaluate() will not prepare the model properly, especially for FSDP2, - # so we use the ``eval_on_start`` flag to evaluate the model and skip the training. 
- trainer.train(resume_from_checkpoint=checkpoint, eval_only=True) - else: - metrics = trainer.evaluate() - metrics = get_metrics_with_perplexity(metrics) - print_rank_0(f"Evaluation results: \n{metrics}") + metrics = trainer.evaluate() + metrics = get_metrics_with_perplexity(metrics) + print_rank_0(f"Evaluation results: \n{metrics}") if training_args.do_train or quant_args.quant_cfg is not None: print_rank_0("Saving the model...") diff --git a/examples/llm_qat/simple_qat_train.py b/examples/llm_qat/simple_qat_train.py index 367958028..853102784 100644 --- a/examples/llm_qat/simple_qat_train.py +++ b/examples/llm_qat/simple_qat_train.py @@ -74,7 +74,7 @@ def train(model, optimizer, train_dataloader, tokenizer, epochs, output_dir, dev def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser(description="CNN QAT using ModelOpt") + parser = argparse.ArgumentParser(description="QAT Training Script") # Data paths parser.add_argument("--model-path", type=str, required=True, help="Path to the model") parser.add_argument("--train-size", type=int, default=2048, help="Train size") @@ -87,7 +87,7 @@ def parse_args() -> argparse.Namespace: parser.add_argument( "--quant-cfg", type=str, - default=mtq.NVFP4_DEFAULT_CFG, + default="NVFP4_DEFAULT_CFG", choices=mtq.config.choices, help="Quantization configuration", ) @@ -121,7 +121,7 @@ def calibrate(m: nn.Module): m(batch["input_ids"].to(device)) # Quantize the model - model = mtq.quantize(model, args.quant_cfg, calibrate) + model = mtq.quantize(model, getattr(mtq, args.quant_cfg), calibrate) # Initialize optimizer optimizer = AdamW(model.parameters(), lr=args.lr) diff --git a/examples/llm_qat/utils.py b/examples/llm_qat/utils.py index ac4a544a6..bb70bdf12 100644 --- a/examples/llm_qat/utils.py +++ b/examples/llm_qat/utils.py @@ -167,3 +167,10 @@ def new_func(original_f_name, trainer, *args, **kwargs): setattr( trainer, f_name, types.MethodType(partial(new_func, "_original_" + f_name), trainer) ) + + +def get_metrics_with_perplexity(metrics): + """Add perplexity to the metrics.""" + if "eval_loss" in metrics: + metrics["perplexity"] = float(torch.exp(torch.tensor(metrics["eval_loss"]))) + return metrics diff --git a/modelopt/torch/opt/conversion.py b/modelopt/torch/opt/conversion.py index 468c36e8c..183514f9e 100644 --- a/modelopt/torch/opt/conversion.py +++ b/modelopt/torch/opt/conversion.py @@ -380,7 +380,7 @@ def apply_mode( return model.init_modellike() if isinstance(model, ModelLikeModule) else model # check if the model is in a wrapper - model = unwrap_model(model, raise_error=True) + model = unwrap_model(model, force_unwrap=True) # standardize mode to ModeConfigList mode_and_config = get_mode_config(mode) @@ -493,10 +493,6 @@ def save(model: nn.Module, f: str | os.PathLike | BinaryIO, **kwargs) -> None: model: Any model. f: Target file location. **kwargs: additional args for ``torch.save()``. - - .. note:: - - If model is a wrapper such as DistributedDataParallel, it will be unwrapped for saving. """ # unwrap model model = unwrap_model(model, warn=True) @@ -545,11 +541,6 @@ def restore_from_modelopt_state(model: ModelLike, modelopt_state: dict[str, Any] Returns: A modified model architecture based on the restored modifications with the unmodified weights as stored in the provided ``model`` argument. - - .. note:: - - Note that wrappers such as DistributedDataParallel are `not` supported during the restore - process. Please wrap the model after the restore process. """ # initialize ModelLikeModule if needed. 
model = model if isinstance(model, nn.Module) else ModelLikeModule(model) @@ -590,13 +581,15 @@ def restore(model: ModelLike, f: str | os.PathLike | BinaryIO, **kwargs) -> nn.M The model with original weights and stored architecture. .. note:: - Note that wrappers such as DistributedDataParallel are `not` supported during the restore process. Please wrap the model after the restore process. """ # initialize ModelLikeModule if needed. model = model if isinstance(model, nn.Module) else ModelLikeModule(model) + # check if the model is in a wrapper; we dont support restoring with wrappers + model = unwrap_model(model, raise_error=True) + # load checkpoint kwargs.setdefault("map_location", "cpu") kwargs.setdefault("weights_only", False) diff --git a/modelopt/torch/opt/dynamic.py b/modelopt/torch/opt/dynamic.py index 533b1f052..ac4143673 100644 --- a/modelopt/torch/opt/dynamic.py +++ b/modelopt/torch/opt/dynamic.py @@ -1273,7 +1273,8 @@ def config(self, configurable: bool | None = None) -> dict[str, Any]: A dict of ``(parameter_name, choice)`` that specifies an active subnet. """ return { - get_unwrapped_name(name): hp.active for name, hp in self.named_hparams(configurable) + get_unwrapped_name(name, self.model): hp.active + for name, hp in self.named_hparams(configurable) } def select(self, config: dict[str, Any], strict: bool = True) -> None: diff --git a/modelopt/torch/opt/plugins/peft.py b/modelopt/torch/opt/plugins/peft.py index 55855d50f..5e5ed0f93 100644 --- a/modelopt/torch/opt/plugins/peft.py +++ b/modelopt/torch/opt/plugins/peft.py @@ -57,14 +57,9 @@ def _new_save_pretrained_peft(self, save_directory, *args, **kwargs): # So we need to save the quantizer state_dict separately # TODO: Move this to modelopt.torch.quantization.plugins.peft - from modelopt.torch.quantization.nn import TensorQuantizer - - # We should not call self/model.state_dict() here. HF Trainer calls model.save_pretrained() only from process 0 - # With FSDP, model.state_dict() will hang if it is not called from all processes - quantizer_state_dict = {} - for name, module in self.named_modules(): - if isinstance(module, TensorQuantizer): - quantizer_state_dict[get_unwrapped_name(name)] = module.state_dict() + from modelopt.torch.quantization.utils import get_quantizer_state_dict + + quantizer_state_dict = get_quantizer_state_dict(self) if len(quantizer_state_dict) > 0: torch.save(quantizer_state_dict, _get_quantizer_state_save_path(save_directory)) return outputs @@ -95,7 +90,7 @@ def _new_load_adapter(self, model_id, adapter_name, *args, **kwargs): ) for name, module in self.named_modules(): if isinstance(module, TensorQuantizer): - module.load_state_dict(quantizer_state_dict[get_unwrapped_name(name)]) + module.load_state_dict(quantizer_state_dict[get_unwrapped_name(name, self)]) return outputs diff --git a/modelopt/torch/quantization/calib/histogram.py b/modelopt/torch/quantization/calib/histogram.py index d0a13bc2a..e27a5471e 100644 --- a/modelopt/torch/quantization/calib/histogram.py +++ b/modelopt/torch/quantization/calib/histogram.py @@ -157,8 +157,7 @@ def compute_amax( """ if dist.is_initialized(): warnings.warn( - "This method does not perform any synchronization across DistributedDataParallel" - " (DDP) https://pytorch.org/docs/stable/notes/ddp.html modules. The recommended" + "This method does not perform any synchronization across distributed processes. 
The recommended" " method is to use the same calibration dataset across all distributed data" " parallel groups so that `amax` is the same for all DDP modules." ) diff --git a/modelopt/torch/quantization/conversion.py b/modelopt/torch/quantization/conversion.py index 6a457f172..7c2f84b8b 100644 --- a/modelopt/torch/quantization/conversion.py +++ b/modelopt/torch/quantization/conversion.py @@ -123,12 +123,12 @@ def restore_quantizer_state(model: nn.Module, config: QuantizeConfig, metadata: for name, module in model.named_modules(): if isinstance(module, TensorQuantizer): - name = get_unwrapped_name(name) + name = get_unwrapped_name(name, model) module.set_from_modelopt_state(quantizer_state_dict[name]) for name, module in model.named_modules(): if isinstance(module, QuantModule): - name = get_unwrapped_name(name) + name = get_unwrapped_name(name, model) module.modelopt_post_restore(name) return model @@ -166,7 +166,7 @@ def update_quantize_metadata( def quantizer_state(model: nn.Module) -> dict[str, Any]: """Returns the quantizer state dict describing the quantizer states in the model.""" return { - get_unwrapped_name(n): m.get_modelopt_state() + get_unwrapped_name(n, model): m.get_modelopt_state() for n, m in model.named_modules() if isinstance(m, (TensorQuantizer, SequentialQuantizer)) } diff --git a/modelopt/torch/quantization/nn/modules/quant_module.py b/modelopt/torch/quantization/nn/modules/quant_module.py index 93df3651b..12aaee3f8 100644 --- a/modelopt/torch/quantization/nn/modules/quant_module.py +++ b/modelopt/torch/quantization/nn/modules/quant_module.py @@ -140,8 +140,10 @@ class QuantLinearConvBase(QuantInputBase): def quantize_weight(self): """Context in which `self.weight` is quantized.""" self._enable_weight_quantization = True - yield - self._enable_weight_quantization = False + try: + yield + finally: + self._enable_weight_quantization = False @staticmethod def _get_quantized_weight(module: "QuantLinearConvBase", weight: torch.Tensor) -> torch.Tensor: diff --git a/modelopt/torch/quantization/plugins/transformers_trainer.py b/modelopt/torch/quantization/plugins/transformers_trainer.py index 017d8160d..e6a0a2b7d 100644 --- a/modelopt/torch/quantization/plugins/transformers_trainer.py +++ b/modelopt/torch/quantization/plugins/transformers_trainer.py @@ -17,12 +17,11 @@ import gc import os -from contextlib import suppress +import types from dataclasses import dataclass, field import torch -import torch.distributed.checkpoint as dist_cp -from accelerate.utils import save_fsdp_model +from tqdm import tqdm import modelopt.torch.opt as mto import modelopt.torch.quantization as mtq @@ -32,10 +31,13 @@ from modelopt.torch.opt.conversion import restore_from_modelopt_state from modelopt.torch.opt.plugins import ModelOptHFTrainer from modelopt.torch.quantization.config import QuantizeConfig +from modelopt.torch.quantization.nn import TensorQuantizer from modelopt.torch.quantization.utils import ( calibrate_with_adapters, disable_lora_quantizers_in_config, + get_quantizer_state_dict, is_quantized, + set_quantizer_state_dict, ) from modelopt.torch.utils import print_rank_0 @@ -98,10 +100,6 @@ class QuantizationArgumentsWithConfig(QuantizationArguments): ) -class EvalOnlyError(Exception): - """Exception to raise when evaluation is only needed.""" - - def check_awq_smoothquant(quant_cfg): # TODO: Remove this once deepspeed for AWQ and SmoothQuant is added """Get the quantization type from the configuration.""" @@ -116,54 +114,6 @@ def check_awq_smoothquant(quant_cfg): return 
is_awq_smoothquant -def get_metrics_with_perplexity(metrics): - """Add perplexity to the metrics.""" - metrics = {"perplexity": float(torch.exp(torch.tensor(metrics["eval_loss"]))), **metrics} - return metrics - - -def convert_sharded_model_to_hf_format( - model, model_path, modelopt_state_name="modelopt_state.pth", output_path=None -): - """Convert a sharded model to HF format. - - Args: - model: The original HF model. - model_path: The path to the sharded model with pytorch_model_fsdp_0 directory. - modelopt_state_name: The name of the modelopt state file. If not provided, the default name - "modelopt_state.pth" will be used. - output_path: The path to save the converted model. If not provided, the model will be saved - to the same directory as the sharded model. - """ - if output_path is None: - output_path = model_path - os.makedirs(output_path, exist_ok=True) - state_dict = {"model": model.state_dict()} - sharded_model_path = os.path.join(model_path, "pytorch_model_fsdp_0") - modelopt_state_path = os.path.join(model_path, modelopt_state_name) - if not os.path.exists(sharded_model_path): - print_rank_0(f"Sharded model path does not exist: {sharded_model_path}") - return model - dist_cp.load_state_dict( - state_dict=state_dict, - storage_reader=dist_cp.FileSystemReader(sharded_model_path), - no_dist=True, - ) - model.load_state_dict(state_dict["model"]) - restore_modelopt_state_with_weights(model, modelopt_state_path) - model.save_pretrained(output_path) - return model - - -def restore_modelopt_state_with_weights(model, modelopt_state_path): - """Restore the modelopt weights for fsdp2 models.""" - _modelopt_state = torch.load(modelopt_state_path, weights_only=False) - modelopt_weights = _modelopt_state.pop("modelopt_state_weights", None) - restore_from_modelopt_state(model, _modelopt_state) - if modelopt_weights is not None: - model.load_state_dict(modelopt_weights, strict=False) - - class QATTrainer(ModelOptHFTrainer): """A drop-in replacement of HuggingFace's Trainer for quantization aware training with ModelOpt. @@ -190,15 +140,6 @@ def __init__( else quant_args.quant_cfg ) self.quant_cfg = quant_cfg - self._eval_without_training = False - - self._is_fsdp2 = self.is_fsdp_enabled and ( - getattr(self.accelerator.state.fsdp_plugin, "fsdp_version", 1) == 2 - ) - self.fsdp_state_dict_type = ( - str(self.accelerator.state.fsdp_plugin.state_dict_type) if self.is_fsdp_enabled else "" - ) - self._modelopt_state_path = os.path.join(self.args.output_dir, "modelopt_state_train.pth") # Add lora adapter before quantizing the model if getattr(self.args, "lora_config", None) is not None and not hasattr( @@ -219,144 +160,159 @@ def __init__( f"QAT DeepSpeed does not currently support AWQ or SmoothQuant: {self.quant_cfg}" ) - # FSDP1 requires pre-restoring the quantized model if the modelopt state exists. 
- if os.path.exists(self._modelopt_state_path) and not self._is_fsdp2: - self._quantize_model() - - def _get_quantize_forward_loop(self, data_loader, use_eval_loop=True): - def forward_loop(_model): - print_rank_0("Calibrating...") - if use_eval_loop: - return self.evaluation_loop( - data_loader, - description="Calibration", - prediction_loss_only=True, - ignore_keys=None, - metric_key_prefix="calibration", - ) - else: - for batch in data_loader: - batch = self._prepare_inputs(batch) - _model(**batch) - print_rank_0("Calibration done!") + self._patch_accelerate_for_fsdp2_fix() - return forward_loop + self._modelopt_state_path = os.path.join(self.args.output_dir, "modelopt_state_train.pth") + if os.path.exists(self._modelopt_state_path): + self._restore_modelopt_state_with_weights() + elif is_quantized(self.model): + self._save_modelopt_state_with_weights() - def _save_modelopt_state_with_weights(self, model, modelopt_state_path, save_weights=False): + def _save_modelopt_state_with_weights(self): """Save the modelopt weights for fsdp2 models.""" - modelopt_state = mto.modelopt_state(model) + if torch.distributed.is_initialized(): + torch.distributed.barrier() + + modelopt_state = mto.modelopt_state(self.model) + # TODO: remove this from ModelOpt HF Trainer flows modelopt_state["modelopt_state_dict"] = [ state for state in modelopt_state["modelopt_state_dict"] if "kd_loss" not in state and "export_student" not in state ] - if save_weights: - state_dict = model.state_dict() - modelopt_weights = {} - for k, v in state_dict.items(): - if "_quantizer" in k: - modelopt_weights[k] = v.cpu() - modelopt_state["modelopt_state_weights"] = modelopt_weights + modelopt_state["modelopt_state_weights"] = get_quantizer_state_dict(self.model) if self.args.should_save: - torch.save(modelopt_state, modelopt_state_path) + torch.save(modelopt_state, self._modelopt_state_path) - if torch.distributed.is_initialized(): - torch.distributed.barrier() + print_rank_0(f"Saved modelopt state to {self._modelopt_state_path}") - def _quantize_model(self, use_eval_loop=True): + def _restore_modelopt_state_with_weights(self): + modelopt_state = torch.load(self._modelopt_state_path, weights_only=False) + modelopt_weights = modelopt_state.pop("modelopt_state_weights", None) + restore_from_modelopt_state(self.model, modelopt_state) + if modelopt_weights is not None: + set_quantizer_state_dict(self.model, modelopt_weights) + print_rank_0("Restored modelopt state with weights.") + + def _quantize_model(self): """Quantize the model. 
Restore the quantization state if it exists.""" - model = self.accelerator.unwrap_model(self.model) - if os.path.exists(self._modelopt_state_path): - print_rank_0(f"Restoring modelopt state from {self._modelopt_state_path}...") - restore_modelopt_state_with_weights(self.model, self._modelopt_state_path) - print_rank_0("Restored model from modelopt state.") - else: - dataset = torch.utils.data.Subset( - self.eval_dataset, - list(range(min(self.quant_args.calib_size, len(self.eval_dataset)))), # type: ignore [union-attr] - ) - data_loader = self.get_eval_dataloader(dataset) - forward_loop = self._get_quantize_forward_loop(data_loader, use_eval_loop) - with calibrate_with_adapters(model, self.args): - print_rank_0("Quantizing the model...") - mtq.quantize(model, self.quant_cfg, forward_loop) # type: ignore [arg-type] - print_rank_0("Quantization done!") - - if getattr(self.quant_args, "compress", False): - print_rank_0("Compressing model after calibration") - mtq.compress(model) - - # Force garbage collection to free up memory - gc.collect() - - print_rank_0(f"Saving modelopt state to {self._modelopt_state_path}") - self._save_modelopt_state_with_weights( - model, self._modelopt_state_path, save_weights=True - ) - torch.cuda.empty_cache() - if use_eval_loop: - self.callback_handler.on_evaluate(self, self.state, self.control, metrics=None) + dataset = self.train_dataset if self.train_dataset is not None else self.eval_dataset + assert dataset is not None, "Calibration requires either eval or train dataset." + num_samples = min(self.quant_args.calib_size, len(dataset)) # type: ignore [union-attr] + dataset = torch.utils.data.Subset(dataset, list(range(num_samples))) + data_loader = self.get_eval_dataloader(dataset) + + def forward_loop(model): + for batch in tqdm(data_loader, desc="Calibrating", disable=not self.args.should_save): + batch = self._prepare_inputs(batch) + # Important: We should forward pass using the unwrapped model + # mtq.quantize will unwrap the model & pass to the forward_loop + self.model(**batch) + + # TODO: Remove calibrate_with_adapters - this should not be needed + with calibrate_with_adapters(self.model, self.args): + print_rank_0("Quantizing the model...") + mtq.quantize(self.model, self.quant_cfg, forward_loop) # type: ignore [arg-type] + + if getattr(self.quant_args, "compress", False): + print_rank_0("Compressing model after calibration") + mtq.compress(self.model) + + # Force garbage collection to free up memory + gc.collect() + + self._save_modelopt_state_with_weights() + torch.cuda.empty_cache() if self.accelerator.is_main_process: - mtq.print_quant_summary(model) + mtq.print_quant_summary(self.model) - def _evaluate(self, *args, **kwargs): - """Quantize the model before evaluation. + def training_step(self, *args, **kwargs): + """Training step.""" + if self.quant_cfg is not None and not is_quantized(self.model): + self._quantize_model() + return super().training_step(*args, **kwargs) - Note that we do not force to run the evaluation if the `eval_on_start` is False. 
- """ + def prediction_step(self, *args, **kwargs): + """Prediction step.""" if self.quant_cfg is not None and not is_quantized(self.model): self._quantize_model() - metrics = None - if self._original_evaluate_on_start: - metrics = super()._evaluate(*args, **kwargs) - else: - metrics = super()._evaluate(*args, **kwargs) - # used for eval without training - if self._eval_without_training: - metrics = get_metrics_with_perplexity(metrics) - print_rank_0(f"Evaluation results: \n{metrics}") - raise EvalOnlyError() - return metrics - - def train(self, *args, eval_only=False, **kwargs): - """Train the model with quantization.""" - self._eval_without_training = eval_only - self._original_evaluate_on_start = ( - self.args.eval_on_start if not self._eval_without_training else True + return super().prediction_step(*args, **kwargs) + + def evaluate(self, *args, **kwargs): + """Evaluate the model.""" + if self.args.do_eval and not self.args.do_train and self.accelerator.is_fsdp2: + # [Not related to ModelOpt] HF does not support eval only for FSDP2. + # This is a hack to make it work + dummy_optimizer = torch.optim.SGD([next(self.model.parameters())], lr=0.0) + self.model, _ = self.accelerator.prepare(self.model, dummy_optimizer) + return super().evaluate(*args, **kwargs) + + def train(self, *args, **kwargs): + """Train the model.""" + outputs = super().train(*args, **kwargs) + print_rank_0( + "Training completed. Please save the final model using `Trainer.save_model()` " + "to preserve ModelOpt states." ) - if getattr(self.quant_args, "quant_cfg", None) is not None and not is_quantized(self.model): - self.args.eval_on_start = True - train_result = None - with suppress(EvalOnlyError): - train_result = super().train(*args, **kwargs) - self.args.eval_on_start = self._original_evaluate_on_start - return train_result + return outputs - def save_model( - self, output_dir: str | None = None, _internal_call: bool = False, *args, **kwargs - ): + def save_model(self, *args, **kwargs): """Save the quantized model.""" - dict_type = ( - str(self.accelerator.state.fsdp_plugin.state_dict_type) if self.is_fsdp_enabled else "" - ) - if not _internal_call and self.is_fsdp_enabled and "SHARDED_STATE_DICT" in dict_type: - # The default save_model in Trainer doesn't save checkpoint with SHARDED_STATE_DICT + FSDP. - # We save the model manually at the end of the training in order to convert the last - # checkpoint from distcp to HF compatible format. - if output_dir is None: - output_dir = self.args.output_dir - save_fsdp_model( - self.accelerator.state.fsdp_plugin, - self.accelerator, - self.model, - output_dir, - ) - self.processing_class.save_pretrained(output_dir) - self.model.config.save_pretrained(output_dir) + if ( + (not self.is_in_train) + and self.is_fsdp_enabled + and self.accelerator.state.fsdp_plugin.state_dict_type != "FULL_STATE_DICT" + ): + print_rank_0("Setting state_dict_type to FULL_STATE_DICT for final checkpoint save.") + original_type = self.accelerator.state.fsdp_plugin.state_dict_type + self.accelerator.state.fsdp_plugin.set_state_dict_type("FULL_STATE_DICT") + outputs = super().save_model(*args, **kwargs) + if torch.distributed.is_initialized(): + torch.distributed.barrier() + if mto.ModeloptStateManager.is_converted(self.accelerator.unwrap_model(self.model)): + print_rank_0( + "Model saved. To restore, call mto.enable_huggingface_checkpointing() first before loading the " + "model. 
See https://nvidia.github.io/TensorRT-Model-Optimizer/reference/generated/modelopt.torch.opt.plugins.huggingface.html#modelopt.torch.opt.plugins.huggingface.enable_huggingface_checkpointing" + ) + self.accelerator.state.fsdp_plugin.set_state_dict_type(original_type) else: - super().save_model(output_dir, _internal_call, *args, **kwargs) + outputs = super().save_model(*args, **kwargs) + return outputs + + def _patch_accelerate_for_fsdp2_fix(self): + """Fixes for accelerate prepare. + + Accelerate fsdp2 prepare assumes that all parameters and buffers are sharded. This assumption + is causing issues with quantized models since quantization modules adds buffers which are not sharded. + This patch hides the buffers added by quantization modules from the original accelerate prepare. + """ + + def _modelopt_prepare(self, *args, **kwargs): + if not self.is_fsdp2: + return self._original_prepare(*args, **kwargs) + + model = next((obj for obj in args if isinstance(obj, torch.nn.Module)), None) + if model is None: + return self._original_prepare(*args, **kwargs) + + tq_og_non_prsist_buffers = {} + for tq in (m for m in model.modules() if isinstance(m, TensorQuantizer)): + tq.to_empty(device=self.device) + tq_og_non_prsist_buffers[tq] = tq._non_persistent_buffers_set.copy() + tq._non_persistent_buffers_set.update(tq._buffers.keys()) + + outputs = self._original_prepare(*args, **kwargs) + + for tq in (m for m in model.modules() if isinstance(m, TensorQuantizer)): + tq._non_persistent_buffers_set.clear() + tq._non_persistent_buffers_set.update(tq_og_non_prsist_buffers[tq]) + + return outputs + + self.accelerator._original_prepare = self.accelerator.prepare + self.accelerator.prepare = types.MethodType(_modelopt_prepare, self.accelerator) class QADTrainer(QATTrainer, KDTrainer): @@ -385,7 +341,7 @@ def __init__( # And memory efficient loading doesn't work. self.model.cuda() if self.quant_cfg is not None and not is_quantized(self.model): - self._quantize_model(use_eval_loop=False) + self._quantize_model() if getattr(self.args, "lora_config", None) is not None: self.model.add_adapter(self.args.lora_config, adapter_name="adapter") print_rank_0("Lora adapter added.") @@ -416,7 +372,9 @@ def save_model( output_dir: The directory to save the model and ModelOpt states. export_student: Whether to export the student model. """ - if "SHARDED_STATE_DICT" in self.fsdp_state_dict_type and self._is_fsdp2: + if self.accelerator.is_fsdp2 and "SHARDED_STATE_DICT" in str( + self.accelerator.state.fsdp_plugin.state_dict_type + ): if export_student: model = self.accelerator.unwrap_model(self.model) model = model.export() diff --git a/modelopt/torch/quantization/utils.py b/modelopt/torch/quantization/utils.py index 09faf58e0..6167daf23 100644 --- a/modelopt/torch/quantization/utils.py +++ b/modelopt/torch/quantization/utils.py @@ -25,7 +25,7 @@ from torch.distributed.fsdp import FSDPModule from torch.distributed.tensor import Replicate -from modelopt.torch.utils import print_rank_0 +from modelopt.torch.utils import get_unwrapped_name, print_rank_0 __all__ = [ "EXPORT_MODE", @@ -441,3 +441,26 @@ def enable_weight_access_and_writeback(module, root_model): with context: yield + + +def get_quantizer_state_dict(model: nn.Module): + """Get the state dict of the quantizers in the model.""" + # We should not call model.state_dict() here. 
+ # With FSDP, model.state_dict() will hang if it is not called from all processes + from .nn import TensorQuantizer + + quantizer_state_dict = {} + for name, module in model.named_modules(): + if isinstance(module, TensorQuantizer): + quantizer_state_dict[get_unwrapped_name(name, model)] = module.state_dict() + return quantizer_state_dict + + +def set_quantizer_state_dict(model: nn.Module, quantizer_state_dict: dict): + """Set the state dict of the quantizers in the model.""" + from .nn import TensorQuantizer + + for name, module in model.named_modules(): + key = get_unwrapped_name(name, model) + if isinstance(module, TensorQuantizer) and key in quantizer_state_dict: + module.load_state_dict(quantizer_state_dict[key]) diff --git a/modelopt/torch/utils/network.py b/modelopt/torch/utils/network.py index 93dbffbdc..1940295c3 100644 --- a/modelopt/torch/utils/network.py +++ b/modelopt/torch/utils/network.py @@ -70,12 +70,19 @@ def _convert_to_wrapped_module_name(name: str) -> str: ] # NOTE: can be extended dynamically in appropriate plugin files if available (e.g. megatron core) -SUPPORTED_WRAPPERS = { +SUPPORTED_WRAPPERS: dict[type[nn.Module], str] = { nn.parallel.DataParallel: "module", # indicating attribute key to unwrap nn.parallel.DistributedDataParallel: "module", + torch.distributed.fsdp.FullyShardedDataParallel: "module", } -UNSUPPORTED_WRAPPERS = {torch.distributed.fsdp.FullyShardedDataParallel: "module"} +try: + from deepspeed.runtime.engine import DeepSpeedEngine +except: # noqa: E722 + DeepSpeedEngine = None + +if DeepSpeedEngine is not None: + SUPPORTED_WRAPPERS[DeepSpeedEngine] = "module" ModelLike = Union[nn.Module, type[nn.Module], tuple, Callable] # noqa: UP007 ConstructorLike = Callable | tuple @@ -430,11 +437,8 @@ def unwrap_model( """Unwrap a model that is wrapped by supported wrapper module or return original model.""" if force_unwrap: try: - if type(model) in SUPPORTED_WRAPPERS or type(model) in UNSUPPORTED_WRAPPERS: - return getattr( - model, - SUPPORTED_WRAPPERS.get(type(model), UNSUPPORTED_WRAPPERS.get(type(model))), # type: ignore [arg-type] - ) + if type(model) in SUPPORTED_WRAPPERS: + return getattr(model, SUPPORTED_WRAPPERS[type(model)]) except AttributeError: raise ValueError( f"Model of type {type(model)} could not be forcefully unwrapped! Please manually" @@ -447,11 +451,6 @@ def unwrap_model( elif warn: warnings.warn(msg or f"Model {model} is wrapped by {type(model)}; unwrapping...") return getattr(model, SUPPORTED_WRAPPERS[type(model)]) - elif type(model) in UNSUPPORTED_WRAPPERS: - raise ValueError( - f"Automatically unwrapping {type(model)} is not supported at this time! Please manually" - " unwrap the model before passing it in." 
- ) return model @@ -597,14 +596,17 @@ def delete_grad_hook(*_unused): return accum_grad, handle -def get_unwrapped_name(name: str) -> str: +def get_unwrapped_name(name: str, model: nn.Module | None = None) -> str: """Get the cleaned module name (i.e, the name before wrapping with sharded modules).""" # The distributed sharded wrappers such as FSDP wraps the child modules as well # So unwrapping just the parent module is not enough # Instead of unwrapping the child modules and changing the model, we can just clean the name # _convert_to_wrapped_module_name is a Pytorch utility function to do this + if isinstance(model, (nn.parallel.DistributedDataParallel, nn.parallel.DataParallel)) or ( + DeepSpeedEngine is not None and isinstance(model, DeepSpeedEngine) + ): + name = name.removeprefix("module.") - # TODO: Implement support for DeepSpeed Zero wrapped modules name = _convert_to_wrapped_module_name(name) name = name.removesuffix(".") return name diff --git a/tests/_test_utils/examples/run_command.py b/tests/_test_utils/examples/run_command.py index 8e6bbb641..cf31ce38c 100644 --- a/tests/_test_utils/examples/run_command.py +++ b/tests/_test_utils/examples/run_command.py @@ -32,9 +32,15 @@ def _extend_cmd_parts(cmd_parts: list[str], **kwargs): return cmd_parts -def run_example_command(cmd_parts: list[str], example_path: str): +def run_example_command(cmd_parts: list[str], example_path: str, setup_free_port: bool = False): print(f"[{example_path}] Running command: {cmd_parts}") - subprocess.run(cmd_parts, cwd=MODELOPT_ROOT / "examples" / example_path, check=True) + env = os.environ.copy() + + if setup_free_port: + free_port = get_free_port() + env["MASTER_PORT"] = str(free_port) + + subprocess.run(cmd_parts, cwd=MODELOPT_ROOT / "examples" / example_path, env=env, check=True) def run_command_in_background(cmd_parts, example_path, stdout=None, stderr=None, text=True): diff --git a/tests/examples/llm_qat/test_llm_qat.py b/tests/examples/llm_qat/test_llm_qat.py index 349824388..c9fef976d 100644 --- a/tests/examples/llm_qat/test_llm_qat.py +++ b/tests/examples/llm_qat/test_llm_qat.py @@ -33,10 +33,16 @@ def _run_command(extra_cmd_args: list[str]): *extra_cmd_args, ], "llm_qat", + setup_free_port=True, ) - -def test_llama_qat_int4w_int8a(tiny_llama_path, tmp_path): +@pytest.mark.parametrize("backend", [ + "fsdp1", + "fsdp2", + "deepspeed", + "ddp", +]) +def test_llama_qat_int4w_int8a(tiny_llama_path, tmp_path, backend): ptq_output_dir = tmp_path / "ptq" qat_output_dir = tmp_path / "qat" @@ -47,6 +53,7 @@ def test_llama_qat_int4w_int8a(tiny_llama_path, tmp_path): "--do_train", "False", "--quant_cfg", "INT4_WEIGHT_INT8_ACTIVATIONS", "--output_dir", ptq_output_dir, + "--backend", backend, ] ) @@ -56,9 +63,27 @@ def test_llama_qat_int4w_int8a(tiny_llama_path, tmp_path): "--model", ptq_output_dir, "--do_train", "True", "--output_dir", qat_output_dir, + "--backend", backend, ] ) +@pytest.mark.parametrize("backend", [ + "fsdp1", + "fsdp2", + "deepspeed", + "ddp", +]) +def test_llama_qat_int4w_int8a_direct_qat(tiny_llama_path, tmp_path, backend): + # Run PTQ + QAT together + _run_command( + [ + "--model", tiny_llama_path, + "--do_train", "True", + "--quant_cfg", "INT4_WEIGHT_INT8_ACTIVATIONS", + "--output_dir", tmp_path, + "--backend", backend, + ] + ) def test_llama_lora_qat_nvfp4(tiny_llama_path, tmp_path): _run_command( From e40fb07394585f895a51dc6b8428b7c750fabc6a Mon Sep 17 00:00:00 2001 From: Riyad Islam Date: Tue, 16 Sep 2025 21:21:06 -0700 Subject: [PATCH 14/27] import fix for torch 2.9 (#315) 
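
As of torch 2.9 (per this fix), the private TorchScript exporter helpers
(``_type_utils`` and ``_globals``) live under
``torch.onnx._internal.torchscript_exporter``, so the affected imports now use a
conditional fallback. A minimal sketch of the pattern applied in this patch:

    if hasattr(torch.onnx, "_type_utils"):
        from torch.onnx import _type_utils  # older torch layout
    else:
        from torch.onnx._internal.torchscript_exporter import _type_utils  # torch 2.9+ layout
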
Signed-off-by: Riyad Islam Signed-off-by: Ye Yu --- modelopt/torch/quantization/export_onnx.py | 12 +++++++++++- .../quantization/nn/modules/tensor_quantizer.py | 6 +++++- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/modelopt/torch/quantization/export_onnx.py b/modelopt/torch/quantization/export_onnx.py index 984dba0b7..fe9bd927b 100644 --- a/modelopt/torch/quantization/export_onnx.py +++ b/modelopt/torch/quantization/export_onnx.py @@ -106,7 +106,7 @@ import onnx import torch -from torch.onnx import _type_utils, symbolic_helper +from torch.onnx import symbolic_helper from torch.onnx import symbolic_helper as sym_help from torch.onnx._internal import jit_utils from torch.onnx.symbolic_opset14 import _attention_scale, _causal_attention_mask @@ -290,6 +290,11 @@ def scaled_dot_product_attention( enable_gqa: bool = False, ): """Perform scaled dot product attention.""" + if hasattr(torch.onnx, "_type_utils"): + from torch.onnx import _type_utils + else: + from torch.onnx._internal.torchscript_exporter import _type_utils + assert (not is_causal) or (is_causal and symbolic_helper._is_none(attn_mask)), ( "is_causal and attn_mask cannot be set at the same time" ) @@ -393,6 +398,11 @@ def export_fp8_mha( """ from torch.onnx.symbolic_opset14 import _attention_scale, _causal_attention_mask + if hasattr(torch.onnx, "_type_utils"): + from torch.onnx import _type_utils + else: + from torch.onnx._internal.torchscript_exporter import _type_utils + # Pass all arguments, including x, to the custom ONNX operator assert (not is_causal) or (is_causal and sym_help._is_none(attn_mask)), ( "is_causal and attn_mask cannot be set at the same time" diff --git a/modelopt/torch/quantization/nn/modules/tensor_quantizer.py b/modelopt/torch/quantization/nn/modules/tensor_quantizer.py index 9846f3554..0635b7c9b 100644 --- a/modelopt/torch/quantization/nn/modules/tensor_quantizer.py +++ b/modelopt/torch/quantization/nn/modules/tensor_quantizer.py @@ -30,7 +30,6 @@ import torch.nn.functional as F from torch import nn -from torch.onnx._globals import GLOBALS from modelopt.torch.utils import standardize_constructor_args from modelopt.torch.utils.distributed import DistributedProcessGroup @@ -879,6 +878,11 @@ def forward(self, inputs): Returns: outputs: A Tensor of type output_dtype """ + if hasattr(torch.onnx, "_globals"): + from torch.onnx._globals import GLOBALS + else: + from torch.onnx._internal.torchscript_exporter._globals import GLOBALS + if DTensor is not None and isinstance(inputs, DTensor): # TensorQuantizer only handles regular non-DTensor inputs device_mesh, placements = inputs.device_mesh, inputs.placements From 8f257177dc80eaa8bb821c16fb65ead33864c574 Mon Sep 17 00:00:00 2001 From: Ye Yu Date: Thu, 18 Sep 2025 09:06:56 -0700 Subject: [PATCH 15/27] checkout main branch's fix for megatron+importer Signed-off-by: Ye Yu --- modelopt/torch/export/plugins/megatron_importer.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/modelopt/torch/export/plugins/megatron_importer.py b/modelopt/torch/export/plugins/megatron_importer.py index 31d748726..696af6323 100644 --- a/modelopt/torch/export/plugins/megatron_importer.py +++ b/modelopt/torch/export/plugins/megatron_importer.py @@ -512,10 +512,7 @@ def _import_state_dict(self): self.rules["k_layernorm"](attention.k_layernorm, layer_id) self.rules["linear_qkv"](attention.linear_qkv, layer_id) self.rules["linear_proj"](attention.linear_proj, layer_id) - if ( - hasattr(attention.core_attention, "softmax_offset") - and 
attention.core_attention.softmax_offset is not None - ): + if hasattr(attention.core_attention, "softmax_offset"): self.rules["softmax_offset"]( attention.core_attention.softmax_offset, layer_id ) From 912e42737bae32eb795628c896154bf593cecd43 Mon Sep 17 00:00:00 2001 From: Ye Yu Date: Thu, 18 Sep 2025 12:44:07 -0700 Subject: [PATCH 16/27] pad labels if it's 1 token shorter than input_ids Signed-off-by: Ye Yu --- .../speculative/plugins/megatron_eagle.py | 38 ++++++++++--------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/modelopt/torch/speculative/plugins/megatron_eagle.py b/modelopt/torch/speculative/plugins/megatron_eagle.py index 4062be8b6..2a0e63a3c 100644 --- a/modelopt/torch/speculative/plugins/megatron_eagle.py +++ b/modelopt/torch/speculative/plugins/megatron_eagle.py @@ -1073,15 +1073,13 @@ def _compute_eagle_loss(self, logits, labels, eagle_logits): """Compute the total loss for EAGLE. logits: [s, b, vocab // TP] - labels: [b, s] or [b, s-1] for offline mode + labels: [b, s] eagle_logits: [s, b, vocab // TP] """ # Compute lm loss (classification loss) or KLDivergence if self.eagle_self_logit_distillation: mapping = self.eagle_module.d2t if hasattr(self.eagle_module, "d2t") else None token_loss = self.kld(eagle_logits[:-1, :, :], logits[1:, :, :], mapping) - elif labels.shape[1] < eagle_logits.shape[0]: - token_loss = self.compute_language_model_loss(labels[:, 1:], eagle_logits[:-2, :, :]) else: token_loss = self.compute_language_model_loss(labels[:, 1:], eagle_logits[:-1, :, :]) @@ -1281,20 +1279,26 @@ def forward( # all eagle weights have been exercised for quantization calibration purpose. if labels is None: return logits_sbh.transpose(0, 1).contiguous() + elif labels.shape[1] == input_ids.shape[1] - 1: + # For offline training, labels may be 1 token shorter than input_ids. + # We will just pad a 0 to the labels to make the seq_len the same as + # input_ids. This will introduce a small error in training if logit_distillation + # is False, and testing accuracy is wrong for the last token. + right_token_pad = torch.zeros( + (labels.shape[0], 1), + dtype=labels.dtype, + device=labels.device, + ) + labels = torch.cat((labels, right_token_pad), dim=-1) # If eagle_freeze_base_model is set to True, # the base model is frozen . 
- if self.eagle_offline: - loss = torch.zeros(input_ids.shape).to(input_ids.device) - else: - loss = self.compute_language_model_loss(labels, logits_sbh) + loss = self.compute_language_model_loss(labels, logits_sbh) loss = 0.0 * loss if self.eagle_config.parallel_draft_step > 1: for i in range(self.eagle_config.parallel_draft_step): - eagle_logits = eagle_logits_0[ - i * logits_sbh.shape[0] : (i + 1) * logits_sbh.shape[0] - ] + eagle_logits = eagle_logits_0[i * labels.shape[1] : (i + 1) * labels.shape[1]] loss_ = self._compute_eagle_loss(logits_sbh, labels, eagle_logits) loss_ = loss_[:, i:] loss[:, i + 1 :] += 1.0 * loss_ @@ -1307,7 +1311,7 @@ def forward( acc = [] with torch.no_grad(): gathered_logits = gather_from_tensor_model_parallel_region( - eagle_logits_0[:-2, :, :] if self.eagle_offline else eagle_logits_0[:-1, :, :] + eagle_logits_0[:-1, :, :] ) eagle_top1 = gathered_logits.transpose(0, 1).argmax(dim=-1) if self.eagle_config.draft_vocab_size != self.eagle_config.vocab_size: @@ -1337,7 +1341,7 @@ def forward( packed_seq_params=packed_seq_params, **(extra_block_kwargs or {}), ) - eagle_logits_1 = eagle_logits_2x[logits_sbh.shape[0] :, :, :] + eagle_logits_1 = eagle_logits_2x[-labels.shape[1] :, :, :] loss_1 = self._compute_eagle_loss(logits_sbh, labels, eagle_logits_1) # [b, s - 2] @@ -1348,7 +1352,7 @@ def forward( acc = [] with torch.no_grad(): gathered_logits = gather_from_tensor_model_parallel_region( - eagle_logits_1[1:-2, :, :] if self.eagle_offline else eagle_logits_1[1:-1, :, :] + eagle_logits_1[1:-1, :, :] ) eagle_top1 = gathered_logits.transpose(0, 1).argmax(dim=-1) if self.eagle_config.draft_vocab_size != self.eagle_config.vocab_size: @@ -1379,7 +1383,7 @@ def forward( **(extra_block_kwargs or {}), ) - eagle_logits_2 = eagle_logits_3x[-logits_sbh.shape[0] :, :, :] + eagle_logits_2 = eagle_logits_3x[-labels.shape[1] :, :, :] loss_2 = self._compute_eagle_loss(logits_sbh, labels, eagle_logits_2) # [b, s - 3] @@ -1390,7 +1394,7 @@ def forward( acc = [] with torch.no_grad(): gathered_logits = gather_from_tensor_model_parallel_region( - eagle_logits_2[2:-2, :, :] if self.eagle_offline else eagle_logits_2[2:-1, :, :] + eagle_logits_2[2:-1, :, :] ) eagle_top1 = gathered_logits.transpose(0, 1).argmax(dim=-1) if self.eagle_config.draft_vocab_size != self.eagle_config.vocab_size: @@ -1421,7 +1425,7 @@ def forward( **(extra_block_kwargs or {}), ) - eagle_logits_3 = eagle_logits_4x[-logits_sbh.shape[0] :, :, :] + eagle_logits_3 = eagle_logits_4x[-labels.shape[1] :, :, :] loss_3 = self._compute_eagle_loss(logits_sbh, labels, eagle_logits_3) # [b, s - 4] @@ -1432,7 +1436,7 @@ def forward( acc = [] with torch.no_grad(): gathered_logits = gather_from_tensor_model_parallel_region( - eagle_logits_3[3:-2, :, :] if self.eagle_offline else eagle_logits_3[3:-1, :, :] + eagle_logits_3[3:-1, :, :] ) eagle_top1 = gathered_logits.transpose(0, 1).argmax(dim=-1) if self.eagle_config.draft_vocab_size != self.eagle_config.vocab_size: From 0582079c2d0fd8b8056453f2b89c4d9eadb9a5f1 Mon Sep 17 00:00:00 2001 From: Ye Yu Date: Thu, 18 Sep 2025 12:44:45 -0700 Subject: [PATCH 17/27] Revert "pad labels if it's 1 token shorter than input_ids" This reverts commit 9450e0d8729368b0149443acc9ec239432c4e5c7. 
Signed-off-by: Ye Yu --- .../speculative/plugins/megatron_eagle.py | 38 +++++++++---------- 1 file changed, 17 insertions(+), 21 deletions(-) diff --git a/modelopt/torch/speculative/plugins/megatron_eagle.py b/modelopt/torch/speculative/plugins/megatron_eagle.py index 2a0e63a3c..4062be8b6 100644 --- a/modelopt/torch/speculative/plugins/megatron_eagle.py +++ b/modelopt/torch/speculative/plugins/megatron_eagle.py @@ -1073,13 +1073,15 @@ def _compute_eagle_loss(self, logits, labels, eagle_logits): """Compute the total loss for EAGLE. logits: [s, b, vocab // TP] - labels: [b, s] + labels: [b, s] or [b, s-1] for offline mode eagle_logits: [s, b, vocab // TP] """ # Compute lm loss (classification loss) or KLDivergence if self.eagle_self_logit_distillation: mapping = self.eagle_module.d2t if hasattr(self.eagle_module, "d2t") else None token_loss = self.kld(eagle_logits[:-1, :, :], logits[1:, :, :], mapping) + elif labels.shape[1] < eagle_logits.shape[0]: + token_loss = self.compute_language_model_loss(labels[:, 1:], eagle_logits[:-2, :, :]) else: token_loss = self.compute_language_model_loss(labels[:, 1:], eagle_logits[:-1, :, :]) @@ -1279,26 +1281,20 @@ def forward( # all eagle weights have been exercised for quantization calibration purpose. if labels is None: return logits_sbh.transpose(0, 1).contiguous() - elif labels.shape[1] == input_ids.shape[1] - 1: - # For offline training, labels may be 1 token shorter than input_ids. - # We will just pad a 0 to the labels to make the seq_len the same as - # input_ids. This will introduce a small error in training if logit_distillation - # is False, and testing accuracy is wrong for the last token. - right_token_pad = torch.zeros( - (labels.shape[0], 1), - dtype=labels.dtype, - device=labels.device, - ) - labels = torch.cat((labels, right_token_pad), dim=-1) # If eagle_freeze_base_model is set to True, # the base model is frozen . 
- loss = self.compute_language_model_loss(labels, logits_sbh) + if self.eagle_offline: + loss = torch.zeros(input_ids.shape).to(input_ids.device) + else: + loss = self.compute_language_model_loss(labels, logits_sbh) loss = 0.0 * loss if self.eagle_config.parallel_draft_step > 1: for i in range(self.eagle_config.parallel_draft_step): - eagle_logits = eagle_logits_0[i * labels.shape[1] : (i + 1) * labels.shape[1]] + eagle_logits = eagle_logits_0[ + i * logits_sbh.shape[0] : (i + 1) * logits_sbh.shape[0] + ] loss_ = self._compute_eagle_loss(logits_sbh, labels, eagle_logits) loss_ = loss_[:, i:] loss[:, i + 1 :] += 1.0 * loss_ @@ -1311,7 +1307,7 @@ def forward( acc = [] with torch.no_grad(): gathered_logits = gather_from_tensor_model_parallel_region( - eagle_logits_0[:-1, :, :] + eagle_logits_0[:-2, :, :] if self.eagle_offline else eagle_logits_0[:-1, :, :] ) eagle_top1 = gathered_logits.transpose(0, 1).argmax(dim=-1) if self.eagle_config.draft_vocab_size != self.eagle_config.vocab_size: @@ -1341,7 +1337,7 @@ def forward( packed_seq_params=packed_seq_params, **(extra_block_kwargs or {}), ) - eagle_logits_1 = eagle_logits_2x[-labels.shape[1] :, :, :] + eagle_logits_1 = eagle_logits_2x[logits_sbh.shape[0] :, :, :] loss_1 = self._compute_eagle_loss(logits_sbh, labels, eagle_logits_1) # [b, s - 2] @@ -1352,7 +1348,7 @@ def forward( acc = [] with torch.no_grad(): gathered_logits = gather_from_tensor_model_parallel_region( - eagle_logits_1[1:-1, :, :] + eagle_logits_1[1:-2, :, :] if self.eagle_offline else eagle_logits_1[1:-1, :, :] ) eagle_top1 = gathered_logits.transpose(0, 1).argmax(dim=-1) if self.eagle_config.draft_vocab_size != self.eagle_config.vocab_size: @@ -1383,7 +1379,7 @@ def forward( **(extra_block_kwargs or {}), ) - eagle_logits_2 = eagle_logits_3x[-labels.shape[1] :, :, :] + eagle_logits_2 = eagle_logits_3x[-logits_sbh.shape[0] :, :, :] loss_2 = self._compute_eagle_loss(logits_sbh, labels, eagle_logits_2) # [b, s - 3] @@ -1394,7 +1390,7 @@ def forward( acc = [] with torch.no_grad(): gathered_logits = gather_from_tensor_model_parallel_region( - eagle_logits_2[2:-1, :, :] + eagle_logits_2[2:-2, :, :] if self.eagle_offline else eagle_logits_2[2:-1, :, :] ) eagle_top1 = gathered_logits.transpose(0, 1).argmax(dim=-1) if self.eagle_config.draft_vocab_size != self.eagle_config.vocab_size: @@ -1425,7 +1421,7 @@ def forward( **(extra_block_kwargs or {}), ) - eagle_logits_3 = eagle_logits_4x[-labels.shape[1] :, :, :] + eagle_logits_3 = eagle_logits_4x[-logits_sbh.shape[0] :, :, :] loss_3 = self._compute_eagle_loss(logits_sbh, labels, eagle_logits_3) # [b, s - 4] @@ -1436,7 +1432,7 @@ def forward( acc = [] with torch.no_grad(): gathered_logits = gather_from_tensor_model_parallel_region( - eagle_logits_3[3:-1, :, :] + eagle_logits_3[3:-2, :, :] if self.eagle_offline else eagle_logits_3[3:-1, :, :] ) eagle_top1 = gathered_logits.transpose(0, 1).argmax(dim=-1) if self.eagle_config.draft_vocab_size != self.eagle_config.vocab_size: From ca7ec418848b1e4c29b2cd4a2ceb69e5294a7eaa Mon Sep 17 00:00:00 2001 From: Ye Yu Date: Thu, 18 Sep 2025 12:46:36 -0700 Subject: [PATCH 18/27] pad labels if it's 1 token shorter than input_ids Signed-off-by: Ye Yu --- .../speculative/plugins/megatron_eagle.py | 38 ++++++++++--------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/modelopt/torch/speculative/plugins/megatron_eagle.py b/modelopt/torch/speculative/plugins/megatron_eagle.py index 4062be8b6..2a0e63a3c 100644 --- a/modelopt/torch/speculative/plugins/megatron_eagle.py +++ 
b/modelopt/torch/speculative/plugins/megatron_eagle.py @@ -1073,15 +1073,13 @@ def _compute_eagle_loss(self, logits, labels, eagle_logits): """Compute the total loss for EAGLE. logits: [s, b, vocab // TP] - labels: [b, s] or [b, s-1] for offline mode + labels: [b, s] eagle_logits: [s, b, vocab // TP] """ # Compute lm loss (classification loss) or KLDivergence if self.eagle_self_logit_distillation: mapping = self.eagle_module.d2t if hasattr(self.eagle_module, "d2t") else None token_loss = self.kld(eagle_logits[:-1, :, :], logits[1:, :, :], mapping) - elif labels.shape[1] < eagle_logits.shape[0]: - token_loss = self.compute_language_model_loss(labels[:, 1:], eagle_logits[:-2, :, :]) else: token_loss = self.compute_language_model_loss(labels[:, 1:], eagle_logits[:-1, :, :]) @@ -1281,20 +1279,26 @@ def forward( # all eagle weights have been exercised for quantization calibration purpose. if labels is None: return logits_sbh.transpose(0, 1).contiguous() + elif labels.shape[1] == input_ids.shape[1] - 1: + # For offline training, labels may be 1 token shorter than input_ids. + # We will just pad a 0 to the labels to make the seq_len the same as + # input_ids. This will introduce a small error in training if logit_distillation + # is False, and testing accuracy is wrong for the last token. + right_token_pad = torch.zeros( + (labels.shape[0], 1), + dtype=labels.dtype, + device=labels.device, + ) + labels = torch.cat((labels, right_token_pad), dim=-1) # If eagle_freeze_base_model is set to True, # the base model is frozen . - if self.eagle_offline: - loss = torch.zeros(input_ids.shape).to(input_ids.device) - else: - loss = self.compute_language_model_loss(labels, logits_sbh) + loss = self.compute_language_model_loss(labels, logits_sbh) loss = 0.0 * loss if self.eagle_config.parallel_draft_step > 1: for i in range(self.eagle_config.parallel_draft_step): - eagle_logits = eagle_logits_0[ - i * logits_sbh.shape[0] : (i + 1) * logits_sbh.shape[0] - ] + eagle_logits = eagle_logits_0[i * labels.shape[1] : (i + 1) * labels.shape[1]] loss_ = self._compute_eagle_loss(logits_sbh, labels, eagle_logits) loss_ = loss_[:, i:] loss[:, i + 1 :] += 1.0 * loss_ @@ -1307,7 +1311,7 @@ def forward( acc = [] with torch.no_grad(): gathered_logits = gather_from_tensor_model_parallel_region( - eagle_logits_0[:-2, :, :] if self.eagle_offline else eagle_logits_0[:-1, :, :] + eagle_logits_0[:-1, :, :] ) eagle_top1 = gathered_logits.transpose(0, 1).argmax(dim=-1) if self.eagle_config.draft_vocab_size != self.eagle_config.vocab_size: @@ -1337,7 +1341,7 @@ def forward( packed_seq_params=packed_seq_params, **(extra_block_kwargs or {}), ) - eagle_logits_1 = eagle_logits_2x[logits_sbh.shape[0] :, :, :] + eagle_logits_1 = eagle_logits_2x[-labels.shape[1] :, :, :] loss_1 = self._compute_eagle_loss(logits_sbh, labels, eagle_logits_1) # [b, s - 2] @@ -1348,7 +1352,7 @@ def forward( acc = [] with torch.no_grad(): gathered_logits = gather_from_tensor_model_parallel_region( - eagle_logits_1[1:-2, :, :] if self.eagle_offline else eagle_logits_1[1:-1, :, :] + eagle_logits_1[1:-1, :, :] ) eagle_top1 = gathered_logits.transpose(0, 1).argmax(dim=-1) if self.eagle_config.draft_vocab_size != self.eagle_config.vocab_size: @@ -1379,7 +1383,7 @@ def forward( **(extra_block_kwargs or {}), ) - eagle_logits_2 = eagle_logits_3x[-logits_sbh.shape[0] :, :, :] + eagle_logits_2 = eagle_logits_3x[-labels.shape[1] :, :, :] loss_2 = self._compute_eagle_loss(logits_sbh, labels, eagle_logits_2) # [b, s - 3] @@ -1390,7 +1394,7 @@ def forward( acc = [] with 
torch.no_grad(): gathered_logits = gather_from_tensor_model_parallel_region( - eagle_logits_2[2:-2, :, :] if self.eagle_offline else eagle_logits_2[2:-1, :, :] + eagle_logits_2[2:-1, :, :] ) eagle_top1 = gathered_logits.transpose(0, 1).argmax(dim=-1) if self.eagle_config.draft_vocab_size != self.eagle_config.vocab_size: @@ -1421,7 +1425,7 @@ def forward( **(extra_block_kwargs or {}), ) - eagle_logits_3 = eagle_logits_4x[-logits_sbh.shape[0] :, :, :] + eagle_logits_3 = eagle_logits_4x[-labels.shape[1] :, :, :] loss_3 = self._compute_eagle_loss(logits_sbh, labels, eagle_logits_3) # [b, s - 4] @@ -1432,7 +1436,7 @@ def forward( acc = [] with torch.no_grad(): gathered_logits = gather_from_tensor_model_parallel_region( - eagle_logits_3[3:-2, :, :] if self.eagle_offline else eagle_logits_3[3:-1, :, :] + eagle_logits_3[3:-1, :, :] ) eagle_top1 = gathered_logits.transpose(0, 1).argmax(dim=-1) if self.eagle_config.draft_vocab_size != self.eagle_config.vocab_size: From a6a080569244fbc555a942d09b5a14ec13db41bc Mon Sep 17 00:00:00 2001 From: yueshen2016 <39203804+yueshen2016@users.noreply.github.com> Date: Wed, 17 Sep 2025 13:25:38 -0700 Subject: [PATCH 19/27] Fix issues of attention.core_attention.softmax_offset is None (#330) Signed-off-by: Yue Signed-off-by: Ye Yu --- modelopt/torch/export/plugins/megatron_importer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modelopt/torch/export/plugins/megatron_importer.py b/modelopt/torch/export/plugins/megatron_importer.py index 696af6323..4c805dc01 100644 --- a/modelopt/torch/export/plugins/megatron_importer.py +++ b/modelopt/torch/export/plugins/megatron_importer.py @@ -512,7 +512,7 @@ def _import_state_dict(self): self.rules["k_layernorm"](attention.k_layernorm, layer_id) self.rules["linear_qkv"](attention.linear_qkv, layer_id) self.rules["linear_proj"](attention.linear_proj, layer_id) - if hasattr(attention.core_attention, "softmax_offset"): + if getattr(attention.core_attention, "softmax_offset", None) is not None: self.rules["softmax_offset"]( attention.core_attention.softmax_offset, layer_id ) From 0e01a5d60cd35d8f95552a4318939d60de768c9f Mon Sep 17 00:00:00 2001 From: h-guo18 <67671475+h-guo18@users.noreply.github.com> Date: Wed, 17 Sep 2025 14:17:50 -0700 Subject: [PATCH 20/27] update eagle example notebook (#314) Signed-off-by: h-guo18 <67671475+h-guo18@users.noreply.github.com> Signed-off-by: Ye Yu --- examples/speculative_decoding/example.ipynb | 456 +++++++++++++------- 1 file changed, 302 insertions(+), 154 deletions(-) diff --git a/examples/speculative_decoding/example.ipynb b/examples/speculative_decoding/example.ipynb index 4278f0e22..e9a84a059 100644 --- a/examples/speculative_decoding/example.ipynb +++ b/examples/speculative_decoding/example.ipynb @@ -4,33 +4,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Synthesize data for speculative decoding training\n", - "\n", - "The speculative decoding medule needs to learn to predict tokens from the base model. Therefore, we need to prepare the data generated from the base model.\n", - "Note: if the target base model is a quantized version, the synthesized data should be generated using the quantized model." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "First, quantize the base model (Llama-3.2-1B-Instruct) into FP8 and export to unified export format." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!python llm_ptq/hf_ptq.py --pyt_ckpt_path meta-llama/Llama-3.2-1B-Instruct --qformat fp8 --batch_size 1 --export_path /tmp/llama3.2_1B_fp8 --export_fmt hf" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Next, download the Daring-Anteater dataset." + "## Prepare Data\n", + "In this example, we use the Daring-Anteater dataset. For improved accuracy, please refer to the [Data Synthesis Section](https://github.com/NVIDIA/TensorRT-Model-Optimizer/tree/main/examples/speculative_decoding#optional-data-synthesis) in the README." ] }, { @@ -46,41 +21,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Then, launch an inference server that will run the quantized base model." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!vllm serve /tmp/llama3.2_1B_fp8 --api-key token-abc123 --port 8000 --tensor-parallel-size 1 --quantization=modelopt" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Open a new terminal and adapt the fine-tuning data by calling this server.\n", - "Note: this may take a long time." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!mkdir /tmp/finetune\n", - "!bash prepare_data.sh --data_path /tmp/Daring-Anteater/train.jsonl --output_path /tmp/finetune/data.jsonl --max_token 2048" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's load the base model and convert it to EAGLE Model" + "## Convert Model for Speculative Decoding\n", + "Here, we'll adapt our base model for speculative decoding by attaching a smaller EAGLE module. The upcoming code first loads meta-llama/Llama-3.2-1B as our base model and then configures the new draft module. To ensure compatibility, the draft module's dimensions must match the target model. Finally, the modelopt toolkit attaches this new, untrained module, leaving us with a combined model that is ready for the training phase later." 
] }, { @@ -93,28 +35,49 @@ "\n", "import modelopt.torch.opt as mto\n", "import modelopt.torch.speculative as mtsp\n", + "from modelopt.torch.speculative.config import EAGLE3_DEFAULT_CFG\n", "\n", "mto.enable_huggingface_checkpointing()\n", "\n", + "# Load original HF model\n", + "base_model = \"meta-llama/Llama-3.2-1B\"\n", "model = transformers.AutoModelForCausalLM.from_pretrained(\n", - " \"meta-llama/Llama-3.2-1B-Instruct\", torch_dtype=\"auto\"\n", + " base_model, torch_dtype=\"auto\", device_map=\"cuda\"\n", ")\n", - "config = {\n", - " \"eagle_num_layers\": 1,\n", - " \"use_input_layernorm_in_first_layer\": True,\n", - " \"use_last_layernorm\": False,\n", - "}\n", + "\n", + "# Read Default Config for EAGLE3\n", + "config = EAGLE3_DEFAULT_CFG[\"config\"]\n", + "\n", + "# Hidden size and vocab size must match base model\n", + "config[\"eagle_architecture_config\"].update(\n", + " {\n", + " \"hidden_size\": model.config.hidden_size,\n", + " \"vocab_size\": model.config.vocab_size,\n", + " \"draft_vocab_size\": model.config.vocab_size,\n", + " \"max_position_embeddings\": model.config.max_position_embeddings,\n", + " }\n", + ")\n", + "\n", + "# Convert Model for eagle speculative decoding\n", "mtsp.convert(model, [(\"eagle\", config)])\n", "\n", - "tokenizer = transformers.AutoTokenizer.from_pretrained(\"meta-llama/Llama-3.2-1B-Instruct\")\n", - "tokenizer.pad_token_id = tokenizer.eos_token_id" + "# Prepare Tokenizer\n", + "tokenizer = transformers.AutoTokenizer.from_pretrained(base_model, model_max_length=1024)\n", + "tokenizer.pad_token_id = tokenizer.eos_token_id\n", + "if tokenizer.chat_template is None:\n", + " tokenizer.chat_template = (\n", + " \"{%- for message in messages %}\"\n", + " \"{{- '<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>' + '\\n' }}\"\n", + " \"{%- endfor %}\"\n", + " )" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Once synthesized data is ready, we can start training the eagle model." + "## Train Draft module On Daring-Anteater\n", + "We will fine-tune the draft module on the Daring-Anteater dataset using the standard Hugging Face Trainer. Note that only the draft module's weights are updated during this process; the original target model remains frozen. After training, our speculative decoding model will be ready for export and deployment. Note that the time to train will be significantly dependent on the epochs (default=4) and the hardware being used." ] }, { @@ -126,10 +89,10 @@ "import json\n", "from dataclasses import dataclass, field\n", "\n", - "from speculative_decoding.eagle_utils import DataCollatorWithPadding, LazySupervisedDataset\n", + "from eagle_utils import DataCollatorWithPadding, LazySupervisedDataset\n", "from transformers import Trainer\n", "\n", - "with open(\"/tmp/finetune/data.jsonl\") as f:\n", + "with open(\"/tmp/Daring-Anteater/train.jsonl\") as f:\n", " data_json = [json.loads(line) for line in f]\n", "train_dataset = LazySupervisedDataset(data_json[: int(len(data_json) * 0.95)], tokenizer=tokenizer)\n", "eval_dataset = LazySupervisedDataset(data_json[int(len(data_json) * 0.95) :], tokenizer=tokenizer)\n", @@ -137,22 +100,13 @@ "\n", "@dataclass\n", "class TrainingArguments(transformers.TrainingArguments):\n", - " cache_dir: str | None = field(default=None)\n", - " model_max_length: int = field(\n", - " default=4096,\n", - " metadata={\n", - " \"help\": (\n", - " \"Maximum sequence length. 
Sequences will be right padded (and possibly truncated).\"\n", - " )\n", - " },\n", - " )\n", " dataloader_drop_last: bool = field(default=True)\n", " bf16: bool = field(default=True)\n", "\n", "\n", "training_args = TrainingArguments(\n", " output_dir=\"/tmp/eagle_bf16\",\n", - " num_train_epochs=1.0,\n", + " num_train_epochs=4,\n", " per_device_train_batch_size=1,\n", " per_device_eval_batch_size=1,\n", ")\n", @@ -166,25 +120,47 @@ ")\n", "trainer._move_model_to_device(model, trainer.args.device)\n", "\n", - "# Manually enable this to return loss in eval\n", - "trainer.can_return_loss = True\n", "# Make sure label_smoother is None\n", "assert trainer.label_smoother is None, \"label_smoother is not supported in speculative decoding!\"\n", "\n", "trainer.train()\n", "trainer.save_state()\n", "trainer.save_model(training_args.output_dir)\n", - "tokenizer.save_pretrained(training_args.output_dir)\n", + "tokenizer.save_pretrained(training_args.output_dir)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Export Model Checkpoint\n", + "To deploy this model, we need to first export it to a Unified checkpoint." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from modelopt.torch.export import export_hf_checkpoint\n", "\n", - "metrics = trainer.evaluate()\n", - "print(f\"Evaluation results: \\n{metrics}\")" + "model.eval()\n", + "export_hf_checkpoint(\n", + " model,\n", + " export_dir=\"/tmp/hf_ckpt\",\n", + ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Now we have a EAGLE model in BF16 format. Next, we quantize this model into FP8 (PTQ)." + "## Deploying on TensorRT-LLM\n", + "\n", + "Here we show an example to deploy on TRT-LLM with `trtllm-serve` and [TRT-LLM container](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/release). See [Deployment](https://github.com/NVIDIA/TensorRT-Model-Optimizer/tree/main/examples/speculative_decoding#deployment) section for more info. \n", + "\n", + "First, we dump the `trtllm-serve` command and config file we need to `/tmp` folder." 
] }, { @@ -193,42 +169,45 @@ "metadata": {}, "outputs": [], "source": [ - "import modelopt.torch.quantization as mtq\n", - "import modelopt.torch.utils.dataset_utils as dataset_utils\n", + "trtllm_serve_script = f\"\"\"trtllm-serve {base_model} \\\\\n", + " --host 0.0.0.0 \\\\\n", + " --port 8000 \\\\\n", + " --backend pytorch \\\\\n", + " --max_batch_size 32 \\\\\n", + " --max_num_tokens 8192 \\\\\n", + " --max_seq_len 8192 \\\\\n", + " --extra_llm_api_options /tmp/extra-llm-api-config.yml\n", + "\"\"\"\n", "\n", - "mto.enable_huggingface_checkpointing()\n", + "extra_llm_api_config = \"\"\"enable_attention_dp: false\n", + "disable_overlap_scheduler: true\n", + "enable_autotuner: false\n", "\n", - "model = transformers.AutoModelForCausalLM.from_pretrained(\"/tmp/eagle_bf16\")\n", - "tokenizer = transformers.AutoTokenizer.from_pretrained(\"/tmp/eagle_bf16\")\n", + "cuda_graph_config:\n", + " max_batch_size: 1\n", "\n", - "calib_dataloader = dataset_utils.get_dataset_dataloader(\n", - " dataset_name=\"cnn_dailymail\",\n", - " tokenizer=tokenizer,\n", - " batch_size=1,\n", - " num_samples=512,\n", - " device=model.device,\n", - " include_labels=False,\n", - ")\n", + "speculative_config:\n", + " decoding_type: Eagle\n", + " max_draft_len: 3\n", + " speculative_model_dir: /tmp/hf_ckpt\n", "\n", - "quant_cfg = getattr(mtq, \"FP8_DEFAULT_CFG\")\n", - "quant_cfg[\"quant_cfg\"][\"*output_quantizer\"] = {\n", - " \"num_bits\": (4, 3),\n", - " \"axis\": None,\n", - " \"enable\": True,\n", - "}\n", + "kv_cache_config:\n", + " enable_block_reuse: false\n", + "\"\"\"\n", "\n", - "calibrate_loop = dataset_utils.create_forward_loop(calib_dataloader, dataloader=calib_dataloader)\n", - "model = mtq.quantize(model, quant_cfg, forward_loop=calibrate_loop)\n", - "mtq.print_quant_summary(model)\n", + "# Dump the two scripts into /tmp\n", + "with open(\"/tmp/trtllm_serve.sh\", \"w\") as f:\n", + " f.write(trtllm_serve_script)\n", "\n", - "model.save_pretrained(\"/tmp/eagle_fp8_ptq\")" + "with open(\"/tmp/extra-llm-api-config.yml\", \"w\") as f:\n", + " f.write(extra_llm_api_config)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "To maintain the accuracy, we need to finetune the model (QAT)." 
+ "Next, we start a TRT-LLM container in the background and run `trtllm-serve` inside it, using our exported checkpoint and the configuration scripts we just created:" ] }, { @@ -237,36 +216,62 @@ "metadata": {}, "outputs": [], "source": [ - "training_args.output_dir = \"/tmp/eagle_fp8_qat\"\n", - "trainer = Trainer(\n", - " model=model,\n", - " tokenizer=tokenizer,\n", - " args=training_args,\n", - " train_dataset=train_dataset,\n", - " eval_dataset=eval_dataset,\n", - " data_collator=DataCollatorWithPadding(),\n", + "import subprocess\n", + "import threading\n", + "\n", + "# Generate a unique container name so we can stop/remove it later\n", + "container_name = \"trtllm_serve_spec\"\n", + "\n", + "docker_cmd = [\n", + " \"docker\",\n", + " \"run\",\n", + " \"--rm\",\n", + " \"--net\",\n", + " \"host\",\n", + " \"--shm-size=2g\",\n", + " \"--ulimit\",\n", + " \"memlock=-1\",\n", + " \"--ulimit\",\n", + " \"stack=67108864\",\n", + " \"--gpus\",\n", + " \"all\",\n", + " \"-v\",\n", + " \"/tmp:/tmp\",\n", + " \"--name\",\n", + " container_name,\n", + " \"nvcr.io/nvidia/tensorrt-llm/release:1.1.0rc2\",\n", + " \"bash\",\n", + " \"-c\",\n", + " \"bash /tmp/trtllm_serve.sh\",\n", + "]\n", + "\n", + "# print docker outputs\n", + "proc = subprocess.Popen(\n", + " docker_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, bufsize=1\n", ")\n", - "trainer._move_model_to_device(model, trainer.args.device)\n", "\n", - "# Manually enable this to return loss in eval\n", - "trainer.can_return_loss = True\n", - "# Make sure label_smoother is None\n", - "assert trainer.label_smoother is None, \"label_smoother is not supported in speculative decoding!\"\n", "\n", - "trainer.train()\n", - "trainer.save_state()\n", - "trainer.save_model(training_args.output_dir)\n", - "tokenizer.save_pretrained(training_args.output_dir)\n", + "def stream_output(pipe):\n", + " for line in iter(pipe.readline, \"\"):\n", + " print(line, end=\"\")\n", + "\n", + "\n", + "# Use thread to print outputs\n", + "thread = threading.Thread(target=stream_output, args=(proc.stdout,))\n", + "thread.daemon = True\n", + "thread.start()\n", "\n", - "metrics = trainer.evaluate()\n", - "print(f\"Evaluation results: \\n{metrics}\")" + "print(\n", + " f\"Starting trtllm-serve in Docker (PID: {proc.pid}, container name: {container_name}) in the background:\"\n", + ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "To deploy this model, we need to first export it to a Unified checkpoint." + "Please wait for the service to fully start inside the container. 
\n", + "Once you see the message `INFO: Application startup complete.`, you can proceed to send requests to the service:" ] }, { @@ -275,24 +280,84 @@ "metadata": {}, "outputs": [], "source": [ - "from accelerate.hooks import remove_hook_from_module\n", + "import json\n", "\n", - "from modelopt.torch.export import export_hf_checkpoint\n", + "import requests\n", "\n", - "# Move meta tensor back to device before exporting.\n", - "remove_hook_from_module(model, recurse=True)\n", + "payload = {\n", + " \"model\": base_model,\n", + " \"messages\": [\n", + " {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n", + " {\"role\": \"user\", \"content\": \"Tell me about speculative decoding.\"},\n", + " ],\n", + " \"max_tokens\": 512,\n", + " \"temperature\": 0,\n", + " \"chat_template\": tokenizer.chat_template,\n", + "}\n", + "headers = {\"Content-Type\": \"application/json\", \"Accept\": \"application/json\"}\n", "\n", - "export_hf_checkpoint(\n", - " model,\n", - " export_dir=\"/tmp/hf_ckpt\",\n", - ")" + "response = requests.post(\n", + " \"http://localhost:8000/v1/chat/completions\", headers=headers, data=json.dumps(payload)\n", + ")\n", + "output = response.json()\n", + "\n", + "print(output)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, we clean up the container we created." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!docker rm -f trtllm_serve_spec" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Deploying on SGLang\n", + "Here, we deploy our trained model using SGLang. The following code defines the command needed to run the SGLang server with our specific configuration for speculative decoding." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# SGLang server launch command shell script\n", + "sglang_serve_script = f\"\"\"python3 -m sglang.launch_server \\\\\n", + " --model {base_model} \\\\\n", + " --host 0.0.0.0 \\\\\n", + " --port 30000 \\\\\n", + " --speculative-algorithm EAGLE3 \\\\\n", + " --speculative-eagle-topk 8 \\\\\n", + " --speculative-draft-model-path /tmp/hf_ckpt \\\\\n", + " --speculative-num-draft-tokens 3 \\\\\n", + " --speculative-num-steps 3 \\\\\n", + " --mem-fraction 0.6 \\\\\n", + " --cuda-graph-max-bs 2 \\\\\n", + " --dtype float16\n", + "\"\"\"\n", + "\n", + "with open(\"/tmp/sglang_serve.sh\", \"w\") as f:\n", + " f.write(sglang_serve_script)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Then convert the Unified ckeckpoint to TRTLLM checkpoint." + "Launch the SGLang server inside a Docker container as a background process." ] }, { @@ -301,14 +366,67 @@ "metadata": {}, "outputs": [], "source": [ - "!python TensorRT-LLM/examples/eagle/convert_checkpoint.py --model_dir /tmp/hf_ckpt --output_dir /tmp/trtllm_ckpt --num_eagle_layers 5 --max_non_leaves_per_layer 4 --max_draft_len 25 --dtype float16" + "import os\n", + "import subprocess\n", + "import threading\n", + "\n", + "container_name = \"sglang_serve_spec\"\n", + "home_dir = os.path.expanduser(\"~\")\n", + "hf_cache_dir = os.path.join(home_dir, \".cache\", \"huggingface\")\n", + "\n", + "# Ensure the Hugging Face cache directory exists. 
This directory should exist as ~/.cache/huggingface, when the model files for meta-llama/Llama-3.2-1B were downloaded earlier.\n", + "os.makedirs(hf_cache_dir, exist_ok=True)\n", + "\n", + "docker_cmd = [\n", + " \"docker\",\n", + " \"run\",\n", + " \"--rm\",\n", + " \"--net\",\n", + " \"host\",\n", + " \"--shm-size=32g\",\n", + " \"--gpus\",\n", + " \"all\",\n", + " \"-v\",\n", + " f\"{hf_cache_dir}:/root/.cache/huggingface\",\n", + " \"-v\",\n", + " \"/tmp:/tmp\",\n", + " \"--ipc=host\",\n", + " \"--name\",\n", + " container_name,\n", + " \"lmsysorg/sglang:latest\",\n", + " \"bash\",\n", + " \"-c\",\n", + " \"bash /tmp/sglang_serve.sh\",\n", + "]\n", + "\n", + "# Launch the Docker container\n", + "proc = subprocess.Popen(\n", + " docker_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, bufsize=1\n", + ")\n", + "\n", + "\n", + "# Stream the process output\n", + "def stream_output(pipe):\n", + " for line in iter(pipe.readline, \"\"):\n", + " print(line, end=\"\")\n", + "\n", + "\n", + "# Use a thread to stream the output in without blocking the notebook\n", + "thread = threading.Thread(target=stream_output, args=(proc.stdout,))\n", + "thread.daemon = True\n", + "thread.start()\n", + "\n", + "print(\n", + " f\"Starting SGLang server in Docker (PID: {proc.pid}, container name: {container_name}) in the background:\"\n", + ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Last, build a TensorRT-LLM engine." + "As with TRT-LLM, please wait for the service to fully start inside the container. \n", + "Once you see the message `INFO: Application startup complete.`, you can proceed to send requests to the service:" ] }, { @@ -317,14 +435,35 @@ "metadata": {}, "outputs": [], "source": [ - "!trtllm-build --checkpoint_dir /tmp/trtllm_ckpt --output_dir /tmp/trtllm_engine --gemm_plugin float16 --use_paged_context_fmha enable --speculative_decoding_mode eagle --max_batch_size 4" + "import json\n", + "\n", + "import requests\n", + "\n", + "payload = {\n", + " \"model\": base_model,\n", + " \"messages\": [\n", + " {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n", + " {\"role\": \"user\", \"content\": \"Tell me about speculative decoding.\"},\n", + " ],\n", + " \"max_tokens\": 512,\n", + " \"temperature\": 0,\n", + "}\n", + "headers = {\"Content-Type\": \"application/json\", \"Accept\": \"application/json\"}\n", + "\n", + "# Send request to the SGLang server\n", + "response = requests.post(\n", + " \"http://localhost:30000/v1/chat/completions\", headers=headers, data=json.dumps(payload)\n", + ")\n", + "output = response.json()\n", + "\n", + "print(output)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "To run the EAGLE engine, please refer to [TensorRT-LLM/examples/eagle](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/eagle):" + "Clean up the container" ] }, { @@ -333,18 +472,27 @@ "metadata": {}, "outputs": [], "source": [ - "!python ../run.py --engine_dir /tmp/trtllm_engine \\\n", - " --tokenizer_dir /tmp/eagle_fp8_qat \\\n", - " --max_output_len=100 \\\n", - " --eagle_choices=\"[[0],[1],[2],[3],[0,0],[0,1],[0,2],[1,0],[1,1],[2,0],[2,1],[3,0],[0,0,0],[0,0,1],[0,0,2],[0,1,0],[0,1,1],[0,2,0],[0,2,1],[1,0,0],[0,0,0,0],[0,0,0,1],[0,0,0,2],[0,0,0,0,0],[0,0,0,0,1]]\" \\\n", - " --temperature 1.0 \\\n", - " --input_text \"Once upon\"" + "!docker rm -f sglang_serve_spec" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Deploying on vLLM (Coming Soon)" + ] + }, + { + "cell_type": "markdown", + 
"metadata": {}, + "source": [ + "While vLLM is another extremely popular, high-performance inference server, direct support for speculative decoding with this demo notebook is still under active development. This notebook will be updated once deployment is possible." ] } ], "metadata": { "kernelspec": { - "display_name": "py312", + "display_name": "modelopt+vllm", "language": "python", "name": "python3" }, @@ -358,7 +506,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.3" + "version": "3.12.0" } }, "nbformat": 4, From 5f6bedcbe5aeb3251a9c08d2c9d6cb369898888e Mon Sep 17 00:00:00 2001 From: Asha Anoosheh Date: Wed, 17 Sep 2025 23:52:02 +0200 Subject: [PATCH 21/27] Allow KD loss in val mode for MLM plugin (#331) Signed-off-by: Asha Anoosheh Signed-off-by: Ye Yu --- modelopt/torch/distill/plugins/megatron.py | 27 ++++------------------ 1 file changed, 4 insertions(+), 23 deletions(-) diff --git a/modelopt/torch/distill/plugins/megatron.py b/modelopt/torch/distill/plugins/megatron.py index 6e712fcbb..7078cca36 100644 --- a/modelopt/torch/distill/plugins/megatron.py +++ b/modelopt/torch/distill/plugins/megatron.py @@ -532,10 +532,6 @@ def _set_input_tensor(self, input_tensors: list[Tensor]): # HACK: Concatenate output tensors when PP>1 so they can be passed between ranks. def _forward(self, *args, **kwargs): - if not self.training: - with self.only_student_forward(): - return type(self).forward(self, *args, **kwargs) - with torch.no_grad(): self._teacher_model.eval() teacher_output = self._teacher_model(*args, **kwargs) @@ -551,11 +547,7 @@ def _forward(self, *args, **kwargs): def get_tensor_shapes_adjust_fn_for_distillation( - model: torch.nn.Module | list[torch.nn.Module], - seq_length: int, - micro_batch_size: int, - decoder_seq_length: int | None = None, - forward_only: bool = False, + model: torch.nn.Module | list[torch.nn.Module], **kwargs ) -> Callable | None: """Return the function to adjust tensor shapes for Distillation in Megatron-Core's forward pass. @@ -563,8 +555,7 @@ def get_tensor_shapes_adjust_fn_for_distillation( Concatenates sizes of student and teacher output tensors for inter-process communication. 
""" if ( - forward_only - or parallel_state.get_pipeline_model_parallel_world_size() == 1 + parallel_state.get_pipeline_model_parallel_world_size() == 1 or parallel_state.get_virtual_pipeline_model_parallel_world_size() is not None ): return None @@ -584,20 +575,10 @@ def adjust_tensor_shapes( cp_group = parallel_state.get_context_parallel_group() teacher_recv_tensor_shapes = get_tensor_shapes( - seq_length=seq_length, - micro_batch_size=micro_batch_size, - decoder_seq_length=decoder_seq_length, - config=teacher_config, - tp_group=tp_group, - cp_group=cp_group, + config=teacher_config, tp_group=tp_group, cp_group=cp_group, **kwargs ) teacher_send_tensor_shapes = get_tensor_shapes( - seq_length=seq_length, - micro_batch_size=micro_batch_size, - decoder_seq_length=decoder_seq_length, - config=teacher_config, - tp_group=tp_group, - cp_group=cp_group, + config=teacher_config, tp_group=tp_group, cp_group=cp_group, **kwargs ) model.set_student_input_tensor_shape(recv_tensor_shapes) From b5042961c60bc77b4ac118bc24d19ae89d4a9a00 Mon Sep 17 00:00:00 2001 From: Chenjie Luo <108829653+cjluo-nv@users.noreply.github.com> Date: Wed, 17 Sep 2025 14:52:37 -0700 Subject: [PATCH 22/27] Deprecate TRTLLM-build in examples (#297) Signed-off-by: Chenjie Luo Signed-off-by: Chenjie Luo <108829653+cjluo-nv@users.noreply.github.com> Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Co-authored-by: Chenjie Luo Co-authored-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Signed-off-by: Ye Yu --- .github/CODEOWNERS | 1 - CHANGELOG.rst | 10 +- .../_installation_for_Linux.rst | 2 +- examples/gpt-oss/requirements.txt | 2 +- examples/llm_eval/README.md | 6 +- examples/llm_ptq/README.md | 8 +- examples/llm_ptq/hf_ptq.py | 117 +++---- examples/llm_ptq/modelopt_to_tensorrt_llm.py | 310 ----------------- examples/llm_ptq/run_tensorrt_llm.py | 5 +- .../llm_ptq/scripts/huggingface_example.sh | 219 +++--------- examples/llm_ptq/scripts/parser.sh | 32 +- .../notebooks/QAT_QAD_Walkthrough.ipynb | 2 +- examples/llm_sparsity/README.md | 3 +- examples/vlm_eval/README.md | 42 --- examples/vlm_eval/convert_gqa_for_eval.py | 35 -- examples/vlm_eval/gqa.sh | 147 -------- examples/vlm_eval/model_gqa_loader.py | 325 ------------------ examples/vlm_eval/requirements-vila.txt | 2 - examples/vlm_eval/vlm_eval_utils.py | 44 --- examples/vlm_ptq/README.md | 2 +- .../vlm_ptq/scripts/huggingface_example.sh | 192 ++--------- examples/vlm_ptq/vlm_run.py | 128 ------- examples/vlm_ptq/vlm_visual_engine.py | 26 -- modelopt/deploy/llm/generate.py | 202 ++++------- tests/_test_utils/examples/run_command.py | 8 +- tests/_test_utils/model.py | 5 + tests/_test_utils/ptq_utils.py | 12 +- tests/examples/llm_eval/test_llm_eval.py | 8 +- tests/examples/llm_ptq/test_llm_ptq.py | 74 +--- .../speculative_decoding/test_medusa.py | 6 +- .../{test_llava.py => test_qwen_vl.py} | 8 +- ...unified_hf_export_and_check_safetensors.py | 3 - 32 files changed, 269 insertions(+), 1717 deletions(-) delete mode 100644 examples/llm_ptq/modelopt_to_tensorrt_llm.py delete mode 100644 examples/vlm_eval/README.md delete mode 100644 examples/vlm_eval/convert_gqa_for_eval.py delete mode 100644 examples/vlm_eval/gqa.sh delete mode 100644 examples/vlm_eval/model_gqa_loader.py delete mode 100644 examples/vlm_eval/requirements-vila.txt delete mode 100644 examples/vlm_eval/vlm_eval_utils.py delete mode 100644 examples/vlm_ptq/vlm_run.py delete mode 100644 examples/vlm_ptq/vlm_visual_engine.py rename tests/examples/vlm_ptq/{test_llava.py => 
test_qwen_vl.py} (80%) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 1aa17ae91..924861c1d 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -50,6 +50,5 @@ modelopt/torch/utils @NVIDIA/modelopt-torch-utils-codeowners /examples/onnx_ptq @NVIDIA/modelopt-onnx-codeowners /examples/pruning @NVIDIA/modelopt-torch-nas-prune-codeowners /examples/speculative_decoding @NVIDIA/modelopt-torch-speculative-codeowners -/examples/vlm_eval @NVIDIA/modelopt-examples-vlm-codeowners /examples/vlm_ptq @NVIDIA/modelopt-examples-vlm-codeowners /examples/windows @NVIDIA/modelopt-windows-codeowners diff --git a/CHANGELOG.rst b/CHANGELOG.rst index fffda4832..6fde2bcb9 100755 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -5,12 +5,17 @@ Model Optimizer Changelog (Linux) ^^^^^^^^^^^^^^^^^ **Deprecations** -- Deprecated ``quantize_mode`` argument in ``examples/onnx_ptq/evaluate.py`` to support strongly typing. Use ``engine_precision`` instead. -**Bug Fixes** +- Deprecated ``quantize_mode`` argument in ``examples/onnx_ptq/evaluate.py`` to support strongly typing. Use ``engine_precision`` instead. +- Deprecated TRT-LLM's TRT backend in ``examples/llm_ptq`` and ``examples/vlm_ptq``. Tasks ``build`` and ``benchmark`` support are removed and replaced with ``quant``. For performance evaluation, please use ``trtllm-bench`` directly. +- ``--export_fmt`` flag in ``examples/llm_ptq`` is removed. By default we export to the unified Hugging Face checkpoint format. +- ``int8_sq`` quantization format is deprecated from the ``examples/vlm_ptq`` with respect to the TensorRT-LLM's torch backend switch. Please refer to the previous releases if this quantization format is needed. +- Deprecated ``examples/vlm_eval`` as it depends on the deprecated TRT-LLM's TRT backend. **New Features** + - ``high_precision_dtype`` default to fp16 in ONNX quantization, i.e. quantized output model weights are now FP16 by default. +- Upgrade TensorRT-LLM dependency to 1.1.0rc2. 0.35 (2025-09-04) ^^^^^^^^^^^^^^^^^ @@ -23,7 +28,6 @@ Model Optimizer Changelog (Linux) **Bug Fixes** - Fix attention head ranking logic for pruning Megatron Core GPT models. -- Upgrade TensorRT-LLM dependency to 1.1.0rc2. 
**New Features** diff --git a/docs/source/getting_started/_installation_for_Linux.rst b/docs/source/getting_started/_installation_for_Linux.rst index 24ae8ffec..42892966a 100644 --- a/docs/source/getting_started/_installation_for_Linux.rst +++ b/docs/source/getting_started/_installation_for_Linux.rst @@ -18,7 +18,7 @@ Latest Model Optimizer (``nvidia-modelopt``) currently has the following system +-------------------------+-----------------------------+ | PyTorch | >=2.6 | +-------------------------+-----------------------------+ -| TensorRT-LLM (Optional) | 1.0.0rc6 | +| TensorRT-LLM (Optional) | 1.1.0rc2.post2 | +-------------------------+-----------------------------+ | ONNX Runtime (Optional) | 1.22 | +-------------------------+-----------------------------+ diff --git a/examples/gpt-oss/requirements.txt b/examples/gpt-oss/requirements.txt index dead5cb74..4d75b59c3 100644 --- a/examples/gpt-oss/requirements.txt +++ b/examples/gpt-oss/requirements.txt @@ -3,7 +3,7 @@ datasets deepspeed kernels>=0.9.0 peft>=0.17.0 -torch >= 2.8.0 +torch>2.7.1 trackio transformers>=4.55.0 trl>=0.21.0 diff --git a/examples/llm_eval/README.md b/examples/llm_eval/README.md index a73495961..0e1855d99 100644 --- a/examples/llm_eval/README.md +++ b/examples/llm_eval/README.md @@ -93,7 +93,7 @@ If `trust_remote_code` needs to be true, please append the command with the `--t ### TensorRT-LLM ```sh -python lm_eval_tensorrt_llm.py --model trt-llm --model_args tokenizer=,engine_dir= --tasks --batch_size +python lm_eval_tensorrt_llm.py --model trt-llm --model_args tokenizer=,engine_dir= --tasks --batch_size ``` ## MMLU @@ -140,7 +140,7 @@ python mmlu.py --model_name causal --model_path ### Evaluate the TensorRT-LLM engine ```bash -python mmlu.py --model_name causal --model_path --engine_dir +python mmlu.py --model_name causal --model_path --engine_dir ``` ## MT-Bench @@ -163,7 +163,7 @@ bash run_fastchat.sh -h --quant_cfg MODELOPT_QUA ### Evaluate the TensorRT-LLM engine ```bash -bash run_fastchat.sh -h +bash run_fastchat.sh -h ``` ### Judging the responses diff --git a/examples/llm_ptq/README.md b/examples/llm_ptq/README.md index 833412619..2c95b8316 100755 --- a/examples/llm_ptq/README.md +++ b/examples/llm_ptq/README.md @@ -203,7 +203,7 @@ scripts/huggingface_example.sh --type llama --model $HF_PATH --quant w4a8_awq,fp The above example perform `AutoQuantize` where the less quantization accuracy sensitive layers are quantized with `w4a8_awq` (specified by `--quant w4a8_awq`) and the more sensitive layers are kept un-quantized such that the effective bits is 4.8 (specified by `--auto_quantize_bits 4.8`). -The example scripts above also have an additional flag `--tasks`, where the actual tasks run in the script can be customized. The allowed tasks are `build,mmlu,benchmark,lm_eval,livecodebench` specified in the script [parser](./scripts/parser.sh). The tasks combo can be specified with a comma-separated task list. Some tasks like mmlu can take a long time to run. To run lm_eval tasks, please also specify the `--lm_eval_tasks` flag with comma separated lm_eval tasks [here](https://github.com/EleutherAI/lm-evaluation-harness/tree/main/lm_eval/tasks). +The example scripts above also have an additional flag `--tasks`, where the actual tasks run in the script can be customized. The allowed tasks are `quant,mmlu,lm_eval,livecodebench` specified in the script [parser](./scripts/parser.sh). The tasks combo can be specified with a comma-separated task list. Some tasks like mmlu can take a long time to run. 
To run lm_eval tasks, please also specify the `--lm_eval_tasks` flag with comma separated lm_eval tasks [here](https://github.com/EleutherAI/lm-evaluation-harness/tree/main/lm_eval/tasks). > *If GPU out-of-memory error is reported running the scripts, please try editing the scripts and reducing the max batch size to save GPU memory.* @@ -251,7 +251,7 @@ scripts/huggingface_example.sh --model $HF_PATH --quant [fp8|nvfp4|int8_sq|int4_ > *If a GPU OOM error occurs during model quantization despite sufficient memory, setting the --use_seq_device_map flag can help. This enforces sequential device mapping, distributing the model across GPUs and utilizing up to 80% of each GPU's memory.* -> *You can now add `--low_memory_mode` to the command when setting `--export_fmt=hf` to lower the memory requirements of the PTQ process. With this mode, the script will compress model weights to low precision before calibration. This mode is only supported for FP8 and NVFP4 with max calibration.* +> *You can add `--low_memory_mode` to the command to lower the memory requirements of the PTQ process. With this mode, the script will compress model weights to low precision before calibration. This mode is only supported for FP8 and NVFP4 with max calibration.* #### Deepseek R1 @@ -301,7 +301,7 @@ with torch.inference_mode(): ### Quantize and Export ```bash -python hf_ptq.py --pyt_ckpt_path --qformat fp8 --export_fmt hf --export_path --trust_remote_code +python hf_ptq.py --pyt_ckpt_path --qformat fp8 --export_path --trust_remote_code ``` ### Hugging Face framework [Script](./scripts/huggingface_example.sh) @@ -309,7 +309,7 @@ python hf_ptq.py --pyt_ckpt_path --qformat fp8 --export Alternatively, the framework script `huggingface_example.sh` also supports quantize and export: ```bash -scripts/huggingface_example.sh --model --quant fp8 --export_fmt hf +scripts/huggingface_example.sh --model --quant fp8 ``` ### Deployment diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index 81f4b6392..119a34f21 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -89,28 +89,20 @@ def auto_quantize( qformat_list = qformat.split(",") assert qformat_list, "No quantization formats provided" # Check if all provided quantization formats are supported - if args.export_fmt == "hf": - assert all( - qformat - in [ - "fp8", - "int4_awq", - "nvfp4", - "nvfp4_awq", - "w4a8_awq", - "fp8_pb_wo", - "w4a8_mxfp4_fp8", - "nvfp4_mlp_only", - ] - for qformat in qformat_list - ), ( - "One or more quantization formats provided are not supported for unified checkpoint export" - ) - else: - assert all( - qformat in ["fp8", "int8_sq", "int4_awq", "w4a8_awq", "nvfp4", "nvfp4_awq"] - for qformat in qformat_list - ), "One or more quantization formats provided are not supported for tensorrt llm export" + assert all( + qformat + in [ + "fp8", + "int4_awq", + "nvfp4", + "nvfp4_awq", + "w4a8_awq", + "fp8_pb_wo", + "w4a8_mxfp4_fp8", + "nvfp4_mlp_only", + ] + for qformat in qformat_list + ), "One or more quantization formats provided are not supported for unified checkpoint export" def loss_func(output, data): # For transformers AutoModelForCausalLM models, the outputs are wrapped in `CausalLMOutputWithPast` @@ -219,27 +211,21 @@ def main(args): "Quantization supports only one quantization format." 
) - # Check arguments for unified_hf export format and set to default if unsupported arguments are provided - if args.export_fmt == "hf": - assert args.sparsity_fmt == "dense", ( - f"Sparsity format {args.sparsity_fmt} not supported by unified export api." - ) - - if not args.auto_quantize_bits: - assert ( - args.qformat - in [ - "int4_awq", - "fp8", - "nvfp4", - "nvfp4_awq", - "w4a8_awq", - "fp8_pb_wo", - "w4a8_mxfp4_fp8", - "nvfp4_mlp_only", - ] - or args.kv_cache_qformat in KV_QUANT_CFG_CHOICES - ), f"Quantization format {args.qformat} not supported for HF export path" + if not args.auto_quantize_bits: + assert ( + args.qformat + in [ + "int4_awq", + "fp8", + "nvfp4", + "nvfp4_awq", + "w4a8_awq", + "fp8_pb_wo", + "w4a8_mxfp4_fp8", + "nvfp4_mlp_only", + ] + or args.kv_cache_qformat in KV_QUANT_CFG_CHOICES + ), f"Quantization format {args.qformat} not supported for HF export path" # If low memory mode is enabled, we compress the model while loading the HF checkpoint. calibration_only = False @@ -253,9 +239,6 @@ def main(args): attn_implementation=args.attn_implementation, ) else: - assert args.export_fmt == "hf", ( - "Low memory mode is only supported for exporting HF checkpoint." - ) assert args.qformat in QUANT_CFG_CHOICES, ( f"Quantization format is not supported for low memory mode. Supported formats: {QUANT_CFG_CHOICES.keys()}" ) @@ -600,34 +583,41 @@ def output_decode(generated_ids, input_shape): setattr(model.config, "architectures", full_model_config.architectures) start_time = time.time() - if args.export_fmt == "tensorrt_llm": + if ( + model_type in ["t5", "bart", "whisper"] + or args.sparsity_fmt != "dense" + or "int8_sq" in args.qformat + ): + warnings.warn( + "Still exporting TensorRT-LLM checkpoints for models not supported by the TensorRT-LLM torch runtime." + ) + # Move meta tensor back to device before exporting. remove_hook_from_module(model, recurse=True) - dtype = None - if "w4a8_awq" in args.qformat: - # TensorRT-LLM w4a8 only support fp16 as the dtype. - dtype = torch.float16 - - # For Gemma2-27B, TRT-LLM only works with bfloat16 as the dtype. - if model_type == "gemma2": - dtype = torch.bfloat16 - export_tensorrt_llm_checkpoint( model, model_type, - dtype=dtype, export_dir=export_path, inference_tensor_parallel=args.inference_tensor_parallel, inference_pipeline_parallel=args.inference_pipeline_parallel, ) - elif args.export_fmt == "hf": + else: + # Check arguments for unified_hf export format and set to default if unsupported arguments are provided + assert args.sparsity_fmt == "dense", ( + f"Sparsity format {args.sparsity_fmt} not supported by unified export api." + ) + + if args.inference_tensor_parallel != 1 or args.inference_pipeline_parallel != 1: + warnings.warn( + "Unified HF export format does not specify inference tensor parallel or pipeline parallel. " + "They will be set at deployment time." + ) + export_hf_checkpoint( full_model, export_dir=export_path, ) - else: - raise NotImplementedError(f"{args.export_fmt} not supported") # Restore default padding and export the tokenizer as well. if tokenizer is not None: @@ -710,9 +700,9 @@ def output_decode(generated_ids, input_shape): parser.add_argument( "--export_fmt", required=False, - default="tensorrt_llm", + default="hf", choices=["tensorrt_llm", "hf"], - help=("Checkpoint export format"), + help="Deprecated. 
Please avoid using this argument.", ) parser.add_argument( "--trust_remote_code", @@ -767,6 +757,9 @@ def output_decode(generated_ids, input_shape): args = parser.parse_args() + if args.export_fmt != "hf": + warnings.warn("Deprecated. --export_fmt forced to hf.") + args.dataset = args.dataset.split(",") if args.dataset else None args.calib_size = [int(num_sample) for num_sample in args.calib_size.split(",")] main(args) diff --git a/examples/llm_ptq/modelopt_to_tensorrt_llm.py b/examples/llm_ptq/modelopt_to_tensorrt_llm.py deleted file mode 100644 index e3e1ea872..000000000 --- a/examples/llm_ptq/modelopt_to_tensorrt_llm.py +++ /dev/null @@ -1,310 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""An example to convert an Model Optimizer exported model to tensorrt_llm.""" - -import argparse -import subprocess -import warnings -from pathlib import Path - -import tensorrt_llm -import torch -from packaging import version -from packaging.version import parse -from tensorrt_llm.llmapi import BuildConfig -from tensorrt_llm.models import PretrainedConfig -from transformers import AutoTokenizer - -try: - # run depends on features from the min-supported TensorRT-LLM - from run_tensorrt_llm import run -except Exception as e: - warnings.warn(f"Cannot run TensorRT-LLM inference: {e}") - run = None - - -MIN_TENSORRT_LLM_VERSION = "0.13.0" - - -def str2bool(v): - if isinstance(v, bool): - return v - if v.lower() in ("yes", "true", "t", "y", "1"): - return True - elif v.lower() in ("no", "false", "f", "n", "0"): - return False - else: - raise argparse.ArgumentTypeError("Boolean value expected.") - - -def build_tensorrt_llm( - pretrained_config: str | Path, - engine_dir: str | Path, - max_input_len: int = 200, - max_output_len: int = 200, - max_batch_size: int = 1, - max_beam_width: int = 1, - max_num_tokens: int | None = None, - num_build_workers: int = 1, - enable_sparsity: bool = False, - max_prompt_embedding_table_size: int = BuildConfig.max_prompt_embedding_table_size, - max_encoder_input_len: int = BuildConfig.max_encoder_input_len, - perf_mode: bool = False, -): - """The API to convert the TensorRT-LLM checkpoint to engines. - - Args: - pretrained_config: The pretrained_config (file path) exported by - ``modelopt.torch.export.export_tensorrt_llm_checkpoint``. - engine_dir: The target output directory to save the built tensorrt_llm engines. - max_input_len: The max input sequence length. - max_output_len: The max output sequence length. - max_batch_size: The max batch size. - max_beam_width: The max beam search width. - max_num_tokens: The max number of tokens that can be processed at the same time. - For the context phase, the max_num_tokens counts the full sequence length. - For the generation phase, the max_num_tokens counts only the ones under generation - as the input sequence has been processed as cached. 
- max_num_tokens should fall between [max_batch_size * max_beam_width, max_batch_size * max_input_len]. - when inflight batching is enabled. - Higher max_num_tokens means more GPU memory will be used for resource allocation. - If not specified the max_num_tokens will be set to the max bound. - Details: https://nvidia.github.io/TensorRT-LLM/performance/performance-tuning-guide/tuning-max-batch-size-and-max-num-tokens.html - num_build_workers: The number of workers to use for the building process. - If build time is a concern, you can increase this worker count to num of GPUs. - At a lost of higher CPU memory usage footprint. - If CPU memory is limited, num_build_workers should be set to 1 to conserve memory. - enable_sparsity: The switch to enable sparsity for TRT compiler. - With this flag, the TRT compiler will search tactics of sparse kernels for each node of which - weight tensors are sparsified. This increases engine building time significantly. - max_prompt_embedding_table_size: Length of the prepended/concatenated embeddings (either multimodal - feature embeddings or prompt tuning embeddings) to the LLM input embeddings. - max_encoder_input_len: Maximum encoder input length for enc-dec models. - perf_mode: Whether build the engine with max perf at a cost of longer build time and less flexibility. - checkpoint_format: The model checkpoint format. Choose between [tensorrt_llm, hf]. - tp: tensor_parallel_size. Effective for hf checkpoint_format only. - """ - engine_dir = Path(engine_dir) - engine_dir.mkdir(parents=True, exist_ok=True) - - pretrained_config_path = Path(pretrained_config) - assert pretrained_config_path.exists() - ckpt_dir = pretrained_config_path.parent - - timing_cache_file = ( - torch.cuda.get_device_name().replace(" ", "_") - + "_trtllm_" - + tensorrt_llm.__version__ - + ".cache" - ) - timing_cache_path = engine_dir / timing_cache_file - - if not max_num_tokens: - # tensorrt-llm recommends max max_num_tokens to be 16384 - max_num_tokens = min(max_batch_size * max_input_len, 16384) - - config = PretrainedConfig.from_json_file(pretrained_config_path) - - log_level = "warning" - - use_paged_context_fmha = config.quantization.quant_algo in [ - "FP8", - "W4A8_AWQ", - "NVFP4", - None, - ] - - # FP8 FMHA for gemma is not supported in tensorrt_llm < 0.19.0 - if "GemmaForCausalLM" in config.architecture and version.parse( - tensorrt_llm.__version__ - ) < version.parse("0.19.0"): - use_paged_context_fmha = False - - use_fused_mlp = "RecurrentGemma" not in config.architecture - if config.quantization.exclude_modules: - for module_name in config.quantization.exclude_modules: - # fp8_context_fhma requires all attention.dense to be quantized - if "attention.dense" in module_name: - use_paged_context_fmha = False - # For AutoQuant, fc and gate might not be quantized at the same time - # TODO: relax this limitation on the TRT-LLM side - if "gate" in module_name or "fc" in module_name: - use_fused_mlp = False - - quant_algo = config.quantization.quant_algo - use_qdq = quant_algo in ["FP8", "W8A8_SQ_PER_CHANNEL"] - - speculative_decoding_mode = "medusa" if "Medusa" in config.architecture else None - - if num_build_workers > torch.cuda.device_count(): - num_build_workers = torch.cuda.device_count() - print(f"Cap num_build_workers to num gpus: ${num_build_workers}") - - build_cmd = "trtllm-build " - build_cmd += f"--checkpoint_dir {ckpt_dir} " - build_cmd += f"--input_timing_cache {timing_cache_path} " - build_cmd += f"--output_timing_cache {timing_cache_path} " - build_cmd += 
f"--log_level {log_level} " - build_cmd += f"--output_dir {engine_dir} " - build_cmd += f"--workers {num_build_workers} " - build_cmd += f"--max_batch_size {max_batch_size} " - build_cmd += f"--max_input_len {max_input_len} " - build_cmd += f"--max_seq_len {max_output_len + max_input_len} " - build_cmd += f"--max_beam_width {max_beam_width} " - build_cmd += f"--max_prompt_embedding_table_size {max_prompt_embedding_table_size} " - build_cmd += f"--max_encoder_input_len {max_encoder_input_len} " - build_cmd += ( - "--reduce_fusion enable " - if config.mapping.pp_size == 1 - and config.architecture - not in [ - "DbrxForCausalLM", - "BaichuanForCausalLM", - "QWenForCausalLM", - "GPTForCausalLM", - ] - else "" - ) - - if use_fused_mlp: - build_cmd += "--use_fused_mlp enable " - else: - build_cmd += "--use_fused_mlp disable " - - if enable_sparsity: - build_cmd += "--weight_sparsity " - - # Low batch size scenario - if max_batch_size <= 4 and quant_algo == "FP8": - build_cmd += "--gemm_plugin fp8 " - if quant_algo == "NVFP4": - build_cmd += "--gemm_plugin nvfp4 " - elif not use_qdq: - build_cmd += "--gemm_plugin auto " - - build_cmd += f"--max_num_tokens {max_num_tokens} " - - if speculative_decoding_mode: - build_cmd += f"--speculative_decoding_mode {speculative_decoding_mode} " - - if use_paged_context_fmha: - build_cmd += "--use_paged_context_fmha enable " - - if perf_mode: - build_cmd += "--multiple_profiles enable" - elif not speculative_decoding_mode: - build_cmd += "--gather_context_logits " # for evaluation benchmarking purpose - - print(f"trtllm-build command:\n{build_cmd}") - - assert parse(tensorrt_llm.__version__) >= parse(MIN_TENSORRT_LLM_VERSION), ( - f"Detected lower version of tensorrt_llm installed instead of {MIN_TENSORRT_LLM_VERSION}. " - f"Please build the tensorrt_llm engines with tensorrt_llm version {MIN_TENSORRT_LLM_VERSION} " - " or higher instead.\n\n Build command: {build_cmd}" - ) - subprocess.run(build_cmd, shell=True, check=True) - - try: - tokenizer = AutoTokenizer.from_pretrained(ckpt_dir) - tokenizer.save_pretrained(engine_dir) - except Exception as e: - warnings.warn(f"Cannot copy tokenizer to the engine dir. {e}") - - -def parse_arguments(): - parser = argparse.ArgumentParser() - parser.add_argument("--model_config", type=str, default="") - parser.add_argument("--max_output_len", type=int, default=512) - parser.add_argument("--max_input_len", type=int, default=2048) - parser.add_argument("--max_batch_size", type=int, default=8) - parser.add_argument("--max_num_beams", type=int, default=1) - parser.add_argument( - "--perf", - action="store_true", - help="Build engines for max perf benchmark", - ) - parser.add_argument("--engine_dir", type=str, default="/tmp/modelopt") - parser.add_argument("--tokenizer", type=str, default="") - parser.add_argument( - "--input_texts", - type=str, - default=( - "Born in north-east France, Soyer trained as a|Born in California, Soyer trained as a" - ), - help="Input texts. 
Please use | to separate different batches.", - ) - parser.add_argument("--num_build_workers", type=int, default="1") - parser.add_argument("--enable_sparsity", type=str2bool, default=False) - parser.add_argument( - "--max_prompt_embedding_table_size", - "--max_multimodal_len", - type=int, - default=BuildConfig.max_prompt_embedding_table_size, - help="Maximum prompt embedding table size for prompt tuning, " - "or maximum multimodal input size for multimodal models.", - ) - parser.add_argument( - "--max_encoder_input_len", - type=int, - default=BuildConfig.max_encoder_input_len, - help="Maximum encoder input length for enc-dec models.", - ) - parser.add_argument( - "--trust_remote_code", - help="Set trust_remote_code for Huggingface models and tokenizers", - default=False, - action="store_true", - ) - parser.add_argument( - "--skip_run", - help="Skip the inference run", - default=False, - action="store_true", - ) - - return parser.parse_args() - - -def main(args): - build_tensorrt_llm( - pretrained_config=args.model_config, - engine_dir=args.engine_dir, - max_input_len=args.max_input_len, - max_output_len=args.max_output_len, - max_batch_size=args.max_batch_size, - max_beam_width=args.max_num_beams, - num_build_workers=args.num_build_workers, - enable_sparsity=args.enable_sparsity, - max_prompt_embedding_table_size=args.max_prompt_embedding_table_size, - max_encoder_input_len=args.max_encoder_input_len, - perf_mode=args.perf, - ) - - if ( - args.model_config is not None - and all(model_name not in args.model_config for model_name in ("vila", "llava")) - and run is not None - ): - # Reduce output_len for the inference run example. - args.max_output_len = 100 - - if not args.skip_run: - run(args) - - -if __name__ == "__main__": - args = parse_arguments() - main(args) diff --git a/examples/llm_ptq/run_tensorrt_llm.py b/examples/llm_ptq/run_tensorrt_llm.py index a414496aa..c3152959a 100644 --- a/examples/llm_ptq/run_tensorrt_llm.py +++ b/examples/llm_ptq/run_tensorrt_llm.py @@ -80,9 +80,8 @@ def run(args): outputs = llm.generate_tokens(input_texts, args.max_output_len) print(f"Generated tokens: {outputs}") - if llm.gather_context_logits: - logits = llm.generate_context_logits(input_texts) - print(f"Generated logits: {logits}") + logits = llm.generate_context_logits(input_texts) + print(f"Generated logits: {logits}") if __name__ == "__main__": diff --git a/examples/llm_ptq/scripts/huggingface_example.sh b/examples/llm_ptq/scripts/huggingface_example.sh index 8f18c5ac8..8878b824c 100755 --- a/examples/llm_ptq/scripts/huggingface_example.sh +++ b/examples/llm_ptq/scripts/huggingface_example.sh @@ -34,27 +34,6 @@ if [ -z "$MODEL_PATH" ]; then exit 1 fi -#Check if arguments are supported by HF export path -if [ "$EXPORT_FORMAT" = "hf" ]; then - if [ "$SPARSITY_FMT" != "dense" ]; then - echo "Unsupported sparsity argument: Expected dense" >&2 - exit 1 - fi - - #Iterate over list of qformats provided and check if they are supported in HF export path - IFS="," - for qformat in $QFORMAT; do - case $qformat in - fp16 | bf16 | fp8 | fp8_pc_pt | fp8_pb_wo | int4_awq | nvfp4 | nvfp4_awq | w4a8_awq | w4a8_nvfp4_fp8 | w4a8_mxfp4_fp8) ;; - *) - echo "Unsupported quant argument: Expected one of: [fp16, bf16, fp8, fp8_pc_pt, fp8_pb_wo, int4_awq, nvfp4, nvfp4_awq, w4a8_awq, w4a8_nvfp4_fp8, w4a8_mxfp4_fp8]" >&2 - exit 1 - ;; - esac - done - IFS=" " -fi - # Check if ENABLE_SPARSITY environment variable is set to "true" if [ "$SPARSITY_FMT" = "dense" ]; then ENABLE_SPARSITY=false @@ -83,26 +62,6 @@ for qformat in 
$QFORMAT; do done IFS=" " -case $TP in -1 | 2 | 4 | 8) ;; -*) - echo "Unknown tp argument: Expected one of: [1, 2, 4, 8]" >&2 - exit 1 - ;; -esac - -case $PP in -1 | 2 | 4 | 8) ;; -*) - echo "Unknown pp argument: Expected one of: [1, 2, 4, 8]" >&2 - exit 1 - ;; -esac - -GPU_NAME=$(nvidia-smi --id 0 --query-gpu=name --format=csv,noheader,nounits | sed 's/ /_/g') - -echo "Using the following config: max input $BUILD_MAX_INPUT_LEN max output $BUILD_MAX_OUTPUT_LEN max batch $BUILD_MAX_BATCH_SIZE" - script_dir="$(dirname "$(readlink -f "$0")")" pushd $script_dir/.. @@ -113,24 +72,13 @@ fi QFORMAT_MODIFIED="${QFORMAT//,/_}" -MODEL_NAME=$(basename $MODEL_PATH | sed 's/[^0-9a-zA-Z\-]/_/g') - -MODEL_FULL_NAME=${MODEL_NAME}_${SPARSITY_FMT}_${QFORMAT_MODIFIED}${KV_CACHE_QUANT:+_kv_${KV_CACHE_QUANT}}_tp${TP}_pp${PP} +MODEL_NAME=$(basename $MODEL_PATH | sed 's/[^0-9a-zA-Z\-]/_/g')_${QFORMAT_MODIFIED}${KV_CACHE_QUANT:+_kv_${KV_CACHE_QUANT}} -if [ $EXPORT_FORMAT != "tensorrt_llm" ]; then - MODEL_FULL_NAME=${MODEL_NAME}_${QFORMAT_MODIFIED}${KV_CACHE_QUANT:+_kv_${KV_CACHE_QUANT}}_${EXPORT_FORMAT} -fi - -SAVE_PATH=${ROOT_SAVE_PATH}/saved_models_${MODEL_FULL_NAME} +SAVE_PATH=${ROOT_SAVE_PATH}/saved_models_${MODEL_NAME} MODEL_CONFIG=${SAVE_PATH}/config.json -ENGINE_DIR=${SAVE_PATH}/${TP}x${PP}x${GPU_NAME}_input${BUILD_MAX_INPUT_LEN}_output${BUILD_MAX_OUTPUT_LEN}_batch${BUILD_MAX_BATCH_SIZE}_engine -if [ $EXPORT_FORMAT = "hf" ]; then - ENGINE_DIR=${SAVE_PATH} -fi - -mkdir -p $ENGINE_DIR +mkdir -p $SAVE_PATH if [ "${REMOVE_EXISTING_MODEL_CONFIG,,}" = "true" ]; then rm -f $MODEL_CONFIG @@ -180,14 +128,13 @@ else MODEL_CONFIG_EXIST=false fi -if [[ $TASKS =~ "build" ]] || [[ ! -d "$ENGINE_DIR" ]] || [[ ! $(ls -A $ENGINE_DIR) ]]; then +if [[ $TASKS =~ "quant" ]] || [[ ! -d "$SAVE_PATH" ]] || [[ ! $(ls -A $SAVE_PATH) ]]; then - if [ "$EXPORT_FORMAT" == "hf" ] && ([ "$qformat" == "bf16" ] || [ "$qformat" == "fp16" ]); then + if [ "$qformat" == "bf16" ] || [ "$qformat" == "fp16" ]; then if [ -d "$MODEL_PATH" ]; then MODEL_CONFIG_EXIST=true MODEL_CONFIG=$MODEL_PATH/config.json - mkdir -p $ENGINE_DIR - for file in $MODEL_PATH/*; do ln -sf "$file" $ENGINE_DIR/; done + for file in $MODEL_PATH/*; do ln -sf "$file" $SAVE_PATH/; done else echo "Please use the model directory where the config.json file is present." exit 1 @@ -205,7 +152,6 @@ if [[ $TASKS =~ "build" ]] || [[ ! -d "$ENGINE_DIR" ]] || [[ ! $(ls -A $ENGINE_D --batch_size=$CALIB_BATCH_SIZE \ --inference_tensor_parallel=$TP \ --inference_pipeline_parallel=$PP \ - --export_fmt=$EXPORT_FORMAT \ $PTQ_ARGS \ $AWQ_ARGS else @@ -218,46 +164,30 @@ if [[ $TASKS =~ "build" ]] || [[ ! -d "$ENGINE_DIR" ]] || [[ ! $(ls -A $ENGINE_D exit 0 fi + if [[ "$SPARSITY_FMT" != "dense" ]]; then + echo "Sparse quantization detected (SPARSITY_FMT=$SPARSITY_FMT). Please deploy with the TRT-LLM using trtllm-build. Checkpoint export_path: $SAVE_PATH" + exit 0 + fi + if [[ "$QFORMAT" == *"nvfp4"* ]] || [[ "$KV_CACHE_QUANT" == *"nvfp4"* ]]; then cuda_major=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader -i 0 | cut -d. -f1) if [ "$cuda_major" -lt 10 ]; then - echo "Please build the tensorrt_llm engine on Blackwell GPU for deployment. Checkpoint export_path: $SAVE_PATH" + echo "Please deploy the NVFP4 checkpoint on a Blackwell GPU. Checkpoint export_path: $SAVE_PATH" exit 0 fi fi - if [ $EXPORT_FORMAT = "tensorrt_llm" ]; then - echo "Building tensorrt_llm engine from Model Optimizer-quantized model..." - - BUILD_ARGS="" - if [[ $TASKS =~ "benchmark" && ! 
$TASKS =~ "lm_eval" ]]; then - BUILD_ARGS+=" --perf " - fi - - python modelopt_to_tensorrt_llm.py \ - --model_config=$MODEL_CONFIG \ - --engine_dir=$ENGINE_DIR \ - --tokenizer=$MODEL_PATH \ - --max_input_len=$BUILD_MAX_INPUT_LEN \ - --max_output_len=$BUILD_MAX_OUTPUT_LEN \ - --max_batch_size=$BUILD_MAX_BATCH_SIZE \ - --num_build_workers=$GPUS \ - --enable_sparsity=$ENABLE_SPARSITY \ - $BUILD_ARGS - else - - if [[ ! " fp8 nvfp4 bf16 fp16 " =~ " ${QFORMAT} " ]]; then - echo "Quant $QFORMAT not supported with the TensorRT-LLM torch llmapi. Allowed values are: fp8, nvfp4, bf16, fp16" - exit 0 - fi - - if $TRUST_REMOTE_CODE; then - RUN_ARGS+=" --trust_remote_code " - fi + if [[ ! " fp8 nvfp4 bf16 fp16 " =~ " ${QFORMAT} " ]]; then + echo "Quant $QFORMAT specified. Please read TensorRT-LLM quantization support matrix https://nvidia.github.io/TensorRT-LLM/features/quantization.html#quantization-in-tensorrt-llm and use TensorRT-LLM for deployment. Checkpoint export_path: $SAVE_PATH" + exit 0 + fi - python run_tensorrt_llm.py --engine_dir=$ENGINE_DIR $RUN_ARGS + if $TRUST_REMOTE_CODE; then + RUN_ARGS+=" --trust_remote_code " fi + + python run_tensorrt_llm.py --engine_dir=$SAVE_PATH $RUN_ARGS fi if [[ -d "${MODEL_PATH}" ]]; then @@ -288,16 +218,18 @@ if [[ $TASKS =~ "lm_eval" ]]; then lm_eval_flags+=" --trust_remote_code " fi - LM_EVAL_RESULT=${ENGINE_DIR}/lm_eval.txt + LM_EVAL_RESULT=${SAVE_PATH}/lm_eval.txt echo "Evaluating lm_eval, result saved to $LM_EVAL_RESULT..." pushd ../llm_eval/ pip install -r requirements.txt + echo "Using the following config: max output $BUILD_MAX_OUTPUT_LEN max batch $BUILD_MAX_BATCH_SIZE" + python lm_eval_tensorrt_llm.py \ --model trt-llm \ - --model_args tokenizer=$MODEL_PATH,engine_dir=$ENGINE_DIR,max_gen_toks=$BUILD_MAX_OUTPUT_LEN \ + --model_args tokenizer=$MODEL_PATH,engine_dir=$SAVE_PATH,max_gen_toks=$BUILD_MAX_OUTPUT_LEN \ --tasks $LM_EVAL_TASKS \ --batch_size $BUILD_MAX_BATCH_SIZE $lm_eval_flags | tee $LM_EVAL_RESULT @@ -307,7 +239,7 @@ fi if [[ $TASKS =~ "mmlu" ]]; then - MMLU_RESULT=${ENGINE_DIR}/mmlu.txt + MMLU_RESULT=${SAVE_PATH}/mmlu.txt echo "Evaluating MMLU, result saved to $MMLU_RESULT..." pushd ../llm_eval/ @@ -327,7 +259,7 @@ if [[ $TASKS =~ "mmlu" ]]; then python mmlu.py \ --model_name causal \ --model_path $MODEL_ABS_PATH \ - --engine_dir $ENGINE_DIR \ + --engine_dir $SAVE_PATH \ --data_dir $MMLU_DATA_PATH | tee $MMLU_RESULT popd @@ -337,10 +269,10 @@ if [[ $TASKS =~ "mtbench" ]]; then pushd ../llm_eval/ - bash run_fastchat.sh -h $MODEL_ABS_PATH -e $ENGINE_DIR - find data/mt_bench/model_answer/ -type f -name '*.jsonl' -exec mv {} $ENGINE_DIR \; + bash run_fastchat.sh -h $MODEL_ABS_PATH -e $SAVE_PATH + find data/mt_bench/model_answer/ -type f -name '*.jsonl' -exec mv {} $SAVE_PATH \; - JSONL_PATH=$(readlink -f $(find $ENGINE_DIR -type f -name '*.jsonl')) + JSONL_PATH=$(readlink -f $(find $SAVE_PATH -type f -name '*.jsonl')) echo "FastChat generation complete. The results are saved under $JSONL_PATH . Please run the judge(https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge) to evaluate the quality of the responses." 
popd @@ -350,13 +282,13 @@ fi if [[ $TASKS =~ "livecodebench" || $TASKS =~ "simple_eval" ]]; then # Clean a previous session if exists pkill -f "trtllm-serve" && while pgrep -f "trtllm-serve" >/dev/null; do sleep 1; done - HASH=$(echo -n "$ENGINE_DIR" | md5sum | awk '{print $1}') + HASH=$(echo -n "$SAVE_PATH" | md5sum | awk '{print $1}') PORT=$((10000 + (0x${HASH:0:4} % 50001))) echo "Starting trtllm-serve on $PORT" - trtllm-serve $ENGINE_DIR --host 0.0.0.0 --port $PORT >$ENGINE_DIR/serve.txt 2>&1 & + trtllm-serve $SAVE_PATH --host 0.0.0.0 --port $PORT >$SAVE_PATH/serve.txt 2>&1 & SERVE_PID=$! - tail -f $ENGINE_DIR/serve.txt | while read line; do + tail -f $SAVE_PATH/serve.txt | while read line; do if echo "$line" | grep -q "Application startup complete"; then echo "Application startup complete." break @@ -370,16 +302,18 @@ if [[ $TASKS =~ "livecodebench" || $TASKS =~ "simple_eval" ]]; then pushd ../llm_eval/ if [[ $TASKS =~ "livecodebench" ]]; then - bash run_livecodebench.sh $MODEL_FULL_NAME $BUILD_MAX_BATCH_SIZE $BUILD_MAX_OUTPUT_LEN $PORT | tee $ENGINE_DIR/livecodebench.txt - mkdir -p $ENGINE_DIR/livecodebench - mv LiveCodeBench/output/$MODEL_FULL_NAME/* $ENGINE_DIR/livecodebench - echo "LiveCodeBench results are saved under $ENGINE_DIR/livecodebench." + echo "Using the following config: max output $BUILD_MAX_OUTPUT_LEN max batch $BUILD_MAX_BATCH_SIZE" + bash run_livecodebench.sh $MODEL_NAME $BUILD_MAX_BATCH_SIZE $BUILD_MAX_OUTPUT_LEN $PORT | tee $SAVE_PATH/livecodebench.txt + mkdir -p $SAVE_PATH/livecodebench + mv LiveCodeBench/output/$MODEL_NAME/* $SAVE_PATH/livecodebench + echo "LiveCodeBench results are saved under $SAVE_PATH/livecodebench." fi if [[ $TASKS =~ "simple_eval" ]]; then - bash run_simple_eval.sh $MODEL_FULL_NAME $SIMPLE_EVAL_TASKS $BUILD_MAX_OUTPUT_LEN $PORT | tee $ENGINE_DIR/simple_eval.txt - echo "Simple eval results are saved under $ENGINE_DIR/simple_eval.txt." + echo "Using the following config: max output $BUILD_MAX_OUTPUT_LEN max batch $BUILD_MAX_BATCH_SIZE" + bash run_simple_eval.sh $MODEL_NAME $SIMPLE_EVAL_TASKS $BUILD_MAX_OUTPUT_LEN $PORT | tee $SAVE_PATH/simple_eval.txt + echo "Simple eval results are saved under $SAVE_PATH/simple_eval.txt." fi popd @@ -387,78 +321,5 @@ if [[ $TASKS =~ "livecodebench" || $TASKS =~ "simple_eval" ]]; then kill $SERVE_PID fi -if [[ $TASKS =~ "benchmark" ]]; then - - if [ -z "$perf" ]; then - echo "!!!Warning: Not building tensorrt llm with optimized perf (e.g. context logits enabled). The benchmark result might be lower than optimal perf." - echo "Please rebuild the engine and not run accuracy evals where the context logits are needed (e.g. lm_eval)." - fi - - if [ "$PP" -ne 1 ]; then - echo "Benchmark does not work with multi PP. Please run the c++ benchmark in the TensorRT-LLM repo..." - exit 1 - fi - - BENCHMARK_RESULT=${ENGINE_DIR}/benchmark.txt - echo "Evaluating performance, result saved to $BENCHMARK_RESULT..." - - # Prepare datasets for TRT-LLM benchmark - if [ -z "$TRT_LLM_CODE_PATH" ]; then - TRT_LLM_CODE_PATH=/workspace/tensorrt_llm - echo "Setting default TRT_LLM_CODE_PATH to $TRT_LLM_CODE_PATH." 
- fi - - # Synthesize the tokenized benchmarking dataset - TRT_LLM_PREPARE_DATASET=$TRT_LLM_CODE_PATH/benchmarks/cpp/prepare_dataset.py - - # Align with the official benchmark - BENCHMARK_INPUT_LEN=$BUILD_MAX_INPUT_LEN - BENCHMARK_OUTPUT_LEN=$BUILD_MAX_OUTPUT_LEN - BENCHMARK_NUM_REQUESTS=256 - - DATASET_TXT=${SAVE_PATH}/synthetic_${BENCHMARK_INPUT_LEN}_${BENCHMARK_OUTPUT_LEN}_${BENCHMARK_NUM_REQUESTS}.txt - - if [ -z "$TRT_LLM_PREPARE_DATASET" ]; then - echo "Unable to prepare dataset for benchmarking. Please set TRT_LLM_CODE_PATH to the TRT-LLM code path." - else - if ! [ -f $DATASET_TXT ]; then - python $TRT_LLM_PREPARE_DATASET --stdout --tokenizer $MODEL_PATH token-norm-dist \ - --input-mean $BENCHMARK_INPUT_LEN --output-mean $BENCHMARK_OUTPUT_LEN --input-stdev 0 --output-stdev 0 \ - --num-requests $BENCHMARK_NUM_REQUESTS >$DATASET_TXT - else - echo "Use existing benchmark dataset in $DATASET_TXT." - fi - fi - - MODEL_ARGS="" - EXTRA_ARGS="" - if [ "$EXPORT_FORMAT" = "hf" ]; then - MODEL_ARGS="--model_path $ENGINE_DIR " - EXTRA_ARGS="--backend pytorch " - if [ "$TP" -ne 1 ]; then - EXTRA_ARGS+="--tp $TP " - fi - if [ "$PP" -ne 1 ]; then - EXTRA_ARGS+="--pp $PP " - fi - else - EXTRA_ARGS="--engine_dir $ENGINE_DIR " - fi - - if [ "$BUILD_MAX_BATCH_SIZE" -gt 1 ]; then - trtllm-bench --model $MODEL_PATH $MODEL_ARGS throughput $EXTRA_ARGS --dataset $DATASET_TXT | tee -a $BENCHMARK_RESULT - else - trtllm-bench --model $MODEL_PATH $MODEL_ARGS latency $EXTRA_ARGS --dataset $DATASET_TXT | tee -a $BENCHMARK_RESULT - fi - -fi - -if [ -n "$FREE_SPACE" ]; then - rm -f $SAVE_PATH/*.json - rm -f $SAVE_PATH/*.safetensors - rm -f $SAVE_PATH/*/*.json - rm -f $SAVE_PATH/*/*.engine - rm -f $SAVE_PATH/*/*.cache -fi popd diff --git a/examples/llm_ptq/scripts/parser.sh b/examples/llm_ptq/scripts/parser.sh index ea6f29a73..cd5b95466 100644 --- a/examples/llm_ptq/scripts/parser.sh +++ b/examples/llm_ptq/scripts/parser.sh @@ -18,21 +18,17 @@ # Define a function to parse command-line options parse_options() { # Default values - MODEL_TYPE="" MODEL_PATH="" QFORMAT="" KV_CACHE_QUANT="" TP=1 - CALIB_TP= PP=1 - GPUS=1 SPARSITY_FMT="dense" - EXPORT_FORMAT="tensorrt_llm" LM_EVAL_TASKS="mmlu,gsm8k" LM_EVAL_LIMIT= SIMPLE_EVAL_TASKS="mmlu" - TASKS="build" + TASKS="quant" TRUST_REMOTE_CODE=false KV_CACHE_FREE_GPU_MEMORY_FRACTION=0.8 @@ -40,32 +36,27 @@ parse_options() { USE_SEQ_DEVICE_MAP=false # Parse command-line options - ARGS=$(getopt -o "" -l "type:,model:,quant:,kv_cache_quant:,tp:,calib_tp:,pp:,sparsity:,awq_block_size:,calib:,calib_batch_size:,auto_quantize_bits:,input:,output:,batch:,tasks:,export_fmt:,lm_eval_tasks:,lm_eval_limit:,simple_eval_tasks:,trust_remote_code,use_seq_device_map,gpu_max_mem_percentage:,kv_cache_free_gpu_memory_fraction:,low_memory_mode,no-verbose,calib_dataset:" -n "$0" -- "$@") + ARGS=$(getopt -o "" -l "model:,quant:,kv_cache_quant:,tp:,pp:,sparsity:,awq_block_size:,calib:,calib_batch_size:,auto_quantize_bits:,output:,batch:,tasks:,lm_eval_tasks:,lm_eval_limit:,simple_eval_tasks:,trust_remote_code,use_seq_device_map,gpu_max_mem_percentage:,kv_cache_free_gpu_memory_fraction:,low_memory_mode,no-verbose,calib_dataset:" -n "$0" -- "$@") eval set -- "$ARGS" while true; do case "$1" in - --type ) MODEL_TYPE="$2"; shift 2;; --model ) MODEL_PATH="$2"; shift 2;; --quant ) QFORMAT="$2"; shift 2;; --kv_cache_quant ) KV_CACHE_QUANT="$2"; shift 2;; --tp ) TP="$2"; shift 2;; - --calib_tp ) CALIB_TP="$2"; shift 2;; --pp ) PP="$2"; shift 2;; --sparsity ) SPARSITY_FMT="$2"; shift 2;; --awq_block_size ) 
AWQ_BLOCK_SIZE="$2"; shift 2;; --calib ) CALIB_SIZE="$2"; shift 2;; --calib_batch_size ) CALIB_BATCH_SIZE="$2"; shift 2;; --auto_quantize_bits ) AUTO_QUANTIZE_BITS="$2"; shift 2;; - --input ) BUILD_MAX_INPUT_LEN="$2"; shift 2;; --output ) BUILD_MAX_OUTPUT_LEN="$2"; shift 2;; --batch ) BUILD_MAX_BATCH_SIZE="$2"; shift 2;; --tasks ) TASKS="$2"; shift 2;; - --export_fmt ) EXPORT_FORMAT="$2"; shift 2;; --lm_eval_tasks ) LM_EVAL_TASKS="$2"; shift 2;; --lm_eval_limit ) LM_EVAL_LIMIT="$2"; shift 2;; --simple_eval_tasks ) SIMPLE_EVAL_TASKS="$2"; shift 2;; - --num_samples ) NUM_SAMPLES="$2"; shift 2;; --trust_remote_code ) TRUST_REMOTE_CODE=true; shift;; --use_seq_device_map ) USE_SEQ_DEVICE_MAP=true; shift;; --gpu_max_mem_percentage ) GPU_MAX_MEM_PERCENTAGE="$2"; shift 2;; @@ -80,7 +71,6 @@ parse_options() { DEFAULT_CALIB_SIZE=512 DEFAULT_CALIB_BATCH_SIZE=0 - DEFAULT_BUILD_MAX_INPUT_LEN=4096 DEFAULT_BUILD_MAX_OUTPUT_LEN=1024 DEFAULT_BUILD_MAX_BATCH_SIZE=2 @@ -90,9 +80,6 @@ parse_options() { if [ -z "$CALIB_BATCH_SIZE" ]; then CALIB_BATCH_SIZE=$DEFAULT_CALIB_BATCH_SIZE fi - if [ -z "$BUILD_MAX_INPUT_LEN" ]; then - BUILD_MAX_INPUT_LEN=$DEFAULT_BUILD_MAX_INPUT_LEN - fi if [ -z "$BUILD_MAX_OUTPUT_LEN" ]; then BUILD_MAX_OUTPUT_LEN=$DEFAULT_BUILD_MAX_OUTPUT_LEN fi @@ -103,12 +90,11 @@ parse_options() { # Verify required options are provided if [ -z "$MODEL_PATH" ] || [ -z "$QFORMAT" ] || [ -z "$TASKS" ]; then echo "Usage: $0 --model= --quant= --tasks=" - echo "Optional args: --tp= --pp= --sparsity= --awq_block_size= --calib=" - echo "Optional args for NeMo: --type= --calib_tp=" + echo "Optional args: --sparsity= --awq_block_size= --calib=" exit 1 fi - VALID_TASKS=("build" "mmlu" "mtbench" "benchmark" "lm_eval" "gqa" "livecodebench" "simple_eval") + VALID_TASKS=("quant" "mmlu" "mtbench" "lm_eval" "livecodebench" "simple_eval") for task in $(echo "$TASKS" | tr ',' ' '); do is_valid_task=false @@ -126,8 +112,6 @@ parse_options() { fi done - GPUS=$(($TP*$PP)) - # Make sparsity and int4 quantization mutually exclusive as it does not brings speedup if [[ "$SPARSITY_FMT" = "sparsegpt" || "$SPARSITY_FMT" = "sparse_magnitude" ]]; then if [[ "$QFORMAT" == *"awq"* ]]; then @@ -138,13 +122,10 @@ parse_options() { # Now you can use the variables $GPU, $MODEL, and $TASKS in your script echo "=================" - echo "type: $MODEL_TYPE" echo "model: $MODEL_PATH" echo "quant: $QFORMAT" - echo "tp: $TP" - echo "calib_tp: $CALIB_TP" - echo "pp: $PP" - echo "gpus: $GPUS" + echo "tp (TensorRT-LLM Checkpoint only): $TP" + echo "pp (TensorRT-LLM Checkpoint only): $PP" echo "sparsity: $SPARSITY_FMT" echo "awq_block_size: $AWQ_BLOCK_SIZE" echo "calib: $CALIB_SIZE" @@ -154,7 +135,6 @@ parse_options() { echo "output: $BUILD_MAX_OUTPUT_LEN" echo "batch: $BUILD_MAX_BATCH_SIZE" echo "tasks: $TASKS" - echo "export_fmt: $EXPORT_FORMAT" echo "lm_eval_tasks: $LM_EVAL_TASKS" echo "lm_eval_limit: $LM_EVAL_LIMIT" echo "simple_eval_tasks: $SIMPLE_EVAL_TASKS" diff --git a/examples/llm_qat/notebooks/QAT_QAD_Walkthrough.ipynb b/examples/llm_qat/notebooks/QAT_QAD_Walkthrough.ipynb index bc271325b..0c292563c 100644 --- a/examples/llm_qat/notebooks/QAT_QAD_Walkthrough.ipynb +++ b/examples/llm_qat/notebooks/QAT_QAD_Walkthrough.ipynb @@ -691,7 +691,7 @@ "\n", "# run conversion script\n", "cd ..\n", - "bash TensorRT-Model-Optimizer/examples/llm_ptq/scripts/huggingface_example.sh --model $(pwd)/qat/checkpoint-450/ --quant nvfp4 --export_fmt hf" + "bash TensorRT-Model-Optimizer/examples/llm_ptq/scripts/huggingface_example.sh --model 
$(pwd)/qat/checkpoint-450/ --quant nvfp4" ] }, { diff --git a/examples/llm_sparsity/README.md b/examples/llm_sparsity/README.md index 5d1f8f294..e7b8b30e0 100644 --- a/examples/llm_sparsity/README.md +++ b/examples/llm_sparsity/README.md @@ -148,5 +148,4 @@ python export_trtllm_ckpt.py --model_name_or_path meta-llama/Llama-2-7b-hf \ ## Build TensorRT-LLM Engine -For guidance on how to build TensorRT-LLM engines, please refer to [link](../llm_ptq/README.md#TensorRT-LLM-Engine-Build). -To validate the built TensorRT-LLM engines, please follow the instructions at [link](../llm_ptq/README.md#TensorRT-LLM-Engine-Validation). +For guidance on how to build TensorRT-LLM engines, please refer to [link](https://nvidia.github.io/TensorRT-LLM/commands/trtllm-build.html#trtllm-build) and use the `--weight_sparsity` flag. diff --git a/examples/vlm_eval/README.md b/examples/vlm_eval/README.md deleted file mode 100644 index e093d79f6..000000000 --- a/examples/vlm_eval/README.md +++ /dev/null @@ -1,42 +0,0 @@ -# Evaluation scripts for VLM tasks - -This folder includes popular 3rd-party VLM benchmarks for VLM accuracy evaluation. - -The following instructions show how to evaluate the VLM (including Model Optimizer quantized LLM) with the benchmarks, including the TensorRT-LLM deployment. - -## GQA - -[GQA: a dataset for real-world visual reasoning and compositional question answering](https://arxiv.org/abs/1902.09506). Upon completing the benchmark, the model's accuracy (in percentage format) will be displayed, providing a clear metric for performance evaluation. - -First log in to Hugging Face account with your token. - -```bash -huggingface-cli login -``` - -### Baseline - -```bash -bash gqa.sh --hf_model -``` - -### Quantized (simulated) - -```bash -# MODELOPT_QUANT_CFG: Choose from [INT8_SMOOTHQUANT_CFG|FP8_DEFAULT_CFG|INT4_AWQ_CFG|W4A8_AWQ_BETA_CFG] -bash gqa.sh --hf_model --quant_cfg MODELOPT_QUANT_CFG -``` - -### Evaluate the TensorRT-LLM engine - -TensorRT engine could be built following this [guide](../vlm_ptq/README.md) - -```bash -bash gqa.sh --hf_model --visual_engine --llm_engine -``` - -If you encounter Out of Memory (OOM) issues during evaluation, you can try lowering the `--kv_cache_free_gpu_memory_fraction` argument (default is 0.8) to reduce GPU memory usage for kv_cache: - -```bash -bash gqa.sh --hf_model --visual_engine --llm_engine --kv_cache_free_gpu_memory_fraction 0.5 -``` diff --git a/examples/vlm_eval/convert_gqa_for_eval.py b/examples/vlm_eval/convert_gqa_for_eval.py deleted file mode 100644 index 357f8a620..000000000 --- a/examples/vlm_eval/convert_gqa_for_eval.py +++ /dev/null @@ -1,35 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# Adapted from https://github.com/NVlabs/VILA/blob/ec7fb2c264920bf004fd9fa37f1ec36ea0942db5/scripts/convert_gqa_for_eval.py - - -import argparse -import json - -parser = argparse.ArgumentParser() -parser.add_argument("--src", type=str) -parser.add_argument("--dst", type=str) -args = parser.parse_args() - -all_answers = [] -for line_idx, line in enumerate(open(args.src)): - res = json.loads(line) - question_id = res["question_id"] - text = res["text"].rstrip(".").lower() - all_answers.append({"questionId": question_id, "prediction": text}) - -with open(args.dst, "w") as f: - json.dump(all_answers, f) diff --git a/examples/vlm_eval/gqa.sh b/examples/vlm_eval/gqa.sh deleted file mode 100644 index 5cd977a45..000000000 --- a/examples/vlm_eval/gqa.sh +++ /dev/null @@ -1,147 +0,0 @@ -#!/bin/bash -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -set -e - -# Download dataset -script_dir="$(dirname "$(readlink -f "$0")")" - -EVAL_FILE="$script_dir/eval.py" -if [ ! -f $EVAL_FILE ]; then - echo "$EVAL_FILE does not exist. Downloading this file from https://nlp.stanford.edu/data/gqa/eval.zip." - wget https://nlp.stanford.edu/data/gqa/eval.zip - unzip eval.zip "eval.py" -d . - rm eval.zip - - # Changes to eval.py due to the missing assets in GQA v1.2 release - sed -i '77s/{tier}_all_questions.json/{tier}_questions.json/' "$EVAL_FILE" - sed -i '119,120s/^/# /' "$EVAL_FILE" - sed -i '126,128s/^/# /' "$EVAL_FILE" - sed -i '367,373s/^/# /' "$EVAL_FILE" - sed -i '376,379s/^/# /' "$EVAL_FILE" - sed -i '388s/^/# /' "$EVAL_FILE" -fi - -gqa_data=$script_dir/gqa/data -QUESTION=$gqa_data/testdev_balanced_questions.json -if [ ! -f $QUESTION ]; then - echo "$QUESTION does not exist. Downloading this file from https://downloads.cs.stanford.edu/nlp/data/gqa/questions1.2.zip." 
- wget -P $gqa_data https://downloads.cs.stanford.edu/nlp/data/gqa/questions1.2.zip - unzip $gqa_data/questions1.2.zip "testdev_balanced_questions.json" -d $gqa_data - rm $gqa_data/questions1.2.zip -fi - -# Parse command-line arguments -while [[ $# -gt 0 ]]; do - case "$1" in - --hf_model) - HF_MODEL_DIR="$2" - shift 2 - ;; - --engine_dir) - ENGINE_DIR="$2" - shift 2 - ;; - --batch_size) - BATCH_SIZE="$2" - shift 2 - ;; - --quant_cfg) - QUANT_CFG="$2" - shift 2 - ;; - --kv_cache_free_gpu_memory_fraction) - KV_CACHE_FREE_GPU_MEMORY_FRACTION="$2" - shift 2 - ;; - *) - echo "Unknown option $1" - exit 1 - ;; - esac -done - -# Set default value for kv_cache_free_gpu_memory_fraction if not provided -if [ -z "$KV_CACHE_FREE_GPU_MEMORY_FRACTION" ]; then - KV_CACHE_FREE_GPU_MEMORY_FRACTION=0.8 -fi - -# Verify required arguments are set -if [ -z "$HF_MODEL_DIR" ]; then - echo "Error: Missing required argument --hf_model" - exit 1 -fi - -MODEL_NAME=$(basename $HF_MODEL_DIR | sed 's/[^0-9a-zA-Z\-]/_/g' | tr 'A-Z' 'a-z') - -if [[ "$MODEL_NAME" == *"vila"* ]] && [[ -z "$ENGINE_DIR" ]]; then - # Install required dependency for VILA - pip install -r requirements-vila.txt - # Clone original VILA repo - if [ ! -d "$(dirname "$HF_MODEL_DIR")/VILA" ]; then - echo "VILA repository is needed until it is added to HF model zoo. Cloning the repository parallel to $HF_MODEL_DIR..." - git clone https://github.com/Efficient-Large-Model/VILA.git "$(dirname "$HF_MODEL_DIR")/VILA" && \ - cd "$(dirname "$HF_MODEL_DIR")/VILA" && \ - git checkout ec7fb2c264920bf004fd9fa37f1ec36ea0942db5 && \ - cd - - fi -fi - -# Set batch size defaulted to 20 for VILA and Llava -if [[ -z "$BATCH_SIZE" && ("$MODEL_NAME" == *"vila"* || "$MODEL_NAME" == *"llava"*) ]]; then - BATCH_SIZE=20 -fi - -# Check if TRT engine is provided -if [ -z "$ENGINE_DIR" ]; then - echo "ENGINE_DIR not provided, evaluation will be based on Pytorch." - if [ -z "$QUANT_CFG" ]; then - ANSWER_DIR="$script_dir/gqa/$MODEL_NAME/llava_gqa_testdev_balanced/answers" - ANSWERS_FILE="$ANSWER_DIR/merge.jsonl" - else - ANSWER_DIR="$script_dir/gqa/${MODEL_NAME}_${QUANT_CFG}/llava_gqa_testdev_balanced/answers" - ANSWERS_FILE="$ANSWER_DIR/merge.jsonl" - fi -else - echo "Both --visual_engine or --llm_engine are provided, evaluation will be based on TRT engine." - ANSWER_DIR="$script_dir/gqa/$(basename $ENGINE_DIR)/llava_gqa_testdev_balanced/answers" - ANSWERS_FILE="$ANSWER_DIR/merge.jsonl" -fi - -# Run the Python script with the parsed arguments -if [ ! -f $ANSWERS_FILE ]; then - python model_gqa_loader.py \ - --answers_file "$ANSWERS_FILE" \ - --hf_model_dir "$HF_MODEL_DIR" \ - ${ENGINE_DIR:+--engine_dir "$ENGINE_DIR"} \ - ${BATCH_SIZE:+--batch_size "$BATCH_SIZE"} \ - ${QUANT_CFG:+--quant_cfg "$QUANT_CFG"} \ - --kv_cache_free_gpu_memory_fraction "$KV_CACHE_FREE_GPU_MEMORY_FRACTION" -fi - -# Convert answer to prediction for evaluation -PREDICTION_FILE="$ANSWER_DIR/testdev_balanced_predictions.json" -if [ ! -f $PREDICTION_FILE ]; then - python convert_gqa_for_eval.py \ - --src $ANSWERS_FILE \ - --dst $PREDICTION_FILE -fi - -# Get evaluation result -python eval.py \ - --tier "$gqa_data/testdev_balanced" \ - --predictions $PREDICTION_FILE diff --git a/examples/vlm_eval/model_gqa_loader.py b/examples/vlm_eval/model_gqa_loader.py deleted file mode 100644 index 706d0a02a..000000000 --- a/examples/vlm_eval/model_gqa_loader.py +++ /dev/null @@ -1,325 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import copy -import os -import sys -import time -from pathlib import Path - -from datasets import load_dataset -from tensorrt_llm import logger -from tensorrt_llm.runtime import MultimodalModelRunner -from tqdm import tqdm -from transformers import ( - AutoModelForCausalLM, - AutoProcessor, - GenerationConfig, - LlavaForConditionalGeneration, - MllamaForConditionalGeneration, -) - -import modelopt.torch.quantization as mtq -from modelopt.torch.utils.dataset_utils import ( - create_forward_loop, - get_dataset_dataloader, - get_max_batch_size, -) -from modelopt.torch.utils.image_processor import MllamaImageProcessor -from modelopt.torch.utils.vlm_dataset_utils import get_vlm_dataset_dataloader - -sys.path.append(str(Path(__file__).resolve().parent / "../llm_ptq")) -sys.path.append(str(Path(__file__).resolve().parent / "../vlm_ptq")) -from example_utils import get_processor, get_tokenizer -from utils import add_common_args -from vlm_eval_utils import save_jsonl - - -def quantize_model(model, args, tokenizer, processor=None): - sample_memory_usage_ratio = ( - 2 if "AWQ" in args.quant_cfg or "SMOOTHQUANT" in args.quant_cfg else 1.1 - ) - batch_size = get_max_batch_size(model, sample_memory_usage_ratio=sample_memory_usage_ratio) - calib_size = args.calib_size - batch_size = min(batch_size, calib_size) - - # Handle Mllama models with VLM dataset - if processor is not None and isinstance(processor, MllamaImageProcessor): - calib_dataloader = get_vlm_dataset_dataloader( - dataset_name="scienceqa", # Default dataset for Mllama - processor=processor, - batch_size=batch_size, - num_samples=calib_size, - ) - else: - calib_dataloader = get_dataset_dataloader( - dataset_name="cnn_dailymail", - tokenizer=tokenizer, - batch_size=batch_size, - num_samples=calib_size, - device=model.device, - ) - calibrate_loop = create_forward_loop(dataloader=calib_dataloader) - - quant_cfg = getattr(mtq, args.quant_cfg) - if "AWQ" in args.quant_cfg: - quant_cfg = copy.deepcopy(getattr(mtq, args.quant_cfg)) - weight_quantizer = quant_cfg["quant_cfg"]["*weight_quantizer"] - if isinstance(weight_quantizer, list): - weight_quantizer = weight_quantizer[0] - enable_quant_kv_cache = args.quant_cfg not in ["INT8_SMOOTHQUANT_CFG"] - print(f"{'Enable' if enable_quant_kv_cache else 'Disable'} KV cache quantization") - quant_cfg["quant_cfg"]["*output_quantizer"] = { - "num_bits": 8 if args.quant_cfg == "INT8_SMOOTHQUANT_CFG" else (4, 3), - "axis": None, - "enable": enable_quant_kv_cache, - } - - print("Starting quantization...") - start_time = time.time() - model = mtq.quantize(model, quant_cfg, forward_loop=calibrate_loop) - end_time = time.time() - print(f"Quantization done. 
Total time used: {end_time - start_time}s") - return model - - -def main(): - parser = argparse.ArgumentParser() - parser = add_common_args(parser) - parser.add_argument("--answers_file", type=str, required=True) - parser.add_argument( - "--quant_cfg", - type=str, - default=None, - help="Specify the modelopt quantization configuration for simulated evaluation", - choices=[ - "INT8_SMOOTHQUANT_CFG", - "FP8_DEFAULT_CFG", - "INT4_AWQ_CFG", - "W4A8_AWQ_BETA_CFG", - ], - ) - parser.add_argument( - "--calib_size", type=int, default=512, help="Number of samples for calibration." - ) - parser.add_argument( - "--trust_remote_code", - help="Set trust_remote_code for Huggingface models and tokenizers", - default=False, - action="store_true", - ) - args = parser.parse_args() - - # Load data - instances = load_dataset( - "lmms-lab/GQA", "testdev_balanced_instructions", split="testdev", token=True - ) - images = load_dataset("lmms-lab/GQA", "testdev_balanced_images", split="testdev", token=True) - id2image = {} - for row in images: - id2image[row["id"]] = row["image"].convert("RGB") - - # Load model - if args.engine_dir is not None: - os.environ["TOKENIZERS_PARALLELISM"] = "false" - logger.set_level(args.log_level) - # Load TensorRT engine - model = MultimodalModelRunner(args) - # Run batch inference - outputs = [] - batch_size = args.batch_size - if model.model_type in ["phi-3-vision"]: - # Phi-3-vision doesn't support batch inference for now - batch_size = 1 - for index in tqdm(range(0, len(instances), batch_size)): - batch = instances[index : index + batch_size] - raw_images = [id2image[imageId] for imageId in batch["imageId"]] - questions = batch["question"] - questions = [ - q + "\nAnswer the question using a single word or phrase." for q in questions - ] - if model.model_type in ["llava"]: - input_text = ["\n" + question for question in questions] - elif model.model_type in ["vila"]: - input_text = ["\n" + question for question in questions] - elif model.model_type in ["phi-3-vision"]: - input_text = questions[0] - elif model.model_type in ["mllama"]: - input_text = ["<|image|><|begin_of_text|>" + question for question in questions] - _, output_text = model.run(input_text, raw_images, None, args.max_new_tokens) - outputs.extend( - [ - { - "question_id": id, - "prompt": batch["question"][index], - "text": output_text[index][0], - } - for index, id in enumerate(batch["id"]) - ] - ) - - else: - # Load HF model - if "vila" in args.hf_model_dir.lower(): - sys.path.append(os.path.join(args.hf_model_dir, "..", "VILA")) - import llava - - model = llava.load(args.hf_model_dir) - from llava import conversation as conversation_lib - - if "8b" in args.hf_model_dir.lower(): - conv_mode = "llama_3" - elif "40b" in args.hf_model_dir.lower(): - conv_mode = "hermes-2" - else: - conv_mode = "vicuna_v1" - - conversation_lib.default_conversation = conversation_lib.conv_templates[ - conv_mode - ].copy() - - generation_config = GenerationConfig.from_pretrained(args.hf_model_dir + "/llm") - generation_config.update(max_new_tokens=args.max_new_tokens) - elif "llama" in args.hf_model_dir.lower(): - model = MllamaForConditionalGeneration.from_pretrained( - args.hf_model_dir, - device_map="auto", - trust_remote_code=args.trust_remote_code, - torch_dtype="auto", - ) - # processor = AutoProcessor.from_pretrained(args.hf_model_dir) - processor = get_processor( - args.hf_model_dir, "mllama", model.device, trust_remote_code=args.trust_remote_code - ) - - else: - processor = AutoProcessor.from_pretrained( - args.hf_model_dir, 
trust_remote_code=args.trust_remote_code - ) - if "llava" in args.hf_model_dir.lower(): - model = LlavaForConditionalGeneration.from_pretrained( - args.hf_model_dir, device_map="auto", torch_dtype="auto" - ) - # To be deprecated for new version transformers - processor.patch_size = model.config.vision_config.patch_size - processor.vision_feature_select_strategy = ( - model.config.vision_feature_select_strategy - ) - elif "phi" in args.hf_model_dir.lower(): - model = AutoModelForCausalLM.from_pretrained( - args.hf_model_dir, - device_map="auto", - trust_remote_code=args.trust_remote_code, - torch_dtype="auto", - _attn_implementation="flash_attention_2", - ) - else: - raise ValueError(f"Unsupported model: {args.hf_model_dir}") - # Evaluation for simulated quantization - if args.quant_cfg: - tokenizer = get_tokenizer(args.hf_model_dir, trust_remote_code=args.trust_remote_code) - if "vila" in args.hf_model_dir.lower(): - model.llm = quantize_model(model.llm, args, tokenizer) - elif "llava" in args.hf_model_dir.lower(): - model.language_model = quantize_model(model.language_model, args, tokenizer) - elif "phi" in args.hf_model_dir.lower(): - model = quantize_model(model, args, tokenizer) - elif "llama" in args.hf_model_dir.lower(): - model = quantize_model(model, args, tokenizer, processor) - else: - raise ValueError(f"Unsupported model: {args.hf_model_dir}") - if "llama" in args.hf_model_dir.lower(): - processor = processor.tokenizer - - outputs = [] - for instance in tqdm(instances): - image = id2image[instance["imageId"]] - question = instance["question"] - if "vila" in args.hf_model_dir.lower(): - response = model.generate_content( - [image, question], generation_config=generation_config - ) - else: - if "llava" in args.hf_model_dir.lower(): - conversation = [ - { - "role": "user", - "content": [ - {"type": "text", "text": question}, - {"type": "image"}, - ], - }, - ] - prompt = processor.apply_chat_template(conversation, add_generation_prompt=True) - inputs = processor(images=image, text=prompt, return_tensors="pt").to( - "cuda:0", model.dtype - ) - response = model.generate( - **inputs, max_new_tokens=args.max_new_tokens, do_sample=False - ) - elif "phi" in args.hf_model_dir.lower(): - conversation = [ - {"role": "user", "content": f"<|image_1|>\n{question}"}, - ] - prompt = processor.tokenizer.apply_chat_template( - conversation, tokenize=False, add_generation_prompt=True - ) - inputs = processor(images=image, text=prompt, return_tensors="pt").to( - "cuda:0", model.dtype - ) - response = model.generate( - **inputs, - eos_token_id=processor.tokenizer.eos_token_id, - max_new_tokens=args.max_new_tokens, - do_sample=False, - ) - elif "llama" in args.hf_model_dir.lower(): - conversation = [ - { - "role": "user", - "content": [ - {"type": "image"}, - { - "type": "text", - "text": question - + "\nAnswer the question using a single word or phrase.", - }, - ], - } - ] - prompt = processor.apply_chat_template( - conversation, tokenize=False, add_generation_prompt=True - ) - - inputs = processor(image, prompt, return_tensors="pt").to("cuda:0", model.dtype) - response = model.generate( - **inputs, - eos_token_id=processor.tokenizer.eos_token_id, - max_new_tokens=args.max_new_tokens, - do_sample=False, - ) - else: - raise ValueError(f"Unsupported model: {args.hf_model_dir}") - response = processor.decode( - response[0][inputs["input_ids"].shape[-1] :], skip_special_tokens=True - ) - - outputs.append({"question_id": instance["id"], "prompt": question, "text": response}) - save_jsonl(args.answers_file, 
outputs) - - -if __name__ == "__main__": - main() diff --git a/examples/vlm_eval/requirements-vila.txt b/examples/vlm_eval/requirements-vila.txt deleted file mode 100644 index d88b11866..000000000 --- a/examples/vlm_eval/requirements-vila.txt +++ /dev/null @@ -1,2 +0,0 @@ -deepspeed>=0.16.0 -git+https://github.com/bfshi/scaling_on_scales.git diff --git a/examples/vlm_eval/vlm_eval_utils.py b/examples/vlm_eval/vlm_eval_utils.py deleted file mode 100644 index fdc848aec..000000000 --- a/examples/vlm_eval/vlm_eval_utils.py +++ /dev/null @@ -1,44 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import json -import os -from collections.abc import Iterator -from contextlib import contextmanager -from io import TextIOWrapper -from typing import IO, Any, TextIO - - -@contextmanager -def file_descriptor(f: str | IO, mode: str = "r") -> Iterator[IO]: - opened = False - try: - if isinstance(f, str): - f = open(f, mode) - opened = True - yield f - finally: - if opened: - assert isinstance(f, TextIOWrapper), type(f) - f.close() - - -def save_jsonl(f: str | TextIO, obj: Any, **kwargs) -> None: - assert isinstance(f, str), type(f) - os.makedirs(os.path.dirname(f), exist_ok=True) - - with file_descriptor(f, mode="w") as fd: - fd.write("\n".join(json.dumps(datum, **kwargs) for datum in obj)) diff --git a/examples/vlm_ptq/README.md b/examples/vlm_ptq/README.md index 1f51b950d..e2d79cec5 100644 --- a/examples/vlm_ptq/README.md +++ b/examples/vlm_ptq/README.md @@ -81,7 +81,7 @@ For [Qwen2.5-VL](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct): ```bash git clone https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct -scripts/huggingface_example.sh --type qwen --model Qwen2.5-VL-7B-Instruct --export_fmt hf --quant [fp8|nvfp4|int8_sq|int4_awq|w4a8_awq] +scripts/huggingface_example.sh --type qwen --model Qwen2.5-VL-7B-Instruct --quant [fp8|nvfp4|int8_sq|int4_awq|w4a8_awq] ``` The example scripts above also have an additional flag `--tasks gqa`, which will trigger evaluation of the built TensorRT engine using GQA benchmark. Details of the evaluation is explained in this [tutorial](../vlm_eval/README.md). 
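
With this change the VLM PTQ flow stops building a TensorRT engine and instead exports a unified Hugging Face checkpoint, so deployment goes through the TensorRT-LLM PyTorch runtime. As a rough sketch of what follows the command above (the checkpoint directory name is illustrative and assumes the script's default naming; `TRT_LLM_CODE_PATH` must point at a TensorRT-LLM source tree):

```bash
# Sketch only: run TensorRT-LLM's multimodal quickstart against the exported FP8 checkpoint.
# The save path below is an assumed example of the script's default naming scheme.
python3 $TRT_LLM_CODE_PATH/examples/llm-api/quickstart_multimodal.py \
    --model_dir ./saved_models_Qwen2_5-VL-7B-Instruct_fp8 \
    --modality image
```
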
diff --git a/examples/vlm_ptq/scripts/huggingface_example.sh b/examples/vlm_ptq/scripts/huggingface_example.sh index 9bab141d6..69e2dce9e 100755 --- a/examples/vlm_ptq/scripts/huggingface_example.sh +++ b/examples/vlm_ptq/scripts/huggingface_example.sh @@ -29,77 +29,19 @@ for i in $(env | grep ^SLURM_ | cut -d"=" -f 1); do unset -v $i; done for i in $(env | grep ^PMI_ | cut -d"=" -f 1); do unset -v $i; done for i in $(env | grep ^PMIX_ | cut -d"=" -f 1); do unset -v $i; done -case $MODEL_TYPE in - llava|phi|vila|mllama|qwen) - ;; - *) - echo "Unsupported type argument: Expected one of: [llava, phi, vila, mllama, qwen]" >&2 - exit 1 -esac - if [ -z "$MODEL_PATH" ]; then echo "Unsupported model argument: Expected a huggingface model path or model name or a nemo path" >&2 exit 1 fi -# Check if ENABLE_SPARSITY environment variable is set to "true" -if [ "$SPARSITY_FMT" = "dense" ]; then - ENABLE_SPARSITY=false -else - ENABLE_SPARSITY=true -fi - -case $SPARSITY_FMT in - dense|sparsegpt) - ;; - *) - echo "Unknown sparsity argument: Expected one of: [dense, sparsegpt]" >&2 - exit 1 -esac - case $QFORMAT in - fp8|nvfp4|int8_sq|int4_awq|w4a8_awq|fp16|bf16) - ;; - *) - echo "Unknown quant argument: Expected one of: [fp8, nvfp4, int8_sq, int4_awq, w4a8_awq, fp16, bf16]" >&2 - exit 1 -esac - -case $TP in - 1|2|4|8) - ;; - *) - echo "Unknown tp argument: Expected one of: [1, 2, 4, 8]" >&2 - exit 1 -esac - -case $PP in - 1|2|4|8) + fp8|int4_awq|w4a8_awq|nvfp4) ;; *) - echo "Unknown pp argument: Expected one of: [1, 2, 4, 8]" >&2 + echo "Unknown quant argument: Expected one of: [fp8, int4_awq, w4a8_awq, nvfp4]" >&2 exit 1 esac -GPU_NAME=$(nvidia-smi --id 0 --query-gpu=name --format=csv,noheader,nounits | sed 's/ /_/g') - -if [ "${MODEL_TYPE}" = "phi" ]; then - BUILD_MAX_INPUT_LEN=4096 -else - BUILD_MAX_INPUT_LEN=1024 -fi - -BUILD_MAX_OUTPUT_LEN=512 - -if [ "$MODEL_TYPE" = "llava" ] || [ "$MODEL_TYPE" = "vila" ] || [ "$MODEL_TYPE" = "qwen" ]; then - BUILD_MAX_BATCH_SIZE=20 -else - BUILD_MAX_BATCH_SIZE=4 -fi - - -echo "Using the following config: max input $BUILD_MAX_INPUT_LEN max output $BUILD_MAX_OUTPUT_LEN max batch $BUILD_MAX_BATCH_SIZE" - script_dir="$(dirname "$(readlink -f "$0")")" pushd $script_dir/.. 
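
For serving rather than a one-off generation check, the same exported checkpoint can also be handed to `trtllm-serve`, mirroring what the LLM PTQ script does for its livecodebench/simple_eval tasks. A sketch, with the caveat that multimodal support in `trtllm-serve` depends on the installed TensorRT-LLM version, and the path and port below are placeholders:

```bash
# Sketch only: launch the TensorRT-LLM server on the quantized checkpoint and wait for startup.
trtllm-serve ./saved_models_Qwen2_5-VL-7B-Instruct_fp8 --host 0.0.0.0 --port 8000 > serve.log 2>&1 &
# The server is ready once "Application startup complete" appears in serve.log.
```
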
@@ -108,15 +50,10 @@ if [ -z "$ROOT_SAVE_PATH" ]; then ROOT_SAVE_PATH=$(pwd) fi -MODEL_NAME=$(basename $MODEL_PATH | sed 's/[^0-9a-zA-Z\-]/_/g') -SAVE_PATH=${ROOT_SAVE_PATH}/saved_models_${MODEL_NAME}_${SPARSITY_FMT}_${QFORMAT}_tp${TP}_pp${PP} - -if [ $EXPORT_FORMAT != "tensorrt_llm" ]; then - SAVE_PATH=${SAVE_PATH}_${EXPORT_FORMAT} -fi +MODEL_NAME=$(basename $MODEL_PATH | sed 's/[^0-9a-zA-Z\-]/_/g')_${QFORMAT}${KV_CACHE_QUANT:+_kv_${KV_CACHE_QUANT}} +SAVE_PATH=${ROOT_SAVE_PATH}/saved_models_${MODEL_NAME} MODEL_CONFIG=${SAVE_PATH}/config.json -ENGINE_DIR=${SAVE_PATH}/${MODEL_TYPE}_${TP}x${PP}x${GPU_NAME}_input${BUILD_MAX_INPUT_LEN}_output${BUILD_MAX_OUTPUT_LEN}_batch${BUILD_MAX_BATCH_SIZE}_engine if [ "${REMOVE_EXISTING_MODEL_CONFIG,,}" = "true" ]; then rm -f $MODEL_CONFIG @@ -132,27 +69,9 @@ if $TRUST_REMOTE_CODE; then PTQ_ARGS+=" --trust_remote_code " fi -case "${MODEL_TYPE}" in - "vila") - VISUAL_FEATURE=196 - VLM_ARGS=" --max_multimodal_len=$((BUILD_MAX_BATCH_SIZE * VISUAL_FEATURE)) " - ;; - "phi") - VISUAL_FEATURE=4096 - VLM_ARGS=" --max_multimodal_len=$((BUILD_MAX_BATCH_SIZE * VISUAL_FEATURE)) " - ;; - "llava") - VISUAL_FEATURE=576 - VLM_ARGS=" --max_multimodal_len=$((BUILD_MAX_BATCH_SIZE * VISUAL_FEATURE)) " - ;; - "mllama") - PTQ_ARGS+=" --kv_cache_qformat none " - VLM_ARGS=" --max_encoder_input_len=6404 --skip_run" - ;; - "qwen") - PTQ_ARGS+=" --kv_cache_qformat none " - ;; -esac +if [ -n "$KV_CACHE_QUANT" ]; then + PTQ_ARGS+=" --kv_cache_qformat=$KV_CACHE_QUANT " +fi if [ "${MODEL_TYPE}" = "vila" ]; then # Install required dependency for VILA @@ -167,102 +86,47 @@ if [ "${MODEL_TYPE}" = "vila" ]; then fi fi -if [[ $TASKS =~ "build" ]] || [[ ! -d "$ENGINE_DIR" ]] || [[ ! $(ls -A $ENGINE_DIR) ]]; then +if [[ $TASKS =~ "quant" ]] || [[ ! -d "$SAVE_PATH" ]] || [[ ! $(ls -A $SAVE_PATH) ]]; then if ! [ -f $MODEL_CONFIG ]; then echo "Quantizing original model..." python ../llm_ptq/hf_ptq.py \ --pyt_ckpt_path=$MODEL_PATH \ --export_path=$SAVE_PATH \ - --sparsity_fmt=$SPARSITY_FMT \ --qformat=$QFORMAT \ --calib_size=$CALIB_SIZE \ --batch_size=$CALIB_BATCH_SIZE \ - --inference_tensor_parallel=$TP \ - --inference_pipeline_parallel=$PP \ - --export_fmt=$EXPORT_FORMAT \ - --no-verbose \ $PTQ_ARGS else echo "Quantized model config $MODEL_CONFIG exists, skipping the quantization stage" fi - - if [ $EXPORT_FORMAT != "tensorrt_llm" ]; then - echo "Please continue deployment with $EXPORT_FORMAT. Checkpoint export_path: $SAVE_PATH" - exit 0 - fi - - - echo "Building tensorrt_llm engine from Model Optimizer-quantized model..." - - python ../llm_ptq/modelopt_to_tensorrt_llm.py \ - --model_config=$MODEL_CONFIG \ - --engine_dir=${ENGINE_DIR}/llm \ - --tokenizer=$MODEL_PATH \ - --max_input_len=$BUILD_MAX_INPUT_LEN \ - --max_output_len=$BUILD_MAX_OUTPUT_LEN \ - --max_batch_size=$BUILD_MAX_BATCH_SIZE \ - --num_build_workers=$GPUS \ - --enable_sparsity=$ENABLE_SPARSITY \ - $VLM_ARGS fi +if [[ "$QFORMAT" != "fp8" ]]; then + echo "For quant format $QFORMAT, please refer to the TensorRT-LLM documentation for deployment. Checkpoint saved to $SAVE_PATH." 
+ exit 0 +fi -VISUAL_ARGS="" -VISION_ENCODER_DIR=${ENGINE_DIR}/vision -VISUAL_MODEL_TYPE=$MODEL_TYPE -case "${MODEL_TYPE}" in - "vila") - VISUAL_ARGS+=" --vila_path ${MODEL_PATH}/../VILA " - ;; - "phi") - VISUAL_MODEL_TYPE="phi-3-vision" - ;; - "qwen") - # Map generic type to TRT-LLM multimodal model type - VISUAL_MODEL_TYPE="qwen2_vl" - ;; -esac - - -VISUAL_MAX_BATCH_SIZE=$BUILD_MAX_BATCH_SIZE +if [[ "$QFORMAT" == *"nvfp4"* ]] || [[ "$KV_CACHE_QUANT" == *"nvfp4"* ]]; then + cuda_major=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader -i 0 | cut -d. -f1) -if [[ $TASKS =~ "build" ]] || [[ ! -d "$VISION_ENCODER_DIR" ]] || [[ ! $(ls -A $VISION_ENCODER_DIR) ]]; then - echo "Build visual engine" - python vlm_visual_engine.py \ - --model_path $MODEL_PATH \ - --model_type $VISUAL_MODEL_TYPE \ - --output_dir $VISION_ENCODER_DIR \ - --max_batch_size $VISUAL_MAX_BATCH_SIZE \ - $VISUAL_ARGS + if [ "$cuda_major" -lt 10 ]; then + echo "Please deploy the NVFP4 checkpoint on a Blackwell GPU. Checkpoint export_path: $SAVE_PATH" + exit 0 + fi fi -VLM_RUN_ARGS="" -case "${MODEL_TYPE}" in - "mllama") - VLM_RUN_ARGS+=" --image_path https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg --input_text \"<|image|><|begin_of_text|>If I had to write a haiku for this one\" --max_new_tokens 50 --batch_size 2 " - ;; -esac -echo "Run inference example" - -mpirun -n $GPUS --allow-run-as-root python vlm_run.py \ - --hf_model_dir $MODEL_PATH \ - --engine_dir $ENGINE_DIR \ - --kv_cache_free_gpu_memory_fraction $KV_CACHE_FREE_GPU_MEMORY_FRACTION \ - $VLM_RUN_ARGS +# Prepare datasets for TRT-LLM benchmark +if [ -z "$TRT_LLM_CODE_PATH" ]; then + TRT_LLM_CODE_PATH=/app/tensorrt_llm # default path for the TRT-LLM release docker image + echo "Setting default TRT_LLM_CODE_PATH to $TRT_LLM_CODE_PATH." +fi -if [[ $TASKS =~ "gqa" ]]; then - echo "Evaluating the TensorRT engine of the quantized model using GQA benchmark." - pushd ../vlm_eval/ - if [[ "$MODEL_PATH" =~ ^/ ]]; then - # If MODEL_PATH is absolute path - source gqa.sh --hf_model $MODEL_PATH --engine_dir $ENGINE_DIR --kv_cache_free_gpu_memory_fraction $KV_CACHE_FREE_GPU_MEMORY_FRACTION - else - # If MODEL_PATH is absolute path - script_parent_dir=$(dirname "$script_dir") - source gqa.sh --hf_model $script_parent_dir/$MODEL_PATH --engine_dir $ENGINE_DIR --kv_cache_free_gpu_memory_fraction $KV_CACHE_FREE_GPU_MEMORY_FRACTION - fi +QUICK_START_MULTIMODAL=$TRT_LLM_CODE_PATH/examples/llm-api/quickstart_multimodal.py - popd +if [ -f "$QUICK_START_MULTIMODAL" ]; then + python3 $QUICK_START_MULTIMODAL --model_dir $SAVE_PATH --modality image +else + echo "Warning: $QUICK_START_MULTIMODAL cannot be found. Please set TRT_LLM_CODE_PATH to the TRT-LLM code path or test the quantized checkpoint $SAVE_PATH with the TRT-LLM repo directly." fi popd diff --git a/examples/vlm_ptq/vlm_run.py b/examples/vlm_ptq/vlm_run.py deleted file mode 100644 index 7c84ae617..000000000 --- a/examples/vlm_ptq/vlm_run.py +++ /dev/null @@ -1,128 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import os - -import tensorrt_llm -import tensorrt_llm.profiler as profiler -from tensorrt_llm import logger -from tensorrt_llm.runtime import MultimodalModelRunner -from utils import add_common_args - - -def print_result(model, input_text, output_text, args): - logger.info("---------------------------------------------------------") - if model.model_type != "nougat": - logger.info(f"\n[Q] {input_text}") - for i in range(len(output_text)): - logger.info(f"\n[A]: {output_text[i]}") - - if args.num_beams == 1: - output_ids = model.tokenizer(output_text[0][0], add_special_tokens=False)["input_ids"] - logger.info(f"Generated {len(output_ids)} tokens") - - if args.check_accuracy and model.model_type != "nougat": - if model.model_type == "vila": - for i in range(len(args.image_path.split(args.path_sep))): - if i % 2 == 0: - assert output_text[i][0].lower() == ( - "the image captures a bustling city intersection teeming with life. " - "from the perspective of a car's dashboard camera, we see" - ) - else: - assert output_text[i][0].lower() == ( - "the image captures the iconic merlion statue in singapore, " - "a renowned worldwide landmark. the merlion, a mythical" - ) - elif model.model_type == "llava": - for i in range(len(args.image_path.split(args.path_sep))): - assert output_text[i][0].lower() == "singapore" - elif model.model_type == "fuyu": - assert output_text[0][0].lower() == "4" - elif model.model_type == "pix2struct": - assert ( - "characteristic | cat food, day | cat food, wet | cat treats" - in output_text[0][0].lower() - ) - elif model.model_type in ["blip2", "neva", "phi-3-vision", "llava_next"]: - assert "singapore" in output_text[0][0].lower() - elif model.model_type == "video-neva": - assert "robot" in output_text[0][0].lower() - elif model.model_type == "kosmos-2": - assert "snowman" in output_text[0][0].lower() - elif model.model_type == "mllama": - if "If I had to write a haiku for this one" in input_text: - assert ( - "it would be:.\\nPeter Rabbit is a rabbit.\\nHe lives in a" in output_text[0][0] - or "Here is a haiku for the image:\n\n" in output_text[0][0] - ), ( - f"expected results: 'it would be:.\\nPeter Rabbit is a rabbit.\\nHe lives in a', \ - generated results: '{output_text[0][0]}'" - ) - elif "The key to life is" in input_text: - assert ( - "to find your passion and pursue it with all your heart." 
in output_text[0][0] - or "not to be found in the external world," in output_text[0][0] - ), ( - f"expected results: 'to find your passion and pursue it with all your heart.', \ - generated results: '{output_text[0][0]}'" - ) - elif model.model_type == "llava_onevision": - if args.video_path is None: - assert "singapore" in output_text[0][0].lower() - else: - assert ( - "the video is funny because the child's actions are" - in output_text[0][0].lower() - ) - elif model.model_type == "qwen2_vl": - assert "dog" in output_text[0][0].lower() - else: - assert output_text[0][0].lower() == "singapore" - - if args.run_profiling: - - def msec_per_batch(name): - return 1000 * profiler.elapsed_time_in_sec(name) / args.profiling_iterations - - logger.info("Latencies per batch (msec)") - logger.info("TRT vision encoder: {:.1f}".format(msec_per_batch("Vision"))) - logger.info("TRTLLM LLM generate: {:.1f}".format(msec_per_batch("LLM"))) - logger.info("Multimodal generate: {:.1f}".format(msec_per_batch("Generate"))) - - logger.info("---------------------------------------------------------") - - -if __name__ == "__main__": - os.environ["TOKENIZERS_PARALLELISM"] = "false" - parser = argparse.ArgumentParser() - parser = add_common_args(parser) - args = parser.parse_args() - logger.set_level(args.log_level) - - model = MultimodalModelRunner(args) - input_multimodal_data = model.load_test_data(args.image_path, args.video_path) - - num_iters = args.profiling_iterations if args.run_profiling else 1 - - for _ in range(num_iters): - input_text, output_text = model.run( - args.input_text, input_multimodal_data, None, args.max_new_tokens - ) - - runtime_rank = tensorrt_llm.mpi_rank() - if runtime_rank == 0: - print_result(model, input_text, output_text, args) diff --git a/examples/vlm_ptq/vlm_visual_engine.py b/examples/vlm_ptq/vlm_visual_engine.py deleted file mode 100644 index c21c7b532..000000000 --- a/examples/vlm_ptq/vlm_visual_engine.py +++ /dev/null @@ -1,26 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import argparse - -from tensorrt_llm.tools.multimodal_builder import MultimodalEngineBuilder, add_multimodal_arguments - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser = add_multimodal_arguments(parser) - args = parser.parse_args() - - builder = MultimodalEngineBuilder(args) - builder.build() diff --git a/modelopt/deploy/llm/generate.py b/modelopt/deploy/llm/generate.py index d6f71bdaa..39f627985 100644 --- a/modelopt/deploy/llm/generate.py +++ b/modelopt/deploy/llm/generate.py @@ -21,17 +21,17 @@ from pathlib import Path from typing import Any +import tensorrt_llm import torch +from packaging.version import parse as parse_version from tensorrt_llm import SamplingParams -from tensorrt_llm.bindings.executor import DecodingConfig try: from tensorrt_llm.llmapi import CudaGraphConfig from tensorrt_llm.llmapi import KvCacheConfig as TRT_KvCacheConfig - from tensorrt_llm.llmapi.llm import _TorchLLM, _TrtLLM - from tensorrt_llm.llmapi.tokenizer import TokenizerBase, TransformersTokenizer + from tensorrt_llm.llmapi.llm import LLM as TRTLLM except ImportError: - print("Please upgrade tensorrt-llm to 1.0.0rc or later") + print("Please upgrade tensorrt-llm to 1.1.0rc2 or later") raise @@ -50,53 +50,57 @@ def _sanitize_temperature_and_top_p(temperature, top_p): return kwargs -class LLM: +class LLM(TRTLLM): """A wrapper over the ``tensorrt_llm.llmapi.llm.LLM`` for LLM profiling and validation.""" - def _build_trt_llm_from_config( - self, config, engine_dir, tokenizer, kv_cache_config, medusa_choices, max_batch_size + def __init__( + self, + checkpoint_dir: str | Path, + tokenizer: "str | Path | None" = None, + kv_cache_config: dict[str, int | float] = {}, + medusa_choices: Any = None, + tp: int = 0, + trust_remote_code: bool = False, + max_batch_size: int = 0, ): - build_config = config["build_config"] - world_size = config.get("pretrained_config", {}).get("mapping", {}).get("world_size", 1) - max_batch_size = max(max_batch_size, build_config["max_batch_size"]) - max_tokens_kv_cache = build_config["max_seq_len"] * max_batch_size - - trt_kv_cache_config = TRT_KvCacheConfig(enable_block_reuse=False) - - # If not specified, free_gpu_memory_fraction is set to the default TRT LLM value 0.9 - trt_kv_cache_config.free_gpu_memory_fraction = kv_cache_config.get( - "free_gpu_memory_fraction", 0.9 - ) + """Initializes the LLM runner class. - # If not specified, max_tokens is set to the max value calculated above. - trt_kv_cache_config.max_tokens = kv_cache_config.get("max_tokens", max_tokens_kv_cache) + Args: + checkpoint_dir: the directory path of the model checkpoint. + tokenizer: the tokenizer. For example, a tokenizer from the Huggingface model. + kv_cache_config: the kv cache config as a dict. Please refer to + https://nvidia.github.io/TensorRT-LLM/performance/performance-tuning-guide/ + medusa_choices: The medusa choices for the decoding config. + tp: the tensor parallel size (for the torch backend). If 0, it will be set to the number of GPUs. + trust_remote_code: whether to trust the remote code (for the torch backend). + max_batch_size: Max batch size for the LLM backend. If 0, it is not specified. + """ + with open(Path(checkpoint_dir) / "config.json") as config_file: + config = json.load(config_file) - kwargs = {} - if medusa_choices is not None: - decoding_config = DecodingConfig() - decoding_config.medusa_choices = medusa_choices - kwargs["decoding_config"] = decoding_config - assert world_size == 1, "decoding_config does not support multi TP in HLAPI." 
- - if tokenizer is None: - # Assume the tokenizer is stored in the engine_dir if not specified. - tokenizer = engine_dir - - # CustomSentencePieceTokenizer will not be recognized by llmapi, wrapping it around TransformersTokenizer - if type(tokenizer).__name__ in ["CustomSentencePieceTokenizer"]: - tokenizer = TransformersTokenizer(tokenizer) - - self.llm = _TrtLLM( - backend=None, - model=engine_dir, - tokenizer=tokenizer, - kv_cache_config=trt_kv_cache_config, - **kwargs, - ) + assert medusa_choices is None, "medusa_choices is not supported with the torch llmapi" + + def _find_max_position_embeddings(cfg: dict) -> int | None: + if "max_position_embeddings" in cfg: + return cfg["max_position_embeddings"] + for v in cfg.values(): + if isinstance(v, dict): + res = _find_max_position_embeddings(v) + if res is not None: + return res + return None + + # Some VLMs may have a sub-config for max_position_embeddings, so we need to find it. + self._max_seq_len = _find_max_position_embeddings(config) + if self._max_seq_len is None: + warnings.warn( + "max_position_embeddings not found in config.json, using default value 8192" + ) + self._max_seq_len = 8192 + else: + print(f"max_position_embeddings: {self._max_seq_len}") + self._max_beam_width = 1 - def _build_torch_llm_from_config( - self, checkpoint_dir, tokenizer, tp, trust_remote_code, max_batch_size - ): kwargs = {} if tokenizer is not None: kwargs["tokenizer"] = tokenizer @@ -104,9 +108,19 @@ def _build_torch_llm_from_config( if tp < 1: tp = torch.cuda.device_count() + # Check if any key in config contains both "num" and "experts" + ep = 1 + enable_attention_dp = False + for k in config: + if "num" in k and "experts" in k: + ep = torch.cuda.device_count() + enable_attention_dp = True + break + # Sometimes 90% of the GPU memory is not enough for the TRT LLM torch engine. - trt_kv_cache_config = TRT_KvCacheConfig( - enable_block_reuse=False, free_gpu_memory_fraction=0.85 + trt_kv_cache_config = TRT_KvCacheConfig(free_gpu_memory_fraction=0.7) + trt_kv_cache_config.max_tokens = self._max_seq_len * ( + max_batch_size if max_batch_size > 0 else 8 ) cuda_graph_config = None @@ -118,90 +132,24 @@ def _build_torch_llm_from_config( enable_padding=True, ) - self.llm = _TorchLLM( + self._support_context_logits_and_stop_words = parse_version( + tensorrt_llm.__version__ + ) >= parse_version("1.1.0rc2") + + super().__init__( backend="pytorch", model=checkpoint_dir, tensor_parallel_size=tp, + moe_expert_parallel_size=ep, trust_remote_code=trust_remote_code, enable_chunked_prefill=True, kv_cache_config=trt_kv_cache_config, # pytorch backend configs cuda_graph_config=cuda_graph_config, + enable_attention_dp=enable_attention_dp, **kwargs, ) - def __init__( - self, - checkpoint_dir: str | Path, - tokenizer: "str | Path | TokenizerBase | None" = None, - kv_cache_config: dict[str, int | float] = {}, - medusa_choices: Any = None, - tp: int = 0, - trust_remote_code: bool = False, - max_batch_size: int = 0, - ): - """Initializes the LLM runner class. - - Args: - engine_dir: the directory path of the TensorRT-LLM engine. - tokenizer: the tokenizer. For example, a tokenizer from the Huggingface model. - kv_cache_config: the kv cache config as a dict. Please refer to - https://nvidia.github.io/TensorRT-LLM/performance/performance-tuning-guide/ - medusa_choices: The medusa choices for the decoding config. - tp: the tensor parallel size (for the torch backend). If 0, it will be set to the number of GPUs. 
- trust_remote_code: whether to trust the remote code (for the torch backend). - max_batch_size: Max batch size for the LLM backend. If 0, it will be set to the max batch size - in the engine config. - """ - with open(Path(checkpoint_dir) / "config.json") as config_file: - config = json.load(config_file) - - if "build_config" in config: - self._is_torch = False - self._build_trt_llm_from_config( - config, - checkpoint_dir, - tokenizer, - kv_cache_config, - medusa_choices, - max_batch_size, - ) - - self._max_seq_len = self.llm.args.build_config.max_seq_len - self._max_beam_width = self.llm.args.build_config.max_beam_width - self._gather_context_logits = self.llm.args.build_config.gather_context_logits - else: - self._is_torch = True - assert medusa_choices is None, ( - "medusa_choices is not supported with the torch llmapi" - ) - - self._build_torch_llm_from_config( - checkpoint_dir, tokenizer, tp, trust_remote_code, max_batch_size - ) - - def _find_max_position_embeddings(cfg: dict) -> int | None: - if "max_position_embeddings" in cfg: - return cfg["max_position_embeddings"] - for v in cfg.values(): - if isinstance(v, dict): - res = _find_max_position_embeddings(v) - if res is not None: - return res - return None - - # Some VLMs may have a sub-config for max_position_embeddings, so we need to find it. - self._max_seq_len = _find_max_position_embeddings(config) - if self._max_seq_len is None: - warnings.warn( - "max_position_embeddings not found in config.json, using default value 8192" - ) - self._max_seq_len = 8192 - else: - print(f"max_position_embeddings: {self._max_seq_len}") - self._max_beam_width = 1 - self._gather_context_logits = False - @property def max_seq_len(self): """Get the max sequence length from the LLM instance.""" @@ -215,7 +163,7 @@ def max_beam_width(self): @property def gather_context_logits(self): """Returns whether the context_logits can be returned from the LLM instance.""" - return self._gather_context_logits + return self._support_context_logits_and_stop_words def _generate( self, @@ -227,10 +175,8 @@ def _generate( ): assert temperature >= 0.0, "Temperature must be greater than 0.0." - # TODO: Remove this once torch backend supports stop words - if self._is_torch: + if not self._support_context_logits_and_stop_words: stop_words = None - beam_width = self.max_beam_width kwargs = _sanitize_temperature_and_top_p(temperature, top_p) sampling_config = SamplingParams( @@ -241,7 +187,7 @@ def _generate( **kwargs, ) - return self.llm.generate(prompts, sampling_params=sampling_config, use_tqdm=False) + return self.generate(prompts, sampling_params=sampling_config, use_tqdm=False) def generate_tokens( self, @@ -330,8 +276,8 @@ def generate_context_logits( Returns: a tensor list of the context_logits. """ - assert self.gather_context_logits, ( - "Please enable gather_context_logits flag when building the engine." + assert self._support_context_logits_and_stop_words, ( + "Context logits are not supported with the current tensorrt_llm version." ) assert temperature >= 0.0, "Temperature must be greater than 0.0." 
@@ -340,6 +286,6 @@ def generate_context_logits( sampling_config = SamplingParams(max_tokens=1, use_beam_search=True, best_of=1, **kwargs) - outputs = self.llm.generate(prompts, sampling_params=sampling_config, use_tqdm=False) + outputs = self.generate(prompts, sampling_params=sampling_config, use_tqdm=False) return [output.context_logits for output in outputs] diff --git a/tests/_test_utils/examples/run_command.py b/tests/_test_utils/examples/run_command.py index cf31ce38c..ebae8c80a 100644 --- a/tests/_test_utils/examples/run_command.py +++ b/tests/_test_utils/examples/run_command.py @@ -123,16 +123,16 @@ def run_llm_export_command( def run_llm_ptq_command(*, model: str, quant: str, **kwargs): kwargs.update({"model": model, "quant": quant}) - kwargs.setdefault("tasks", "build") + kwargs.setdefault("tasks", "quant") kwargs.setdefault("calib", 16) cmd_parts = _extend_cmd_parts(["scripts/huggingface_example.sh", "--no-verbose"], **kwargs) run_example_command(cmd_parts, "llm_ptq") -def run_vlm_ptq_command(*, model: str, type: str, quant: str, **kwargs): - kwargs.update({"model": model, "type": type, "quant": quant}) - kwargs.setdefault("tasks", "build") +def run_vlm_ptq_command(*, model: str, quant: str, **kwargs): + kwargs.update({"model": model, "quant": quant}) + kwargs.setdefault("tasks", "quant") kwargs.setdefault("calib", 16) cmd_parts = _extend_cmd_parts(["scripts/huggingface_example.sh"], **kwargs) diff --git a/tests/_test_utils/model.py b/tests/_test_utils/model.py index 6e2fe17f7..abedd7b2a 100644 --- a/tests/_test_utils/model.py +++ b/tests/_test_utils/model.py @@ -63,6 +63,11 @@ def _select_path(remote_id: str, local_id: str) -> str: local_id="llava-1.5-7b-hf", ) +QWEN_VL_PATH = _select_path( + remote_id="Qwen/Qwen2-VL-2B-Instruct", + local_id="Qwen2-VL-2B-Instruct", +) + # Diffusers FLUX_SCHNELL_PATH = _select_path( remote_id="hf-internal-testing/tiny-flux-pipe", diff --git a/tests/_test_utils/ptq_utils.py b/tests/_test_utils/ptq_utils.py index f943faadb..892540707 100644 --- a/tests/_test_utils/ptq_utils.py +++ b/tests/_test_utils/ptq_utils.py @@ -27,8 +27,7 @@ @dataclass class PTQCommand: quant: str - export_fmt: str = "tensorrt_llm" - tasks: str = "build" + tasks: str = "quant" calib: int = 16 sparsity: str | None = None kv_cache_quant: str | None = None @@ -38,7 +37,9 @@ class PTQCommand: tp: int | None = None pp: int | None = None min_sm: int | None = None + max_sm: int | None = None min_gpu: int | None = None + batch: int | None = None def run(self, model_path: str): if self.min_sm and torch.cuda.get_device_capability() < ( @@ -48,6 +49,13 @@ def run(self, model_path: str): pytest.skip(reason=f"Requires sm{self.min_sm} or higher") return + if self.max_sm and torch.cuda.get_device_capability() > ( + self.max_sm // 10, + self.max_sm % 10, + ): + pytest.skip(reason=f"Requires sm{self.max_sm} or lower") + return + if self.min_gpu and torch.cuda.device_count() < self.min_gpu: pytest.skip(reason=f"Requires at least {self.min_gpu} GPUs") return diff --git a/tests/examples/llm_eval/test_llm_eval.py b/tests/examples/llm_eval/test_llm_eval.py index 5da65cae0..d745df85d 100644 --- a/tests/examples/llm_eval/test_llm_eval.py +++ b/tests/examples/llm_eval/test_llm_eval.py @@ -16,20 +16,22 @@ import subprocess from _test_utils.examples.run_command import run_llm_ptq_command +from _test_utils.model import TINY_LLAMA_PATH from _test_utils.torch_misc import minimum_sm @minimum_sm(89) -def test_llama_eval_fp8(tiny_llama_path): +def test_llama_eval_fp8(): try: run_llm_ptq_command( - 
model=tiny_llama_path, + model=TINY_LLAMA_PATH, quant="fp8", - tasks="mmlu,lm_eval,simple_eval,benchmark", + tasks="mmlu,lm_eval,simple_eval", calib=64, lm_eval_tasks="hellaswag,gsm8k", simple_eval_tasks="humaneval", lm_eval_limit=0.1, + batch=8, ) finally: # Force kill llm-serve if it's still running diff --git a/tests/examples/llm_ptq/test_llm_ptq.py b/tests/examples/llm_ptq/test_llm_ptq.py index b3eccf2b6..1da11a8c7 100644 --- a/tests/examples/llm_ptq/test_llm_ptq.py +++ b/tests/examples/llm_ptq/test_llm_ptq.py @@ -14,8 +14,6 @@ # limitations under the License. -import os - import pytest from _test_utils.model import BART_PATH, MIXTRAL_PATH, T5_PATH, TINY_LLAMA_PATH, WHISPER_PATH from _test_utils.ptq_utils import PTQCommand, WithRequirements @@ -24,7 +22,6 @@ @pytest.mark.parametrize( "command", [ - PTQCommand(quant="fp16"), PTQCommand(quant="fp8", min_sm=89), ], ids=PTQCommand.param_str, @@ -39,7 +36,6 @@ class TestT5(WithRequirements): @pytest.mark.parametrize( "command", [ - PTQCommand(quant="fp16"), PTQCommand(quant="fp8", min_sm=89), ], ids=PTQCommand.param_str, @@ -51,9 +47,7 @@ def test_ptq_t5(self, command): @pytest.mark.parametrize( "command", [ - PTQCommand(quant="fp16"), - PTQCommand(quant="fp8", min_sm=89), - PTQCommand(quant="fp8", export_fmt="hf", min_sm=89), + PTQCommand(quant="fp8", min_sm=90), ], ids=PTQCommand.param_str, ) @@ -71,7 +65,6 @@ class TestWhisper(WithRequirements): "command", [ # Auto-batch-size computation seems to take >10mins for Whisper hence using a fixed batch size - PTQCommand(quant="fp16", calib_batch_size=16), PTQCommand(quant="fp8", calib_batch_size=16, min_sm=89), ], ids=PTQCommand.param_str, @@ -80,84 +73,47 @@ def test_ptq_whisper(self, command): command.run(WHISPER_PATH) -@pytest.fixture(scope="module") -def llama_path(tiny_llama_path): - fast_tests = os.getenv("MODELOPT_FAST_TESTS", "true").lower() == "true" - if fast_tests: - return tiny_llama_path - return TINY_LLAMA_PATH - - @pytest.mark.parametrize( "command", [ - PTQCommand(quant="fp16"), - PTQCommand(quant="bf16"), - PTQCommand(quant="int8_sq"), - # ("int8_sq", "tensorrt_llm", "sparsegpt"), - PTQCommand(quant="int4_awq"), - PTQCommand(quant="int4_awq", export_fmt="hf"), + PTQCommand(quant="int8_sq", kv_cache_quant="none"), + PTQCommand(quant="int8_sq", kv_cache_quant="none", tp=2, pp=2), + PTQCommand(quant="int4_awq", kv_cache_quant="none"), + PTQCommand(quant="w4a8_awq", kv_cache_quant="none"), PTQCommand(quant="nvfp4"), - PTQCommand(quant="nvfp4", export_fmt="hf"), PTQCommand(quant="nvfp4_awq"), - PTQCommand(quant="nvfp4_awq", export_fmt="hf"), - # # autoquant PTQCommand( quant="int4_awq,nvfp4,fp8,w4a8_awq", calib_batch_size=4, auto_quantize_bits=6.4, + kv_cache_quant="none", ), - PTQCommand( - quant="int4_awq,nvfp4,fp8", - export_fmt="hf", - calib_batch_size=4, - auto_quantize_bits=6.4, - ), - # # kv_cache PTQCommand(quant="nvfp4_awq", kv_cache_quant="nvfp4"), - PTQCommand(quant="nvfp4_awq", export_fmt="hf", kv_cache_quant="nvfp4"), - # ("nvfp4_awq", "tensorrt_llm", "nvfp4_affine"), - # ("nvfp4_awq", "hf", "nvfp4_affine"), - # # autoquant_kv_cache PTQCommand( - quant="int4_awq,nvfp4,fp8,w4a8_awq", - kv_cache_quant="nvfp4", + quant="nvfp4,fp8", + kv_cache_quant="fp8", calib_batch_size=4, auto_quantize_bits=6.4, ), PTQCommand( - quant="int4_awq,nvfp4,fp8,w4a8_awq", - export_fmt="hf", + quant="nvfp4,fp8", kv_cache_quant="nvfp4", calib_batch_size=4, auto_quantize_bits=6.4, ), - # ("int4_awq,nvfp4,fp8,w4a8_awq", "tensorrt_llm", "nvfp4_affine"), - # ("int4_awq,nvfp4,fp8,w4a8_awq", "hf", 
"nvfp4_affine"), - # # sm89 PTQCommand(quant="fp8", min_sm=89), - PTQCommand(quant="fp8", kv_cache_quant="none", min_sm=89), - # ("fp8", "tensorrt_llm", "sparsegpt", None), - PTQCommand(quant="fp8", export_fmt="hf", min_sm=89), - PTQCommand(quant="w4a8_awq", min_sm=89), + PTQCommand(quant="fp8", kv_cache_quant="none", min_sm=89), # sm100 + PTQCommand(quant="nvfp4", min_sm=100), # # multi_gpu - # TP - PTQCommand(quant="fp16", tp=2, pp=1, min_gpu=2), - # ("fp16", "build", "sparsegpt", 1), - PTQCommand(quant="nvfp4", tp=2, pp=1, min_gpu=2), - PTQCommand(quant="fp16", tasks="benchmark", tp=2, pp=1, min_gpu=2), - # ("fp16", "benchmark", "sparsegpt", 2, 1), - # PP - # ("nvfp4", "build", None, 1, 2), - # ("fp16", "build", None, 1, 2), - # ("fp16", "build", "sparsegpt", 1, 2), + PTQCommand(quant="fp8", min_gpu=2, min_sm=89), + PTQCommand(quant="nvfp4", min_gpu=2, min_sm=100), ], ids=PTQCommand.param_str, ) -def test_ptq_llama(command, llama_path): - command.run(llama_path) +def test_ptq_llama(command): + command.run(TINY_LLAMA_PATH) diff --git a/tests/examples/speculative_decoding/test_medusa.py b/tests/examples/speculative_decoding/test_medusa.py index 58395c14d..27f74edaf 100644 --- a/tests/examples/speculative_decoding/test_medusa.py +++ b/tests/examples/speculative_decoding/test_medusa.py @@ -29,7 +29,7 @@ def install_transformers_lt_4_50(): # fmt: off -def _run_hf_ptq(model_path, output_dir, qformat, export_fmt): +def _run_hf_ptq(model_path, output_dir, qformat): run_example_command( [ "python", "hf_ptq.py", @@ -38,7 +38,6 @@ def _run_hf_ptq(model_path, output_dir, qformat, export_fmt): "--calib_size", "64", "--export_path", output_dir, "--qformat", qformat, - "--export_fmt", export_fmt, ], "llm_ptq", ) @@ -66,8 +65,7 @@ def test_llama_medusa_fp8_qat(tiny_llama_path, num_gpus, tiny_daring_anteater_pa ) # Test PTQ on Medusa - _run_hf_ptq(medusa_path, tmp_path / "medusa-tinyllama-trtllm", "fp8", "tensorrt_llm") - _run_hf_ptq(medusa_path, tmp_path / "medusa-tinyllama-hf", "fp8", "hf") + _run_hf_ptq(medusa_path, tmp_path / "medusa-tinyllama-hf", "fp8") # Test QAT on Medusa run_example_command( diff --git a/tests/examples/vlm_ptq/test_llava.py b/tests/examples/vlm_ptq/test_qwen_vl.py similarity index 80% rename from tests/examples/vlm_ptq/test_llava.py rename to tests/examples/vlm_ptq/test_qwen_vl.py index c811bd086..1f06f3a5f 100644 --- a/tests/examples/vlm_ptq/test_llava.py +++ b/tests/examples/vlm_ptq/test_qwen_vl.py @@ -16,11 +16,11 @@ import pytest from _test_utils.examples.run_command import run_vlm_ptq_command -from _test_utils.model import LLAVA_PATH +from _test_utils.model import QWEN_VL_PATH from _test_utils.torch_misc import minimum_gpu -@pytest.mark.parametrize("quant", ["fp16"]) +@pytest.mark.parametrize("quant", ["fp8", "int8_sq", "nvfp4"]) @minimum_gpu(2) -def test_llava_multi_gpu(quant): - run_vlm_ptq_command(model=LLAVA_PATH, type="llava", quant=quant, tp=2) +def test_qwen_vl_multi_gpu(quant): + run_vlm_ptq_command(model=QWEN_VL_PATH, quant=quant) diff --git a/tests/gpu/torch/export/test_unified_hf_export_and_check_safetensors.py b/tests/gpu/torch/export/test_unified_hf_export_and_check_safetensors.py index 86c9b0e4b..c3d5653c5 100644 --- a/tests/gpu/torch/export/test_unified_hf_export_and_check_safetensors.py +++ b/tests/gpu/torch/export/test_unified_hf_export_and_check_safetensors.py @@ -38,7 +38,6 @@ ("nvfp4_awq", "tiny_llama-nvfp4-awq", True, False, True, True), ("int4_awq", "tiny_llama-int4-awq", True, False, True, True), ("w4a8_awq", "tiny_llama-w4a8-awq", True, False, True, 
True), - ("fp8", "t5_tiny-fp8", True, False, True, True), ], ) def test_unified_hf_export_and_check_safetensors( @@ -83,8 +82,6 @@ def test_unified_hf_export_and_check_safetensors( str(tiny_model_dir), "--qformat", qformat, - "--export_fmt", - "hf", "--export_path", str(output_dir), ] From 3303f351a92f2176aa75b44fbeb913123d59d638 Mon Sep 17 00:00:00 2001 From: yueshen2016 <39203804+yueshen2016@users.noreply.github.com> Date: Wed, 17 Sep 2025 18:19:51 -0700 Subject: [PATCH 23/27] Fix issue of attention.core_attention is None (#334) Signed-off-by: Yue Signed-off-by: Ye Yu --- modelopt/torch/export/unified_export_megatron.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/modelopt/torch/export/unified_export_megatron.py b/modelopt/torch/export/unified_export_megatron.py index e28a165fa..586745a1b 100644 --- a/modelopt/torch/export/unified_export_megatron.py +++ b/modelopt/torch/export/unified_export_megatron.py @@ -1085,7 +1085,10 @@ def _get_state_dict(self): self.rules["k_layernorm"](layer.self_attention.k_layernorm, layer_id) self.rules["linear_qkv"](layer.self_attention.linear_qkv, layer_id) self.rules["linear_proj"](layer.self_attention.linear_proj, layer_id) - if hasattr(layer.self_attention.core_attention, "softmax_offset"): + if ( + getattr(layer.self_attention.core_attention, "softmax_offset", None) + is not None + ): self.rules["softmax_offset"]( layer.self_attention.core_attention.softmax_offset, layer_id ) From 0f7b7eafc9520b34a18ff4222c6cf0e4721c2ab6 Mon Sep 17 00:00:00 2001 From: Chenjie Luo <108829653+cjluo-nv@users.noreply.github.com> Date: Wed, 17 Sep 2025 22:13:01 -0700 Subject: [PATCH 24/27] Reinstate int8_sq support for vlm_example. (#333) Signed-off-by: Chenjie Luo Signed-off-by: Ye Yu --- CHANGELOG.rst | 1 - examples/vlm_ptq/scripts/huggingface_example.sh | 6 ++++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 6fde2bcb9..be9d5e7ea 100755 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -9,7 +9,6 @@ Model Optimizer Changelog (Linux) - Deprecated ``quantize_mode`` argument in ``examples/onnx_ptq/evaluate.py`` to support strongly typing. Use ``engine_precision`` instead. - Deprecated TRT-LLM's TRT backend in ``examples/llm_ptq`` and ``examples/vlm_ptq``. Tasks ``build`` and ``benchmark`` support are removed and replaced with ``quant``. For performance evaluation, please use ``trtllm-bench`` directly. - ``--export_fmt`` flag in ``examples/llm_ptq`` is removed. By default we export to the unified Hugging Face checkpoint format. -- ``int8_sq`` quantization format is deprecated from the ``examples/vlm_ptq`` with respect to the TensorRT-LLM's torch backend switch. Please refer to the previous releases if this quantization format is needed. - Deprecated ``examples/vlm_eval`` as it depends on the deprecated TRT-LLM's TRT backend. **New Features** diff --git a/examples/vlm_ptq/scripts/huggingface_example.sh b/examples/vlm_ptq/scripts/huggingface_example.sh index 69e2dce9e..ea2733067 100755 --- a/examples/vlm_ptq/scripts/huggingface_example.sh +++ b/examples/vlm_ptq/scripts/huggingface_example.sh @@ -35,10 +35,10 @@ if [ -z "$MODEL_PATH" ]; then fi case $QFORMAT in - fp8|int4_awq|w4a8_awq|nvfp4) + fp8|int8_sq|int4_awq|w4a8_awq|nvfp4) ;; *) - echo "Unknown quant argument: Expected one of: [fp8, int4_awq, w4a8_awq, nvfp4]" >&2 + echo "Unknown quant argument: Expected one of: [fp8, int8_sq, int4_awq, w4a8_awq, nvfp4]" >&2 exit 1 esac @@ -95,6 +95,8 @@ if [[ $TASKS =~ "quant" ]] || [[ ! -d "$SAVE_PATH" ]] || [[ ! 
$(ls -A $SAVE_PATH --qformat=$QFORMAT \ --calib_size=$CALIB_SIZE \ --batch_size=$CALIB_BATCH_SIZE \ + --inference_tensor_parallel=$TP \ + --inference_pipeline_parallel=$PP \ $PTQ_ARGS else echo "Quantized model config $MODEL_CONFIG exists, skipping the quantization stage" From 89c65b9a7cc4e466a863c640caca49490797c37d Mon Sep 17 00:00:00 2001 From: omrialmog Date: Thu, 18 Sep 2025 07:16:43 -0700 Subject: [PATCH 25/27] Update News README.md (#336) Signed-off-by: omrialmog Signed-off-by: Ye Yu --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 72c954017..b16196b03 100644 --- a/README.md +++ b/README.md @@ -26,6 +26,8 @@ Model Optimizer is also integrated with [NVIDIA NeMo](https://github.com/NVIDIA- ## Latest News +- [2025/09/17] [An Introduction to Speculative Decoding for Reducing Latency in AI Inference](https://developer.nvidia.com/blog/an-introduction-to-speculative-decoding-for-reducing-latency-in-ai-inference/) +- [2025/09/11] [How Quantization Aware Training Enables Low-Precision Accuracy Recovery](https://developer.nvidia.com/blog/how-quantization-aware-training-enables-low-precision-accuracy-recovery/) - [2025/08/29] [Fine-Tuning gpt-oss for Accuracy and Performance with Quantization Aware Training](https://developer.nvidia.com/blog/fine-tuning-gpt-oss-for-accuracy-and-performance-with-quantization-aware-training/) - [2025/08/01] [Optimizing LLMs for Performance and Accuracy with Post-Training Quantization](https://developer.nvidia.com/blog/optimizing-llms-for-performance-and-accuracy-with-post-training-quantization/) - [2025/06/24] [Introducing NVFP4 for Efficient and Accurate Low-Precision Inference](https://developer.nvidia.com/blog/introducing-nvfp4-for-efficient-and-accurate-low-precision-inference/) From f7425fc8c74ed85cd0a989407e23a5e408632aee Mon Sep 17 00:00:00 2001 From: Chenjie Luo <108829653+cjluo-nv@users.noreply.github.com> Date: Thu, 18 Sep 2025 09:51:00 -0700 Subject: [PATCH 26/27] [NVBug: 5525758] Update VLM-PTQ readme (#339) Signed-off-by: Chenjie Luo Signed-off-by: Ye Yu --- CHANGELOG.rst | 1 + examples/vlm_ptq/README.md | 54 +++++++++----------------------------- 2 files changed, 14 insertions(+), 41 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index be9d5e7ea..8dc315c46 100755 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -15,6 +15,7 @@ Model Optimizer Changelog (Linux) - ``high_precision_dtype`` default to fp16 in ONNX quantization, i.e. quantized output model weights are now FP16 by default. - Upgrade TensorRT-LLM dependency to 1.1.0rc2. +- Support Phi-4-multimodal and Qwen2.5-VL quantized HF checkpoint export in ``examples/vlm_ptq``. 
0.35 (2025-09-04)
^^^^^^^^^^^^^^^^^

diff --git a/examples/vlm_ptq/README.md b/examples/vlm_ptq/README.md
index e2d79cec5..cdadb3374 100644
--- a/examples/vlm_ptq/README.md
+++ b/examples/vlm_ptq/README.md
@@ -36,15 +36,19 @@ Please refer to the [llm_ptq/README.md](../llm_ptq/README.md#current-out-of-the-
### Supported Models

-| Model | type | fp8 | int8_sq | int4_awq | w4a8_awq<sup>1</sup> | nvfp4<sup>2</sup> |
-| :---: | :---: | :---: | :---: | :---: | :---: | :---: |
-| Llava | llava | ✅ | ✅ | ✅ | ✅ | ❌ |
-| VILA | vila | ✅ | ✅ | ✅ | ✅ | ❌ |
-| Phi-3-vision | phi | ✅ | ✅ | ✅ | ✅ | ❌ |
-| Qwen2.5-VL | qwen | ✅ | ✅ | ✅ | ✅ | ❌ |
+| Model | fp8 | int8_sq<sup>1</sup> | int4_awq | w4a8_awq<sup>2</sup> | nvfp4<sup>3</sup> |
+| :---: | :---: | :---: | :---: | :---: | :---: |
+| Llava | ✅ | ✅ | ✅ | ✅ | - |
+| VILA | ✅ | ✅ | ✅ | ✅ | - |
+| Phi-3-vision, Phi-4-multimodal | ✅ | ✅ | ✅ | ✅ | ✅ |
+| Qwen2, 2.5-VL | ✅ | ✅ | ✅ | ✅ | ✅ |
+| Gemma3 | ✅ | - | - | - | - |

-> *1.The w4a8_awq is an experimental quantization scheme that may result in a higher accuracy penalty.* \
-> *2.A selective set of the popular models are internally tested. The actual model support list may be longer. NVFP4 inference requires Blackwell GPUs and TensorRT-LLM v0.17 or later.*
+> *1.Only TensorRT-LLM checkpoint export is supported. Not compatible with the TensorRT-LLM torch backend* \
+> *2.The w4a8_awq is an experimental quantization scheme that may result in a higher accuracy penalty.* \
+> *3.A selective set of the popular models are internally tested. The actual model support list may be longer. NVFP4 inference requires Blackwell GPUs and TensorRT-LLM v0.17 or later.*
+
+> *For detailed TensorRT-LLM torch backend multimodal support, please refer to [this doc](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/models/supported-models.md#multimodal-feature-support-matrix-pytorch-backend)*

> *The accuracy loss after PTQ may vary depending on the actual model and the quantization method. Different models may have different accuracy loss and usually the accuracy loss is more significant when the base model is small.
If the accuracy after PTQ is not meeting the requirement, please try either modifying [hf_ptq.py](../llm_ptq/hf_ptq.py) and disabling the KV cache quantization or using the [QAT](./../llm_qat/README.md) instead.* @@ -56,40 +60,8 @@ The following scripts provide an all-in-one and step-by-step model quantization ### Hugging Face Example [Script](./scripts/huggingface_example.sh) -For [Llava](https://huggingface.co/llava-hf/llava-1.5-7b-hf): - -```bash -git clone https://huggingface.co/llava-hf/llava-1.5-7b-hf -scripts/huggingface_example.sh --type llava --model llava-1.5-7b-hf --quant [fp8|int8_sq|int4_awq|w4a8_awq] --tp [1|2|4|8] -``` - -For VILA models like [VILA1.5-3b](https://huggingface.co/Efficient-Large-Model/VILA1.5-3b): - -```bash -git clone https://huggingface.co/Efficient-Large-Model/VILA1.5-3b vila1.5-3b -scripts/huggingface_example.sh --type vila --model vila1.5-3b --quant [fp8|int8_sq|int4_awq|w4a8_awq] --tp [1|2|4|8] -``` - -For [Phi-3-vision](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct): - -```bash -git clone https://huggingface.co/microsoft/Phi-3-vision-128k-instruct -scripts/huggingface_example.sh --type phi --model Phi-3-vision-128k-instruct --quant [fp8|int8_sq|int4_awq|w4a8_awq] -``` - -For [Qwen2.5-VL](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct): - -```bash -git clone https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct -scripts/huggingface_example.sh --type qwen --model Qwen2.5-VL-7B-Instruct --quant [fp8|nvfp4|int8_sq|int4_awq|w4a8_awq] -``` - -The example scripts above also have an additional flag `--tasks gqa`, which will trigger evaluation of the built TensorRT engine using GQA benchmark. Details of the evaluation is explained in this [tutorial](../vlm_eval/README.md). - -If you encounter Out of Memory (OOM) issues during inference or evaluation, you can try lowering the `--kv_cache_free_gpu_memory_fraction` argument (default is 0.8) to reduce GPU memory usage for kv_cache: - ```bash -scripts/huggingface_example.sh --type phi --model Phi-3-vision-128k-instruct --quant fp8 --kv_cache_free_gpu_memory_fraction 0.5 +scripts/huggingface_example.sh --model --quant [fp8|nvfp4|int8_sq|int4_awq|w4a8_awq] ``` ## Pre-Quantized Checkpoints From 2ec5654aabe050fffc886d74e713a5d8bf68f938 Mon Sep 17 00:00:00 2001 From: Jenny Chen Date: Thu, 18 Sep 2025 14:24:39 -0400 Subject: [PATCH 27/27] Slurm support for QAT Simplified Flow + Qwen3-8B recipe (#285) Signed-off-by: Jennifer Chen Signed-off-by: Ye Yu --- examples/llm_qat/README.md | 1 + examples/nemo_run/common/in_memory_mmlu.py | 60 ++++ .../{ => common}/llama_chat_template.txt | 0 .../nemo_run/{ => common}/process_lima.py | 0 .../nemo_run/common/process_openscience.py | 61 ++++ examples/nemo_run/common/utils.py | 139 ++++++++ examples/nemo_run/qat/ADVANCED.md | 56 ++++ examples/nemo_run/qat/README.md | 110 +++++-- examples/nemo_run/qat/nemo_qat_flow.py | 303 ++++++++++++------ modelopt/torch/export/plugins/nemo_run.py | 71 ++++ 10 files changed, 674 insertions(+), 127 deletions(-) create mode 100644 examples/nemo_run/common/in_memory_mmlu.py rename examples/nemo_run/{ => common}/llama_chat_template.txt (100%) rename examples/nemo_run/{ => common}/process_lima.py (100%) create mode 100644 examples/nemo_run/common/process_openscience.py create mode 100644 examples/nemo_run/common/utils.py create mode 100644 examples/nemo_run/qat/ADVANCED.md create mode 100644 modelopt/torch/export/plugins/nemo_run.py diff --git a/examples/llm_qat/README.md b/examples/llm_qat/README.md index 801f7ee22..3d8956944 100644 --- 
a/examples/llm_qat/README.md +++ b/examples/llm_qat/README.md @@ -11,6 +11,7 @@ Quantization Aware Training (QAT) helps to improve the model accuracy beyond pos | Support Matrix | View the support matrix to see quantization compatibility and feature availability across different models | \[[Link](#support-matrix)\] | | | End to End QAT | Example scripts demonstrating quantization techniques for optimizing Hugging Face models | \[[Link](#end-to-end-qat-example)\] | \[[docs](https://nvidia.github.io/TensorRT-Model-Optimizer/guides/1_quantization.html)\] | | End to End QAD | Example scripts demonstrating quantization aware distillation techniques for optimizing Hugging Face models | \[[Link](#end-to-end-qad-example)\] | \[[docs](https://nvidia.github.io/TensorRT-Model-Optimizer/guides/1_quantization.html)\] | +| NeMo QAT/QAD Simplified Flow | Example script demonstrating end-to-end QAT/QAD in NeMo | \[[Link](../nemo_run/qat/README.md)\] | | | Evaluate Accuracy | Evaluating model accuracy after QAT/QAD (with fake quantization) | \[[Link](#testing-qat-model-with-llm-benchmarks-for-accuracy-evaluation)\] | | | Deployment | Deploying the model after QAT/QAD | \[[Link](#deployment)\] | | | QLoRA | Model training with reduced GPU memory | \[[Link](#end-to-end-qlora-with-real-quantization)\] | | diff --git a/examples/nemo_run/common/in_memory_mmlu.py b/examples/nemo_run/common/in_memory_mmlu.py new file mode 100644 index 000000000..c9ab11e3b --- /dev/null +++ b/examples/nemo_run/common/in_memory_mmlu.py @@ -0,0 +1,60 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse + +from nemo.collections.llm.modelopt import setup_trainer_and_restore_model_with_modelopt_spec + +from modelopt.torch.export.plugins.nemo_run import _get_most_recent_ckpt +from modelopt.torch.utils.plugins.megatron_mmlu import megatron_mmlu + + +def parse_args(): + parser = argparse.ArgumentParser( + description=( + "Run MMLU evaluation with ModelOpt Megatron model. Provide either --nemo_ckpt" + "or --finetuned_ckpt_dir" + ) + ) + group = parser.add_mutually_exclusive_group(required=True) + group.add_argument("--nemo_ckpt", type=str, required=False, help="Path to NeMo checkpoint.") + group.add_argument( + "--finetuned_ckpt_dir", + required=False, + type=str, + help="Checkpoint directory of 1 or more finetuned models", + ) + parser.add_argument( + "--tensor_parallelism", type=int, default=1, help="Tensor parallelism size." + ) + parser.add_argument( + "--pipeline_parallelism", type=int, default=1, help="Pipeline parallelism size." 
+ ) + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + ckpt_path = args.nemo_ckpt + if args.finetuned_ckpt_dir: + ckpt_path = _get_most_recent_ckpt(args.finetuned_ckpt_dir) + model, trainer = setup_trainer_and_restore_model_with_modelopt_spec( + ckpt_path, + tensor_model_parallel_size=args.tensor_parallelism, + pipeline_model_parallel_size=args.pipeline_parallelism, + devices=args.tensor_parallelism * args.pipeline_parallelism, + ) + tokenizer = model.tokenizer.tokenizer + megatron_mmlu(model.module, tokenizer) diff --git a/examples/nemo_run/llama_chat_template.txt b/examples/nemo_run/common/llama_chat_template.txt similarity index 100% rename from examples/nemo_run/llama_chat_template.txt rename to examples/nemo_run/common/llama_chat_template.txt diff --git a/examples/nemo_run/process_lima.py b/examples/nemo_run/common/process_lima.py similarity index 100% rename from examples/nemo_run/process_lima.py rename to examples/nemo_run/common/process_lima.py diff --git a/examples/nemo_run/common/process_openscience.py b/examples/nemo_run/common/process_openscience.py new file mode 100644 index 000000000..61172dc56 --- /dev/null +++ b/examples/nemo_run/common/process_openscience.py @@ -0,0 +1,61 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
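+
+# Example of a processed record (illustrative values only): convert_row_oai() below maps the
+# OpenScience "input"/"output" columns to an OpenAI-style chat sample such as
+#   {"messages": [{"role": "user", "content": "What is ...?"},
+#                 {"role": "assistant", "content": "..."}]}
+# process_subset() then writes a 90/10 train/validation split to training.jsonl and
+# validation.jsonl in the processed directory.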
+ +import argparse +import os +from pathlib import Path + +from datasets import load_dataset + + +def get_parser(): + parser = argparse.ArgumentParser(description="Process nvidia/OpenScience dataset") + parser.add_argument("--output-dir", type=str, default=".") + return parser + + +def convert_row_oai(row: dict): + return { + "messages": [ + {"role": "user", "content": row["input"]}, + {"role": "assistant", "content": row["output"]}, + ] + } + + +def process_subset(raw_dir, proc_dir): + ds = load_dataset(raw_dir) + ds = ds.map(convert_row_oai, remove_columns=["input", "output"]) + + split_ds = ds["train"].train_test_split(test_size=0.1) + split_ds["train"].to_json(os.path.join(proc_dir, "training.jsonl")) + split_ds["test"].to_json(os.path.join(proc_dir, "validation.jsonl")) + + +if __name__ == "__main__": + args = get_parser().parse_args() + raw_dir = f"{args.output_dir}/openscience_raw" + proc_dir = f"{args.output_dir}/openscience_proc" + + if not os.path.exists(raw_dir): + q235_subset = load_dataset("nvidia/OpenScience", data_files="OS-Q3-235B-4.jsonl") + q235_subset.save_to_disk(raw_dir) + + if not os.path.exists(proc_dir): + Path(proc_dir).mkdir(exist_ok=True) + print("Processing OpenScience dataset") + process_subset(raw_dir, proc_dir) + else: + print(f"Processed OpenScience dataset exists in: {proc_dir}, skipped processing") diff --git a/examples/nemo_run/common/utils.py b/examples/nemo_run/common/utils.py new file mode 100644 index 000000000..3f1bf8fc2 --- /dev/null +++ b/examples/nemo_run/common/utils.py @@ -0,0 +1,139 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
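+
+# Illustrative usage sketch (all values below are placeholders, not defaults shipped here):
+#   cfg = SlurmConfig(account="my_account", partition_cpu="cpu_short", partition_gpu="gpu",
+#                     time="02:00:00", container_image="nvcr.io/nvidia/nemo:25.07",
+#                     host="my-cluster", user="me", job_dir="/scratch/me/nemo_runs")
+#   gpu_executor = create_slurm_executor(cfg, nodes=1, ntasks_per_node=8, num_gpus=8)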
+ +import subprocess +from dataclasses import dataclass, field + +import nemo_run as run +from nemo.collections import llm + + +@dataclass +class SlurmConfig: + """Configuration for SlurmExecutor.""" + + account: str = "" # Your Slurm account + partition_cpu: str = "" # Slurm CPU partition to use + partition_gpu: str = "" # Slurm GPU partition to use + time: str = "" # Job time limit (HH:MM:SS) + container_image: str = "" # Container image for jobs + env_vars: dict[str, str] = field(default_factory=dict) # Environment variables to set + container_mounts: list[str] = field(default_factory=list) # Container mounts + use_local_tunnel: bool = False # Set to True if running from within the cluster + host: str = "" # Required for SSH tunnel: Slurm cluster hostname + user: str = "" # Required for SSH tunnel: Your username + job_dir: str = "" # Required for SSH tunnel: Directory to store runs on cluster + identity: str | None = None # Optional for SSH tunnel: Path to SSH key for authentication + + def __post_init__(self): + """Validate the configuration and raise descriptive errors.""" + if not self.account: + raise ValueError("SlurmConfig.account must be set to your actual Slurm account") + if not self.partition_cpu: + raise ValueError("SlurmConfig.partition_cpu must be set") + if not self.partition_gpu: + raise ValueError("SlurmConfig.partition_gpu must be set") + if not self.time: + raise ValueError("SlurmConfig.time must be set to job time limit (e.g., '02:00:00')") + if not self.container_image: + raise ValueError("SlurmConfig.container_image must be set to container image for jobs") + if not self.use_local_tunnel: + # Only validate SSH tunnel settings if not using local tunnel + if not self.host: + raise ValueError( + "SlurmConfig.host must be set to your actual cluster hostname when using SSH tunnel" + ) + if not self.user: + raise ValueError( + "SlurmConfig.user must be set to your actual username when using SSH tunnel" + ) + if not self.job_dir: + raise ValueError( + "SlurmConfig.job_dir must be set to directory for storing runs on cluster" + ) + + self.env_vars |= { + "CUDA_DEVICE_MAX_CONNECTIONS": "1", # Disable GPU communication/computation overlap for performance + "TRANSFORMERS_OFFLINE": "1", # Disable online downloads from HuggingFace + "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", # Disable caching NCCL communication buffer memory + "NCCL_NVLS_ENABLE": "0", # Disable NVLink SHARP to save memory + } + + +def create_slurm_executor( + slurm_cfg: SlurmConfig, nodes: int = 1, ntasks_per_node: int = 1, num_gpus: int = 0 +): + # Configure tunnel + if slurm_cfg.use_local_tunnel: + # Use LocalTunnel when already on the cluster + tunnel = run.LocalTunnel(job_dir=slurm_cfg.job_dir) + else: + # Use SSH tunnel when launching from local machine + tunnel = run.SSHTunnel( + host=slurm_cfg.host, + user=slurm_cfg.user, + job_dir=slurm_cfg.job_dir, + identity=slurm_cfg.identity, # can be None + ) + + if num_gpus > 0: + return run.SlurmExecutor( + account=slurm_cfg.account, + partition=slurm_cfg.partition_gpu, + ntasks_per_node=ntasks_per_node, + gpus_per_node=num_gpus, + nodes=nodes, + tunnel=tunnel, + container_image=slurm_cfg.container_image, + container_mounts=slurm_cfg.container_mounts, + time=slurm_cfg.time, + packager=run.GitArchivePackager(), + mem="0", + gres=f"gpu:{num_gpus}", + ) + else: + return run.SlurmExecutor( + account=slurm_cfg.account, + partition=slurm_cfg.partition_cpu, + nodes=nodes, + tunnel=tunnel, + container_image=slurm_cfg.container_image, + 
container_mounts=slurm_cfg.container_mounts, + time=slurm_cfg.time, + packager=run.GitArchivePackager(), + mem="0", + ) + + +def get_finetune_recipe(recipe_name: str): + if not hasattr(getattr(llm, recipe_name), "finetune_recipe"): + raise ValueError(f"Recipe {recipe_name} does not have a Fine-Tuning recipe") + return getattr(llm, recipe_name).finetune_recipe(peft_scheme=None) + + +def read_chat_template(template_path: str): + with open(template_path) as f: + return f.read().strip() + + +def download_hf_dataset(dataset_name: str, output_dir: str | None = None): + """Download a dataset from HuggingFace Hub using huggingface-cli.""" + cmd = ["huggingface-cli", "download", dataset_name, "--repo-type", "dataset"] + + if output_dir: + cmd.extend(["--local-dir", output_dir]) + + subprocess.run(cmd, check=True) + print(f"Successfully downloaded dataset: {dataset_name}") diff --git a/examples/nemo_run/qat/ADVANCED.md b/examples/nemo_run/qat/ADVANCED.md new file mode 100644 index 000000000..72629a603 --- /dev/null +++ b/examples/nemo_run/qat/ADVANCED.md @@ -0,0 +1,56 @@ +# NeMo QAT/QAD Flow: Advanced Topics + +If you need to run QAT/QAD on a Slurm cluster (for example to use more than 1 node), this guide covers how to configure and launch on Slurm. + +To run the example on slurm, edit the `SLURM_CONFIG` at the bottom of `nemo_qat_flow.py` with the appropriate credentials, container, cluster name (host), and container mounts. Make sure you are mounting the NeMo and Megatron-LM repositories above in the Slurm cluster and that you've checked out the correct commits. + +## Running the Flow on Slurm + +To launch the Flow on a Slurm cluster, modify your Slurm credentials at the bottom of `nemo_qat_flow.py` and add the `--use-slurm` flag to the command. On a different server (e.g. your local server), launch the NeMo container as described in the [README](README.md) then run `python qat/nemo_qat_flow.py --use-slurm --log-dir /slurm/log/dir`, which will `ssh` into the Slurm cluster, `rsync` your files over, and launch the tasks. The log directory on the Slurm cluster should look like this after an experiment is run (assuming your experiment name is `qat_flow_ckpts`) + +```bash +qat_flow_ckpts qat_flow_ckpts_1755708286 +``` + +If you `cd` into the experiment itself, e.g. `cd qat_flow_ckpts_1755708286`, you'll find a directory structure like the following. Each folder is for a stage of the Simplified Flow, and in each stage you can see the logs for that stage as well as the sbatch command that was run. You can `cd` into each stage and `tail -f` the log file to see the logs while the stage is running. 
+ +```bash +├── 00_openscience_data +│   ├── code +│   ├── configs +│   ├── log-coreai_dlalgo_modelopt-modelopt.00_openscience_data_5345664_0.out +│   └── sbatch_coreai_dlalgo_modelopt-modelopt.00_openscience_data_5345664.out +├── 01_import_model +│   ├── code +│   ├── configs +│   ├── log-coreai_dlalgo_modelopt-modelopt.01_import_model_5345665_0.out +│   └── sbatch_coreai_dlalgo_modelopt-modelopt.01_import_model_5345665.out +├── 02_mmlu_bf16 +│   ├── code +│   ├── configs +│   ├── log-coreai_dlalgo_modelopt-modelopt.02_mmlu_bf16_5345666_0.out +│   └── sbatch_coreai_dlalgo_modelopt-modelopt.02_mmlu_bf16_5345666.out +├── 03_ptq +│   ├── code +│   ├── configs +│   ├── log-coreai_dlalgo_modelopt-modelopt.03_ptq_5345667_0.out +│   └── sbatch_coreai_dlalgo_modelopt-modelopt.03_ptq_5345667.out +├── 04_mmlu_ptq +│   ├── code +│   ├── configs +│   ├── log-coreai_dlalgo_modelopt-modelopt.04_mmlu_ptq_5345668_0.out +│   └── sbatch_coreai_dlalgo_modelopt-modelopt.04_mmlu_ptq_5345668.out +├── 05_train +│   ├── code +│   ├── configs +│   ├── log-coreai_dlalgo_modelopt-modelopt.05_train_5345669_0.out +│   └── sbatch_coreai_dlalgo_modelopt-modelopt.05_train_5345669.out +├── 06_mmlu_sft +│   ├── code +│   └── configs +├── 07_export_hf +│   ├── code +│   └── configs +``` + +**NOTE:** `rsync` may not currently be available in the NeMo container and will be added as a dependency. diff --git a/examples/nemo_run/qat/README.md b/examples/nemo_run/qat/README.md index 3cecf7c62..79715953c 100644 --- a/examples/nemo_run/qat/README.md +++ b/examples/nemo_run/qat/README.md @@ -1,35 +1,86 @@ +
+ # NeMo QAT/QAD Simplified Flow Example +[Slurm Examples](ADVANCED.md) | +[Advanced Topics](ADVANCED.md) | +[NeMo Integration](https://github.com/NVIDIA-NeMo/NeMo/tree/main/nemo/collections/llm/modelopt) + +
+ ## Overview -This directory also contains an end-to-end NeMo QAT Simplified Flow example, which supports both QAT with cross-entropy loss and QAD (quantization-aware distillation) with knowledge-distillation loss between the full-precision teacher and quantized student models. +This directory contains an end-to-end QAT Simplified Flow example using NeMo for model training. It supports both QAT with cross-entropy loss and QAD (quantization-aware distillation) with knowledge-distillation loss between the BF16 teacher and quantized student models. + +After PTQ (post-training quantization), the quantized model may show some accuracy degradation on tasks like MMLU; the QAT/QAD stages aim to recover that loss. + +## Flow Stages + +The Simplified Flow runs the following steps in order: + +1. 00_openscience_data — Process NVIDIA/OpenScience data (skipped if `--data-path` is given) +1. 01_import_model — Import NeMo BF16 model checkpoint +1. 02_mmlu_bf16 — Evaluate 5% MMLU on BF16 checkpoint +1. 03_ptq — Apply PTQ +1. 04_mmlu_ptq — Evaluate 5% MMLU on PTQ checkpoint +1. 05_train — SFT/QAT (and optional QAD) +1. 06_mmlu_sft — Evaluate 5% MMLU on SFT/QAT checkpoint +1. 07_export_hf — Export to Hugging Face (Unified) format + +```mermaid +graph TD; +00_openscience_data-->05_train; +01_import_model-->02_mmlu_bf16; +01_import_model-->03_ptq; +03_ptq-->04_mmlu_ptq; +03_ptq-->05_train; +05_train-->06_mmlu_sft; +05_train-->07_export_hf; +``` + +## Results + +QAT of Qwen3-8B NVFP4 recovers most of the accuracy on the MMLU benchmark after NVFP4 PTQ. We finetune the Qwen3-8B NVFP4 checkpoint for 200 steps with a learning rate of 1e-5 and global batch size of 512 on one node of 8 x H100 GPUs. + +| | MMLU 5% | +|---------------------------|---------| +| Qwen3-8B FP16 | 73.8 | +| Qwen3-8B NVFP4 | 70.3 | +| Qwen3-8B NVFP4 after QAT | 72.8 | + +The resulting exported checkpoint also is much smaller in memory at 6.4GB compared to the original BF16 checkpoint which is 16.4 GB. ## Usage ### Prerequisites -To run the example, launch a [NeMo container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo) with version 25.04.01 or higher using Docker/Slurm. Mount your cloned `modelopt` repository to the container by adding this mount flag to your Docker/Slurm command: `-v :/workspace/modelopt -v /modelopt:/usr/local/lib/python3.12/dist-packages/modelopt`. +You can run the example either locally or on a [Slurm cluster](ADVANCED.md). -To run SFT properly you may also need to clone NeMo and Megatron-LM at the respective commits, and mount to `/opt/NeMo` and `/opt/megatron-lm`: +To run the example locally, launch a [NeMo container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo) with version 25.07 or higher. Clone the `TensorRT-Model-Optimizer` repository and `NeMo` repository (checkout a specific commit for NeMo), then mount it onto your docker container. 
-- `git clone https://github.com/NVIDIA-NeMo/NeMo && cd NeMo && git checkout d7b87b1`
-- `git clone https://github.com/NVIDIA/Megatron-LM.git && cd Megatron-LM && git checkout 8c15450`
+- `git clone https://github.com/NVIDIA/TensorRT-Model-Optimizer.git`
+- `git clone https://github.com/NVIDIA-NeMo/NeMo.git && cd NeMo && git checkout 676ed1a`
 
-### Running the Flow
+Example docker command:
+
+```bash
+docker run -v /home/user/:/home/user/ -v /home/user/NeMo:/opt/NeMo -v /home/user/TensorRT-Model-Optimizer/modelopt/:/usr/local/lib/python3.12/dist-packages/modelopt --gpus all -it --shm-size 20g --rm nvcr.io/nvidia/nemo:25.07 bash
+```
+
+You will also need to set your Hugging Face token with `export HF_TOKEN=<your-hf-token>`. You may also need to give the container write access to the `examples/nemo_run` folder (for example, `chmod 777 nemo_run`) so that logs can be written.
+
+### Running the Flow Locally
+
+After launching the NeMo container with the mounts above, follow these examples to run the flow locally.
 
 #### QAT
 
-From the `nemo_run` folder, launch the example with `python qat/nemo_qat_flow.py --model-name --finetune-recipe `. Available NeMo recipe names are listed [here](https://github.com/NVIDIA-NeMo/NeMo/tree/main/nemo/collections/llm/recipes). To provide your own custom dataset, use the `--data-path` flag, otherwise the default [LIMA](https://huggingface.co/datasets/GAIR/lima) dataset will be used.
+From the `nemo_run` folder, launch the example with the `qat/nemo_qat_flow.py` script. To use a model other than the default (Qwen3-8B), add the `--model-name <hf-model-name> --finetune-recipe <recipe-name>` flags, using the model's Hugging Face name and one of the NeMo recipe names listed [here](https://github.com/NVIDIA/NeMo/tree/main/nemo/collections/llm/recipes). To provide your own custom dataset, use the `--data-path` flag; otherwise the default [NVIDIA OpenScience](https://huggingface.co/datasets/nvidia/OpenScience) dataset will be used.
 
 To perform QAT, run:
 
 ```bash
-python qat/nemo_qat_flow.py \
-    --model-name meta-llama/Meta-Llama-3.1-8B-Instruct \
-    --finetune-recipe llama31_8b \
-    --algorithm fp8 \
-    --chat-template llama_chat_template.txt \
-    --experiment llama3_qat_nemo
+python qat/nemo_qat_flow.py --log-dir /my/log/dir --experiment qat_experiment
 ```
 
 > **_NOTE:_** To enable KV cache quantization, add `--enable-kv-cache` and specify qformat using `--kv-cache-qformat `.
@@ -41,31 +92,26 @@ In order to train using QAD, launch the example with `python qat/nemo_qat_flow.p
 To perform QAD training, run:
 
 ```bash
-python qat/nemo_qat_flow.py \
-    --model-name meta-llama/Meta-Llama-3.1-8B-Instruct \
-    --distill \
-    --algorithm fp8 \
-    --chat-template llama_chat_template.txt \
-    --experiment llama3_qad_nemo
+python qat/nemo_qat_flow.py --distill --log-dir /my/log/dir --experiment qad_experiment
 ```
 
-### Custom Chat Template
+## Supported models
 
-By default the script will use the model/tokenizer's chat template, which may not contain the `{% generation %}` and `{% endgeneration %}` tags around the assistant tokens which are needed to generate the assistant loss mask (see [this PR](https://github.com/huggingface/transformers/pull/30650)). To provide path to a custom chat template, use the `--chat-template ` flag.
+When run locally, this script currently supports models that can be trained on 1 node with 8 x 80GB GPUs. On Slurm, you can configure the number of nodes/GPUs for training and PTQ with the `--train-nodes`, `--train-gpus`, and `--ptq-gpus` flags.
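+
+For example, a 2-node Slurm run might look like the following sketch. It is illustrative only and assumes the `SlurmConfig` block at the bottom of `qat/nemo_qat_flow.py` (account, partitions, container image and mounts, job directory) has been filled in for your cluster; adjust the log directory and experiment name to your setup:
+
+```bash
+# Hypothetical multi-node Slurm launch: 2 nodes x 8 GPUs for training, 4 GPUs for PTQ
+python qat/nemo_qat_flow.py --use-slurm \
+    --train-nodes 2 --train-gpus 8 --ptq-gpus 4 \
+    --log-dir /path/to/logs --experiment qat_slurm_experiment
+```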
-## Flow Stages +The default configuration works on 1 node with 4 H100 GPUs for PTQ and 8 H100 GPUs for training with the following model: -Currently the Simplified Flow runs the following steps in order: +- **Model**: Qwen3-8B +- **Recipe**: qwen3_8b -1. Process LIMA data (if `--data-path` is not specified) -1. Import NeMo model checkpoint -1. PTQ the model -1. SFT (finetune) the model -1. Export model to Unified checkpoint (HuggingFace) format +### Common Errors -## Supported models +Depending on the amount of memory your GPUs have, you may get an Out of Memory error. If that happens, add flags for `--tensor_parallelism` or `--pipeline_parallelism` (e.g. `--tensor_parallelism 2`). + +### Custom Chat Template + +By default the script will use the model/tokenizer's chat template, which may not contain the `{% generation %}` and `{% endgeneration %}` tags around the assistant tokens which are needed to generate the assistant loss mask (see [this PR](https://github.com/huggingface/transformers/pull/30650)). To provide path to a custom chat template, use the `--chat-template ` flag. -Currently supports models that can be trained on 1 node with 8 x 80GB GPUs. The default configuration uses: +### Dataset limitations -- **Model**: Meta-Llama-3.1-8B-Instruct -- **Recipe**: llama31_8b +The current QAT recipe has been tuned for the Qwen3-8B model to improve accuracy on the MMLU benchmark after PTQ degradation. QAT/QAD results are highly dependent on the specific model, dataset, and hyperparameters. There is no guarantee that the same dataset will recover the accuracy of the PTQ model. Feel free to try your own model and dataset combinations and test which combination works best. diff --git a/examples/nemo_run/qat/nemo_qat_flow.py b/examples/nemo_run/qat/nemo_qat_flow.py index 5b1894108..df921bd19 100644 --- a/examples/nemo_run/qat/nemo_qat_flow.py +++ b/examples/nemo_run/qat/nemo_qat_flow.py @@ -15,21 +15,26 @@ import argparse import os -from pathlib import Path +import sys import nemo_run as run from nemo.collections import llm -from nemo.collections.llm.api import export_ckpt from nemo.collections.llm.gpt.data.chat import ChatDataModule from nemo.collections.llm.modelopt.quantization.quant_cfg_choices import get_quant_cfg_choices from nemo.collections.llm.modelopt.recipes.distillation_recipe import distillation_recipe from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer -from nemo.utils import logging +from modelopt.torch.export.plugins.nemo_run import export_most_recent_ckpt -def get_parser(): +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "common"))) +from utils import SlurmConfig, create_slurm_executor, get_finetune_recipe, read_chat_template + + +def get_args(): parser = argparse.ArgumentParser( - description="NeMo2.0 QAT/QAD simplified flow. Currently supports running model locally on 1 node with 8 GPUs." + description="""NeMo2.0 QAT/QAD simplified flow. Supports running model locally on 1 node with 8 GPUs + or on a Slurm cluster with 1 or more nodes. Runs QAT on Qwen3-8B NVFP4 with the + nvidia/OpenScience dataset by default.""" ) quant_cfg_choices_list = ["no_quant", *get_quant_cfg_choices()] @@ -37,12 +42,12 @@ def get_parser(): "--model-name", type=str, help="Name of the HF model", - default="meta-llama/Meta-Llama-3.1-8B-Instruct", + default="Qwen/Qwen3-8B", ) parser.add_argument( "--finetune-recipe", type=str, - default="llama31_8b", + default="qwen3_8b", help=( "Choose NeMo 2.0 recipe. 
Recipes are named in the format of " "_(_ or other special settings)" @@ -53,6 +58,12 @@ def get_parser(): type=str, help="Path to the finetuning chat dataset. Can be either ShareGPT or HuggingFace/OpenAI chat format", ) + parser.add_argument( + "--learning-rate", + type=float, + help="Learning rate", + default=1e-5, + ) parser.add_argument( "--distill", action="store_true", @@ -71,15 +82,14 @@ def get_parser(): required=False, ) parser.add_argument( - "-algo", "--algorithm", type=str, - default="fp8", + default="nvfp4", choices=quant_cfg_choices_list, help="TensorRT-Model-Optimizer quantization algorithm", ) parser.add_argument( - "--slurm", + "--use-slurm", action="store_true", help="Run on slurm using run.SlurmExecutor", default=False, @@ -91,11 +101,32 @@ def get_parser(): default="qat_flow_ckpts", ) parser.add_argument( - "--ptq_gpus", + "--log-dir", + type=str, + help=( + "Path to the directory to store logs. Best to pass in a non-relative path so that " + "artifacts are stored in one location." + ), + default="logs", + ) + parser.add_argument( + "--ptq-gpus", type=int, help="Number of GPUs for quantization. Some models require a different number of GPUs for PTQ vs training.", + default=4, + ) + parser.add_argument( + "--train-gpus", + type=int, + help="Number of GPUs for training", default=8, ) + parser.add_argument( + "--train-nodes", + type=int, + help="Number of nodes for training. Does not apply to PTQ (assumes model will fit in 1 node)", + default=1, + ) parser.add_argument( "--kv-cache-qformat", type=str, @@ -104,91 +135,58 @@ def get_parser(): help="KV-cache quantization format", ) parser.add_argument( - "--enable_kv_cache", help="Enables KV-cache quantization", action="store_true" - ) - parser.add_argument("--disable_kv_cache", dest="enable_kv_cache", action="store_false") - parser.set_defaults(enable_kv_cache=None) - return parser - - -def get_finetune_recipe(recipe): - assert hasattr(llm, recipe), ( - f"Recipe named {recipe} not found. General format is _(_ " - "or other special settings)" + "--enable_kv_cache", + help="Enables KV-cache quantization", + action="store_true", + default=False, ) - finetune_recipe = getattr(llm, recipe).finetune_recipe - return finetune_recipe(peft_scheme=None) # TODO add dir - - -def get_most_recent_subdir(directory: str): - """ - Find the most recent subdirectory in a given directory. 
- - Args: - directory (str): Path to the directory to search in - - Returns: - str: Path to the most recent subdirectory, or None if no subdirectories exist - """ - dir_path = Path(directory) - # Get all subdirectories - subdirs = [d for d in dir_path.iterdir() if d.is_dir()] - if not subdirs: - return None + parser.add_argument("--tensor_parallelism", type=int, default=2) + parser.add_argument("--pipeline_parallelism", type=int, default=1) + return parser.parse_args() - # Sort by modification time (most recent first) - most_recent = max(subdirs, key=lambda x: x.stat().st_mtime) - return str(most_recent) - -def export_most_recent_ckpt(exp_dir: str, output_path: str): - """ - Args: - exp_dir: experiment directory - output_path: path to write exported model - """ - most_recent_exp = get_most_recent_subdir(f"{exp_dir}/default/") - if "checkpoints" in most_recent_exp: - most_recent_ckpt = most_recent_exp - else: - most_recent_ckpt = get_most_recent_subdir(f"{most_recent_exp}/checkpoints/") - logging.info(f"Exporting checkpoint from {most_recent_ckpt}") - export_ckpt(most_recent_ckpt, "hf", output_path) - - -def _read_chat_template(template_path: str): - with open(template_path) as f: - return f.read().strip() - - -if __name__ == "__main__": - args = get_parser().parse_args() +def main(args): if not args.distill and not args.finetune_recipe: raise ValueError("If distillation is not used, --finetune-recipe must be specified") model_name = args.finetune_recipe model_module = getattr(llm, model_name) if not model_name: model_name = os.path.basename(args.model_name) + exp_dir = f"{args.log_dir.rstrip('/')}/{args.experiment}" # 1. Process data - lima_data = run.Script("process_lima.py", entrypoint="python") + # TODO figure out path + # LOCALLY common/process.py works + # On slurm examples/nemo_run/common/process.py works + + openscience_path = os.path.abspath( + os.path.join(os.path.dirname(__file__), "../common/process_openscience.py") + ) + openscience_data = run.Script( + openscience_path + if not args.use_slurm + else "examples/nemo_run/common/process_openscience.py", + entrypoint="python", + args=["--output-dir", exp_dir], + ) # 2. Import Model - nemo_ckpt_path = f"{model_name}-nemo" + bf16_ckpt_path = f"{exp_dir}/{model_name}-nemo" import_model = run.Partial( llm.import_ckpt, model=model_module.model(), source=f"hf://{args.model_name}", - output_path=nemo_ckpt_path, + output_path=bf16_ckpt_path, + overwrite=True, ) # 3. PTQ - ptq_model_out = f"{model_name}-{args.algorithm}" + ptq_model_out = f"{exp_dir}/{model_name}-{args.algorithm}" ptq = run.Script( "/opt/NeMo/scripts/llm/ptq.py", args=[ "-nc", - nemo_ckpt_path, + bf16_ckpt_path, "-out", ptq_model_out, "--export_format", @@ -206,64 +204,179 @@ def _read_chat_template(template_path: str): # 4. 
Train if not args.hf_tokenizer: - tokenizer_path = os.path.join(nemo_ckpt_path, "context/nemo_tokenizer") + tokenizer_path = os.path.join(bf16_ckpt_path, "context/nemo_tokenizer") tokenizer = run.Config( get_nmt_tokenizer, library="huggingface", model_name=tokenizer_path, - chat_template=_read_chat_template(args.chat_template) if args.chat_template else None, + chat_template=read_chat_template(args.chat_template) if args.chat_template else None, ) else: tokenizer = run.Config( get_nmt_tokenizer, library="huggingface", model_name=args.hf_tokenizer, - chat_template=_read_chat_template(args.chat_template) if args.chat_template else None, + chat_template=read_chat_template(args.chat_template) if args.chat_template else None, ) - data_path = args.data_path if args.data_path is not None else "lima_processed" + data_path = args.data_path if args.data_path is not None else f"{exp_dir}/openscience_proc" data = run.Config( ChatDataModule, dataset_root=data_path, - seq_length=4096, + seq_length=SEQUENCE_LENGTH, tokenizer=tokenizer, - global_batch_size=64, - micro_batch_size=1, + global_batch_size=GBS, + micro_batch_size=MBS, use_hf_tokenizer_chat_template=True, + num_workers=2, + persistent_workers=True, ) if args.distill: - train = distillation_recipe(ptq_model_out, nemo_ckpt_path) + train = distillation_recipe(ptq_model_out, bf16_ckpt_path) else: train = get_finetune_recipe(args.finetune_recipe) train.resume.restore_config.path = ptq_model_out + train.optim.config.lr = args.learning_rate train.tokenizer = "data" train.data = data - train.log.log_dir = args.experiment - train.trainer.val_check_interval = 200 - train.trainer.max_steps = 200 + train.log.log_dir = exp_dir + train.trainer.val_check_interval = VAL_INTERVAL + train.trainer.max_steps = TRAIN_STEPS + train.trainer.devices = args.train_gpus + train.trainer.num_nodes = args.train_nodes + train.trainer.limit_val_batches = 32 + train.trainer.strategy.tensor_model_parallel_size = args.tensor_parallelism + train.trainer.strategy.pipeline_model_parallel_size = args.pipeline_parallelism # 5. Export export = run.Partial( - export_most_recent_ckpt, exp_dir=train.log.log_dir, output_path=f"{model_name}_hf" + export_most_recent_ckpt, train.log.log_dir, output_path=f"{exp_dir}/{model_name}_hf" + ) + # 6. 
Evaluate MMLU + + mmlu_script_path = os.path.abspath( + os.path.join(os.path.dirname(__file__), "../common/in_memory_mmlu.py") + ) + if args.use_slurm: + mmlu_script_path = "examples/nemo_run/common/in_memory_mmlu.py" + eval_ptq = run.Script( + mmlu_script_path, + args=["--nemo_ckpt", ptq_model_out, "--tensor_parallelism", f"{args.ptq_gpus}"], + entrypoint="python", + ) + eval_bf16 = run.Script( + mmlu_script_path, + args=["--nemo_ckpt", bf16_ckpt_path, "--tensor_parallelism", f"{args.ptq_gpus}"], + entrypoint="python", + ) + eval_sft = run.Script( + mmlu_script_path, + args=["--finetuned_ckpt_dir", exp_dir, "--tensor_parallelism", f"{args.ptq_gpus}"], + entrypoint="python", ) - with run.Experiment(args.experiment, log_level="INFO") as exp: - ptq_executor = run.LocalExecutor(ntasks_per_node=args.ptq_gpus, launcher="torchrun") + if args.use_slurm: + cpu_executor = create_slurm_executor(SLURM_CONFIG) + ptq_gpu_executor = create_slurm_executor( + SLURM_CONFIG, num_gpus=args.ptq_gpus, ntasks_per_node=args.ptq_gpus + ) + train_gpu_executor = create_slurm_executor( + SLURM_CONFIG, num_gpus=args.train_gpus, ntasks_per_node=args.train_gpus + ) + single_gpu_executor = create_slurm_executor(SLURM_CONFIG, num_gpus=1, ntasks_per_node=1) + else: + cpu_executor = single_gpu_executor = run.LocalExecutor() + ptq_gpu_executor = run.LocalExecutor(launcher="torchrun", ntasks_per_node=args.ptq_gpus) + train_gpu_executor = run.LocalExecutor(launcher="torchrun", ntasks_per_node=args.train_gpus) + + with run.Experiment(exp_dir, log_level="INFO") as exp: if not args.data_path: - s0 = exp.add(lima_data, tail_logs=True, name="lima_data", executor=run.LocalExecutor()) + s0 = exp.add( + openscience_data, tail_logs=True, name="00_openscience_data", executor=cpu_executor + ) + # 1. Import BF16 model and evaluate MMLU s1 = exp.add( - import_model, tail_logs=True, name="import_model", executor=run.LocalExecutor() + import_model, tail_logs=True, name="01_import_model", executor=single_gpu_executor + ) + exp.add( + eval_bf16, + tail_logs=True, + name="02_mmlu_bf16", + executor=ptq_gpu_executor, + dependencies=[s1], + ) + + # 2. PTQ model and evaluate PTQ model + s2 = exp.add( + ptq, tail_logs=True, name="03_ptq", executor=ptq_gpu_executor, dependencies=[s1] ) - s2 = exp.add(ptq, tail_logs=True, name="ptq", executor=ptq_executor, dependencies=[s1]) - train_executor = run.LocalExecutor(ntasks_per_node=8, launcher="torchrun") s3 = exp.add( - train, tail_logs=True, name="train", executor=train_executor, dependencies=[s2] + eval_ptq, + tail_logs=True, + name="04_mmlu_ptq", + executor=ptq_gpu_executor, + dependencies=[s2], ) + # 3. 
Train PTQ model (QAT or QAD) + train_dep = [s3] + if not args.data_path: + train_dep.append(s0) s4 = exp.add( + train, + tail_logs=True, + name="05_train", + executor=train_gpu_executor, + dependencies=train_dep, + ) + s5 = exp.add( + eval_sft, + tail_logs=True, + name="06_mmlu_sft", + executor=ptq_gpu_executor, + dependencies=[s4], + ) + # WAR: Export needs access to all GPUs but only 1 task due to bug in NeMo + train_gpu_executor.ntasks_per_node = 1 # will throw error if more than 1 task during export + exp.add( export, tail_logs=True, - name="export_hf", - executor=run.LocalExecutor(), - dependencies=[s3], + name="07_export_hf", + executor=train_gpu_executor, + dependencies=[s5], + ) + exp.run(detach=True) + + +if __name__ == "__main__": + args = get_args() + + # # # # # # # # SLURM SETUP # # # # # # + # # # # # # MODIFY THIS # # # # # # # + if args.use_slurm: + SLURM_CONFIG = SlurmConfig( + account="", + partition_gpu="batch", + partition_cpu="cpu", + time="04:00:00", + container_image="nvcr.io/nvidia/nemo:25.07", + env_vars={ + "HF_TOKEN": "", + }, + use_local_tunnel=False, + host="", + user="", + container_mounts=[], + job_dir="/path/to/logs", + identity=None, ) - exp.run(detach=False) + + # # # # # # # # # # # # # # # # # # # # # # + # # # # # CONFIGURABLE PARAMETERS # # # # # + SEQUENCE_LENGTH = 4096 + MBS = 1 + GBS = 512 + TRAIN_STEPS = 200 + VAL_INTERVAL = 50 + # # # # # # # # # # # # # # # # # # # # # # + + main(args) diff --git a/modelopt/torch/export/plugins/nemo_run.py b/modelopt/torch/export/plugins/nemo_run.py new file mode 100644 index 000000000..63cd7fbe0 --- /dev/null +++ b/modelopt/torch/export/plugins/nemo_run.py @@ -0,0 +1,71 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Export functions for NeMo Run.""" + +from pathlib import Path + +from nemo.collections.llm.api import export_ckpt +from nemo.utils import logging + + +def export_most_recent_ckpt(directory: str, output_path: str): + """Export most recent checkpoint from a NeMo Run experiment directory.""" + most_recent_ckpt = _get_most_recent_ckpt(directory) + logging.info(f"Exporting most recent NeMo Run checkpoint: {most_recent_ckpt}") + export_ckpt( + most_recent_ckpt, + "hf", + output_path=output_path, + overwrite=True, + ) + + +def _get_most_recent_subdir(directory: Path): + # Get all subdirectories + subdirs = [d for d in directory.iterdir() if d.is_dir()] + if not subdirs: + raise ValueError(f"No subdirectories found in {directory}") + + # Sort by modification time (most recent first) + most_recent = max(subdirs, key=lambda x: x.stat().st_mtime) + + return most_recent + + +def _get_most_recent_ckpt(directory: str): + """Find the most recent checkpoint subdirectory in a given NeMo Run experiment directory. + + Args: + directory (str): Path to the directory to search in. + + Returns: + str: Path to the most recent subdirectory. 
+ """ + exp_dir = Path(directory) / "default" + if not exp_dir.exists(): + raise FileNotFoundError(f"Experiment directory {exp_dir} does not exist") + + checkpoint_dir = exp_dir / "checkpoints" + if checkpoint_dir.exists(): + most_recent = _get_most_recent_subdir(checkpoint_dir) + else: + most_recent = _get_most_recent_subdir(exp_dir) + checkpoint_dir = most_recent / "checkpoints" + if not checkpoint_dir.exists(): + raise FileNotFoundError(f"Checkpoint directory {checkpoint_dir} does not exist") + most_recent = _get_most_recent_subdir(checkpoint_dir) + + return str(most_recent)
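
If the flow stops before the final stage, this helper can also be invoked by hand to redo just the Hugging Face export. A minimal sketch, assuming the flow was run with `--log-dir /my/log/dir --experiment qat_experiment` (so checkpoints live under `/my/log/dir/qat_experiment/default/`) and that the output directory name is an arbitrary choice:

```bash
# Re-run only the export step against the most recent checkpoint in the experiment directory
python -c "from modelopt.torch.export.plugins.nemo_run import export_most_recent_ckpt; \
export_most_recent_ckpt('/my/log/dir/qat_experiment', '/my/log/dir/qat_experiment/qwen3_8b_hf')"
```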