
Commit e55f19f

Add OpenAI Whisper support (#125)
Co-Authored-By: MahmoudAshraf97 <[email protected]>
1 parent 92f430f commit e55f19f

11 files changed: +319 −96 lines changed

CHANGELOG.rst

Lines changed: 1 addition & 0 deletions
@@ -10,6 +10,7 @@ Model Optimizer Changelog (Linux)
 
 **New Features**
 
+- New model support in the ``llm_ptq`` example: OpenAI Whisper.
 - Blockwise FP8 quantization support in unified model export.
 - Add quantization support to the Transformer Engine Linear module.
 - Add support for SVDQuant. Currently, only simulation is available; real deployment (for example, TensorRT deployment) support is coming soon.

examples/llm_ptq/README.md

Lines changed: 1 addition & 0 deletions
@@ -115,6 +115,7 @@ InternLM2 | Yes | No | Yes | Yes<sup>3</sup> | -
 Exaone | Yes | Yes | Yes | Yes | -
 Minitron | Yes | Yes | Yes | Yes<sup>2</sup> | Yes
 T5 | Yes | Yes | Yes | Yes | -
+Whisper | Yes | No | No | No | -
 
 > *<sup>1.</sup>The w4a8_awq is an experimental quantization scheme that may result in a higher accuracy penalty.*

examples/llm_ptq/example_utils.py

Lines changed: 33 additions & 52 deletions
@@ -21,47 +21,6 @@
 
 from modelopt.torch.utils.image_processor import MllamaImageProcessor
 
-MODEL_NAME_PATTERN_MAP = {
-    "GPT2": "gpt",
-    "Mllama": "mllama",
-    "Llama": "llama",
-    "Mistral": "llama",
-    "GPTJ": "gptj",
-    "FalconForCausalLM": "falcon",
-    "RWForCausalLM": "falcon",
-    "baichuan": "baichuan",
-    "MPT": "mpt",
-    "Bloom": "bloom",
-    "ChatGLM": "chatglm",
-    "QWen": "qwen",
-    "RecurrentGemma": "recurrentgemma",
-    "Gemma2": "gemma2",
-    "Gemma": "gemma",
-    "phi3small": "phi3small",
-    "phi3": "phi3",
-    "PhiMoEForCausalLM": "phi3",
-    "phi": "phi",
-    "TLGv4ForCausalLM": "phi",
-    "MixtralForCausalLM": "llama",
-    "ArcticForCausalLM": "llama",
-    "StarCoder": "gpt",
-    "Dbrx": "dbrx",
-    "T5": "t5",
-    "Bart": "bart",
-    "GLM": "glm",
-    "InternLM2ForCausalLM": "internlm",
-    "ExaoneForCausalLM": "exaone",
-    "Nemotron": "gpt",
-    "Deepseek": "deepseek",
-}
-
-
-def get_model_type(model):
-    for k, v in MODEL_NAME_PATTERN_MAP.items():
-        if k.lower() in type(model).__name__.lower():
-            return v
-    return None
-
 
 def get_mode_type_from_engine_dir(engine_dir_str):
     # Split the path by '/' and get the last part
@@ -106,20 +65,36 @@ def get_tokenizer(ckpt_path, trust_remote_code=False, **kwargs):
     return tokenizer
 
 
-def get_processor(ckpt_path, device=None, trust_remote_code=False):
+def get_processor(ckpt_path, model_type, device=None, trust_remote_code=False):
     """
     Returns a :class:`modelopt.torch.utils.image_processor.MllamaImageProcessor` object.
     """
-    processor = AutoProcessor.from_pretrained(
-        ckpt_path,
-        padding_side="left",
-        trust_remote_code=trust_remote_code,
-    )
-    if processor.tokenizer.pad_token is None:
-        processor.tokenizer.pad_token = processor.tokenizer.eos_token
-    assert processor.tokenizer.pad_token is not None, f"Pad token for {ckpt_path} cannot be set!"
+    if model_type == "whisper":
+        processor = AutoProcessor.from_pretrained(
+            ckpt_path,
+            padding_side="left",
+            trust_remote_code=trust_remote_code,
+        )
+        if processor.tokenizer.pad_token is None:
+            processor.tokenizer.pad_token = processor.tokenizer.eos_token
+        assert processor.tokenizer.pad_token is not None, (
+            f"Pad token for {ckpt_path} cannot be set!"
+        )
 
-    return MllamaImageProcessor(processor, device)
+        return processor
+    elif model_type == "mllama":
+        processor = AutoProcessor.from_pretrained(
+            ckpt_path,
+            padding_side="left",
+            trust_remote_code=trust_remote_code,
+        )
+        if processor.tokenizer.pad_token is None:
+            processor.tokenizer.pad_token = processor.tokenizer.eos_token
+        assert processor.tokenizer.pad_token is not None, (
+            f"Pad token for {ckpt_path} cannot be set!"
+        )
+
+        return MllamaImageProcessor(processor, device)
 
 
 def get_dtype(dtype):
@@ -179,6 +154,12 @@ def get_model(ckpt_path, device="cuda", gpu_mem_percentage=0.8, trust_remote_cod
             model = AutoModelForSeq2SeqLM.from_pretrained(
                 ckpt_path, device_map=None, **model_kwargs
             ).to(device)
+        elif hf_config.model_type == "whisper":
+            from transformers import WhisperForConditionalGeneration
+
+            model = WhisperForConditionalGeneration.from_pretrained(
+                ckpt_path, device_map=device_map, **model_kwargs
+            )
         elif hf_config.model_type == "glm":
             from transformers import AutoModelForSeq2SeqLM
 
@@ -246,4 +227,4 @@ def is_model_on_gpu(model) -> bool:
 
 def is_enc_dec(model_type) -> bool:
     """Return if the model is a encoder-decoder model."""
-    return model_type in ["t5", "bart"]
+    return model_type in ["t5", "bart", "whisper"]
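The helpers above tie together as follows for a Whisper checkpoint. A minimal usage sketch, assuming it is run from `examples/llm_ptq` with a CUDA device available and using the public `openai/whisper-tiny` checkpoint as a stand-in:

```python
# Illustrative only: get_processor now takes model_type and, for Whisper, returns the plain
# AutoProcessor (a WhisperProcessor) instead of wrapping it in MllamaImageProcessor; get_model
# dispatches to WhisperForConditionalGeneration; is_enc_dec routes Whisper through the
# encoder-decoder export path.
from example_utils import get_model, get_processor, is_enc_dec

ckpt = "openai/whisper-tiny"  # any Hugging Face Whisper checkpoint
model = get_model(ckpt, device="cuda")
processor = get_processor(ckpt, "whisper", device="cuda")

assert is_enc_dec("whisper")
print(type(model).__name__, type(processor).__name__)
# Expected: WhisperForConditionalGeneration WhisperProcessor
```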

examples/llm_ptq/hf_ptq.py

Lines changed: 66 additions & 15 deletions
@@ -22,7 +22,7 @@
 import numpy as np
 import torch
 from example_utils import get_model, get_processor, get_tokenizer, is_enc_dec, is_model_on_gpu
-from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
+from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast, WhisperProcessor
 
 import modelopt.torch.opt as mto
 import modelopt.torch.quantization as mtq
@@ -39,6 +39,7 @@
 )
 from modelopt.torch.utils.image_processor import MllamaImageProcessor
 from modelopt.torch.utils.memory_monitor import launch_memory_monitor
+from modelopt.torch.utils.speech_dataset_utils import get_speech_dataset_dataloader
 from modelopt.torch.utils.vlm_dataset_utils import get_vlm_dataset_dataloader
 
 RAND_SEED = 1234
@@ -210,7 +211,19 @@ def main(args):
         elif args.dataset != "scienceqa":
             raise ValueError("Only the scienceqa dataset is supported for the mllama model.")
         processor = get_processor(
-            args.pyt_ckpt_path, device, trust_remote_code=args.trust_remote_code
+            args.pyt_ckpt_path, model_type, device, trust_remote_code=args.trust_remote_code
+        )
+    elif model_type == "whisper":
+        if args.dataset is None:
+            args.dataset = "peoples_speech"
+            warnings.warn(
+                "Currently only the peoples_speech dataset is supported for the whisper model. "
+                "Overriding dataset to peoples_speech."
+            )
+        elif args.dataset != "peoples_speech":
+            raise ValueError("Only the peoples_speech dataset is supported for the whisper model.")
+        processor = get_processor(
+            args.pyt_ckpt_path, model_type, device, trust_remote_code=args.trust_remote_code
         )
     else:
         if args.dataset is None:
@@ -273,8 +286,25 @@ def main(args):
         # due to intermediate tensors for fake quantization. Setting sample_memory_usage_ratio
         # to 2 to avoid OOM for AWQ/SmoothQuant fake quantization as it will take more memory than inference.
         sample_memory_usage_ratio = 2 if "awq" in args.qformat or "sq" in args.qformat else 1.1
+        # Whisper model expects mel-spectrogram input features of length 3000
+        # Whisper model needs input of shape (batch_size, num_mel_bins, 3000)
+        # As the encoder of Whisper doesn't have embedding layer, input dtype has to be float
+        # For non-Whisper models (language models), sample_input will be set up inside get_max_batch_size()
+        if model_type == "whisper":
+            max_sample_length = 3000
+            num_mel_bins = model.config.num_mel_bins
+            sample_input_single_batch = (
+                torch.ones([1, num_mel_bins, max_sample_length], dtype=torch.float32).to(
+                    model.device
+                )
+                * 100
+            )
+        else:
+            sample_input_single_batch = None
         args.batch_size = get_max_batch_size(
-            model, sample_memory_usage_ratio=sample_memory_usage_ratio
+            model,
+            sample_memory_usage_ratio=sample_memory_usage_ratio,
+            sample_input_single_batch=sample_input_single_batch,
         )
         if args.batch_size > args.calib_size:
             args.batch_size = args.calib_size
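For context on the (batch_size, num_mel_bins, 3000) sample input above: real Whisper inputs of exactly that shape come out of the processor's feature extractor, which pads or truncates audio to 30 seconds of log-mel frames. A minimal sketch using the public `openai/whisper-tiny` checkpoint, with silence as a stand-in for real speech:

```python
import numpy as np
from transformers import WhisperProcessor

processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
audio = np.zeros(16000 * 5, dtype=np.float32)  # 5 s of 16 kHz audio, stand-in for real speech

# The feature extractor always pads/truncates to 3000 mel frames and returns float features,
# matching the dummy torch.ones([1, num_mel_bins, 3000]) batch used for batch-size probing above.
features = processor(audio, sampling_rate=16000, return_tensors="pt").input_features
print(features.shape, features.dtype)  # torch.Size([1, 80, 3000]) torch.float32
```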
@@ -292,6 +322,17 @@ def main(args):
                 batch_size=args.batch_size,
                 num_samples=args.calib_size,
             )
+        elif model_type == "whisper":
+            assert processor is not None and isinstance(processor, WhisperProcessor), (
+                "The AutoProcessor must be set."
+            )
+            calib_dataloader, first_text = get_speech_dataset_dataloader(
+                dataset_name=args.dataset,
+                processor=processor,
+                batch_size=args.batch_size,
+                num_samples=args.calib_size,
+                device=device,
+            )
         else:
             assert tokenizer is not None and isinstance(
                 tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)
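The speech dataloader returned by `get_speech_dataset_dataloader` is consumed like the text dataloaders: each batch carries float `input_features` (already moved to `device`) that are run through the model while quantizer statistics are collected. A minimal sketch of such a calibration loop, assuming that batch layout; it is not the actual `get_speech_dataset_dataloader` or `quantize_model` implementation:

```python
import torch
import modelopt.torch.quantization as mtq


def make_forward_loop(calib_dataloader):
    def forward_loop(model):
        with torch.no_grad():
            for batch in calib_dataloader:
                # generate() exercises both the Whisper encoder and decoder, so calibration
                # statistics are gathered for every quantized layer in one pass.
                model.generate(batch["input_features"], max_new_tokens=32)

    return forward_loop


# model = mtq.quantize(model, quant_cfg, forward_loop=make_forward_loop(calib_dataloader))
```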
@@ -347,30 +388,40 @@ def main(args):
             quant_cfg["algorithm"] = {"method": "smoothquant", "alpha": 0.5}
 
         # Only run single sample for preview
-        input_ids = next(iter(calib_dataloader))["input_ids"][0:1]
-        generated_ids_before_ptq = model.generate(input_ids, max_new_tokens=100)
+        input_ids = next(iter(calib_dataloader))[
+            "input_features" if model_type == "whisper" else "input_ids"
+        ][0:1]
+        with torch.autocast("cuda"):
+            generated_ids_before_ptq = model.generate(input_ids, max_new_tokens=100)
 
-        model = quantize_model(model, quant_cfg, args, calib_dataloader)
-        if args.compress:
-            mtq.compress(model)
-        # Lets print the quantization summary
-        if args.verbose:
-            mtq.print_quant_summary(model)
+            model = quantize_model(model, quant_cfg, args, calib_dataloader)
+            if args.compress:
+                mtq.compress(model)
+            # Lets print the quantization summary
+            if args.verbose:
+                mtq.print_quant_summary(model)
 
-        # Run some samples
-        generated_ids_after_ptq = model.generate(input_ids, max_new_tokens=100)
+            # Run some samples
+            generated_ids_after_ptq = model.generate(input_ids, max_new_tokens=100)
 
         def input_decode(input_ids):
             if processor is not None and isinstance(processor, MllamaImageProcessor):
                 return processor.tokenizer.batch_decode(input_ids)
+            elif processor is not None and isinstance(processor, WhisperProcessor):
+                return first_text
             elif tokenizer is not None:
                 return tokenizer.batch_decode(input_ids)
             else:
                 raise ValueError("The processor or tokenizer must be set")
 
         def output_decode(generated_ids, input_shape):
-            if tokenizer is not None and is_enc_dec(model_type):
-                return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
+            if is_enc_dec(model_type):
+                if processor is not None and isinstance(processor, WhisperProcessor):
+                    return processor.tokenizer.batch_decode(
+                        generated_ids, skip_special_tokens=True
+                    )[0]
+                elif tokenizer is not None:
+                    return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
             elif processor is not None and isinstance(processor, MllamaImageProcessor):
                 return processor.tokenizer.batch_decode(generated_ids[:, input_shape:])
             elif tokenizer is not None:
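The preview path above generates from `input_features` and decodes with the processor's tokenizer, keeping only the first transcript string. A minimal end-to-end sketch with stand-in features (a real run would use processor-extracted features as shown earlier, so the generated text here is meaningless):

```python
import torch
from transformers import WhisperForConditionalGeneration, WhisperProcessor

name = "openai/whisper-tiny"
processor = WhisperProcessor.from_pretrained(name)
model = WhisperForConditionalGeneration.from_pretrained(name)

# Stand-in mel features of the shape Whisper expects.
input_features = torch.zeros(1, model.config.num_mel_bins, 3000)
generated_ids = model.generate(input_features, max_new_tokens=20)

# Whisper is encoder-decoder, so the whole output sequence is the transcript;
# [0] keeps the single-sample string, mirroring output_decode above.
text = processor.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(text)
```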

modelopt/torch/export/layer_utils.py

Lines changed: 35 additions & 14 deletions
@@ -221,6 +221,11 @@ def is_linear(module: nn.Module) -> bool:
     return any([k in type(module).__name__ for k in ["Linear", "Conv1D", "NormHead"]])
 
 
+def is_conv(module: nn.Module) -> bool:
+    """Returns whether the module is a convolutional layer."""
+    return "Conv" in type(module).__name__
+
+
 def is_embedding(module: nn.Module) -> bool:
     """Returns whether the module is an embedding layer."""
     module_type_name = type(module).__name__
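A quick check of the new helper against the layers it is meant to distinguish: the Whisper encoder front end uses two `torch.nn.Conv1d` layers (`conv1`/`conv2`), whose class name contains "Conv", while plain `Linear` projections do not. The shapes below are roughly whisper-tiny's, for illustration only:

```python
import torch.nn as nn
from modelopt.torch.export.layer_utils import is_conv

print(is_conv(nn.Conv1d(80, 384, kernel_size=3, padding=1)))  # True
print(is_conv(nn.Linear(384, 384)))  # False
```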
@@ -644,6 +649,14 @@ def build_attention_config(
     assert k
     assert v
     qkv_modules = [q, k, v]
+    for layer in qkv_modules:
+        # Add the missing zero bias for Whisper model for export purpose
+        if layer.bias is None and q.bias is not None:
+            layer.bias = torch.nn.Parameter(
+                torch.zeros(layer.weight.size(1), device=layer.weight.device),
+                requires_grad=True,
+            )
+            print("Add missing zero bias for qkv modules for export purpose")
 
     config.qkv = build_qkv(qkv_modules, model_metadata_config, ext_config, tp_size=tp_size)
 
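The loop above matters for Whisper because Hugging Face's `WhisperAttention` builds `k_proj` without a bias while `q_proj`/`v_proj` have one, and the export path expects a uniform q/k/v layout. A toy sketch of the same padding on plain `nn.Linear` modules (for these square d_model x d_model projections, `weight.size(1)` equals `weight.size(0)`, so the zero vector has the bias length):

```python
import torch
import torch.nn as nn

d_model = 384  # roughly whisper-tiny's hidden size
q = nn.Linear(d_model, d_model, bias=True)
k = nn.Linear(d_model, d_model, bias=False)  # Whisper's k_proj carries no bias
v = nn.Linear(d_model, d_model, bias=True)

for layer in (q, k, v):
    if layer.bias is None and q.bias is not None:
        # Fill the missing bias with zeros so q/k/v biases can be stacked for export.
        layer.bias = nn.Parameter(torch.zeros(layer.weight.size(1)), requires_grad=True)

print(k.bias.shape)  # torch.Size([384])
```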
@@ -723,7 +736,7 @@ def _split_gate_from_fc(decoder_type, module, fc_name, fc_layer):
             "dense_h_to_4h",  # falcon, chatglm, bloom
             "linear_fc1",
             "w2",  # qwen
-            "fc1",  # phi, gemma
+            "fc1",  # phi, gemma, whisper
             "gate_up_proj",  # phi
             "wi_0",  # t5
             "wi",  # t5
@@ -739,7 +752,7 @@ def _split_gate_from_fc(decoder_type, module, fc_name, fc_layer):
             "down_proj",  # llama, baichuan, mpt, phi, recurrentgemma, nemotron, deepseek
             "linear_fc2",
             "proj",
-            "fc2",  # phi, gemma
+            "fc2",  # phi, gemma, whisper
             "wo",  # t5
         ]
     )
@@ -1288,21 +1301,29 @@ def build_decoder_config(
         for layer in sub_module.children():
             combined_module.append(layer)
         module_layers = dict(combined_module.named_children())
-    elif decoder_type in ["bart"]:
-        # BartEncoderLayer, BartDecoderLayer have MLP component with no Module wrapper.
-        # creating a dummy module so that is_mlp may catch it.
-        bart_mlp_submodule_names = ["fc1", "fc2", "activation_fn"]
+    elif decoder_type in ["bart", "whisper"]:
+        if decoder_type == "whisper":
+            # Add max_position_embeddings for Whisper model
+            if model_metadata_config.get("enc_dec") == "enc":
+                config.max_position_embeddings = module.self_attn.config.max_source_positions
+            else:
+                config.max_position_embeddings = module.self_attn.config.max_target_positions
+        # BartEncoderLayer, BartDecoderLayer, WhisperEncoderLayer, WhisperDecoderLayer
+        # have MLP component with no Module wrapper.
+        # Create a dummy module so that is_mlp may catch it.
+        encdec_mlp_submodule_names = ["fc1", "fc2", "activation_fn", "activation_fn"]
         module_layers = dict(module.named_children())
 
-        class BartMLP(nn.Module):
+        class EncDecMLP(nn.Module):
             def __init__(self):
                 super().__init__()
 
-        bart_mlp_module = BartMLP()
-        for submodule_name in bart_mlp_submodule_names:
-            setattr(bart_mlp_module, submodule_name, getattr(module, submodule_name))
-            module_layers.pop(submodule_name)
-        module_layers.update({"MLP": bart_mlp_module})
+        encdec_mlp_module = EncDecMLP()
+        for submodule_name in encdec_mlp_submodule_names:
+            if submodule_name in module_layers:
+                setattr(encdec_mlp_module, submodule_name, getattr(module, submodule_name))
+                module_layers.pop(submodule_name)
+        module_layers.update({"MLP": encdec_mlp_module})
     else:
         module_layers = dict(module.named_children())
     if decoder_type in ["exaone"]:
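A toy illustration of the regrouping above: on Bart and Whisper layers, `fc1`, `fc2`, and `activation_fn` hang directly off the encoder/decoder layer rather than under an MLP submodule, so they are collected into a dummy wrapper that downstream MLP detection can treat as one component. The layer class below is a stand-in, not the real Hugging Face module:

```python
import torch.nn as nn


class ToyDecoderLayer(nn.Module):
    """Stand-in for a Bart/Whisper decoder layer: MLP pieces live directly on the layer."""

    def __init__(self, d_model=384, d_ff=1536):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, 6)
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.activation_fn = nn.GELU()


class EncDecMLP(nn.Module):
    """Empty wrapper; only used to group the MLP pieces under one named child."""


layer = ToyDecoderLayer()
module_layers = dict(layer.named_children())

mlp = EncDecMLP()
for name in ["fc1", "fc2", "activation_fn"]:
    if name in module_layers:
        setattr(mlp, name, module_layers.pop(name))
module_layers["MLP"] = mlp

print(list(module_layers))  # ['self_attn', 'MLP']
```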
@@ -1612,13 +1633,13 @@ def get_experts_linear_names(model: torch.nn.Module):
 
 def model_type_is_enc_dec(model_type):
     """Check if model_type is a enc-dec model."""
-    return model_type in ["t5", "bart"]
+    return model_type in ["t5", "bart", "whisper"]
 
 
 def get_enc_dec_models(hf_model, model_type):
     """Get the correct encoder, decoder from hf model."""
     assert model_type_is_enc_dec(model_type), "This encoder decoder model is not supported"
-    if model_type in "bart":
+    if model_type in ["bart", "whisper"]:
         return [("enc", hf_model.model.encoder), ("dec", hf_model.model.decoder)]
     else:
         return [("enc", hf_model.encoder), ("dec", hf_model.decoder)]
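The `["bart", "whisper"]` branch goes through `hf_model.model` because `WhisperForConditionalGeneration` (like Bart) wraps a `WhisperModel` in its `.model` attribute, whereas T5 exposes `.encoder`/`.decoder` directly. A small sketch with the public `openai/whisper-tiny` checkpoint:

```python
from transformers import WhisperForConditionalGeneration

from modelopt.torch.export.layer_utils import get_enc_dec_models

model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
# Returns the encoder/decoder pair that the exporter walks for an enc-dec model.
print([(name, type(sub).__name__) for name, sub in get_enc_dec_models(model, "whisper")])
# Expected: [('enc', 'WhisperEncoder'), ('dec', 'WhisperDecoder')]
```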

modelopt/torch/export/model_config.py

Lines changed: 4 additions & 0 deletions
@@ -568,6 +568,10 @@ class ModelConfig:
     bos_token_id: int = None
     pad_token_id: int = None
 
+    # For whisper encoder feature extractor
+    conv1: ConvConfig = None
+    conv2: ConvConfig = None
+
     @property
     def vocab_size_padded(self):
         """Returns the padded vocab_size of the model rounds to the tensor_parallel."""

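The new fields hold the Whisper encoder's front-end convolutions: two `nn.Conv1d` layers that embed and downsample the mel-spectrogram before the transformer blocks, and whose weights the exporter records alongside the rest of the model config. A short sketch showing the source modules (the `ConvConfig` field layout itself is not part of this hunk):

```python
from transformers import WhisperForConditionalGeneration

encoder = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny").model.encoder
print(encoder.conv1)  # Conv1d(80, 384, kernel_size=(3,), stride=(1,), padding=(1,))
print(encoder.conv2)  # Conv1d(384, 384, kernel_size=(3,), stride=(2,), padding=(1,))
```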
0 commit comments

Comments
 (0)