Commit 263b2b7

Moved vllm fq export code to separate files (#612)
## What does this PR do?

**Type of change:** Bug fix

**Overview:** Moved the vLLM fakequant checkpoint export code to separate files:

1. For HF export -> `modelopt.torch.export.plugins.vllm_fakequant_hf`
2. For Megatron export -> `modelopt.torch.export.plugins.vllm_fakequant_megatron`

## Usage

Refer to [README.md](https://github.com/NVIDIA/TensorRT-Model-Optimizer/tree/096ee13ea62bbb0ce0a4e4128c439651374d6235/examples/vllm_serve#load-qatptq-model-and-serve-in-vllm-wip).

## Testing

- Tested the HF path by exporting a bf16 model with the QAT script and running the vLLM server; verified that the amax values match.
- Tested the MCore path by quantizing and exporting a bf16 model with the quantize.sh and export.sh scripts and running the vLLM server; verified that the amax values match.
- Tested with the unit tests in `tests/gpu/torch/export/test_vllm_fq_hf_export.py` and `tests/gpu/torch/export/test_vllm_fq_megatron_export.py`.

## Before your PR is "*Ready for review*"

- **Make sure you read and follow the [Contributor guidelines](https://github.com/NVIDIA/TensorRT-Model-Optimizer/blob/main/CONTRIBUTING.md)** and your commits are signed.
- **Is this change backward compatible?** Yes
- **Did you write any new necessary tests?** NA
- **Did you add or update any necessary documentation?** Yes
- **Did you update the [Changelog](https://github.com/NVIDIA/TensorRT-Model-Optimizer/blob/main/CHANGELOG.rst)?** NA

## Summary by CodeRabbit

## Release Notes

* **New Features**
  * Added dedicated export functions for the vLLM fakequant checkpoint format, supporting both HuggingFace and Megatron Core models.
* **Refactor**
  * Simplified the export API by removing conditional export flags for cleaner, more predictable behavior.
  * Reorganized export functionality into focused plugin modules for improved maintainability.

---------

Signed-off-by: Kinjal Patel <[email protected]>
1 parent 0a4f0a8 commit 263b2b7

10 files changed, +355 -310 lines changed

examples/vllm_serve/README.md

Lines changed: 3 additions & 3 deletions
@@ -57,10 +57,10 @@ lm_eval --model local-completions --tasks gsm8k --model_args model=<model_name>,
 
 Overwrite the calibrated amax value with prepared values from either QAT/PTQ.
 
-Step 1: export the model with bf16 weights and amax values.
+Step 1: export the model with bf16 weights and amax values. To export the model:
 
-- For HF model set `export_bf16_weights_amax` to export the model with function `modelopt.torch.export.unified_export_hf.export_hf_checkpoint`.
-- For MCore model use `export_bf16_weights_amax` to export the model with function `modelopt.torch.export.unified_export_megatron.export_mcore_gpt_to_hf`.
+- For HF model use `modelopt.torch.export.export_hf_vllm_fq_checkpoint` function.
+- For MCore model use `modelopt.torch.export.export_mcore_gpt_to_hf_vllm_fq` function.
 
 Step 2: configure <quant_amax.pth> from exported model using AMAX_FILE_PATH environment variable in step 1. For example:
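For orientation only, here is a hedged sketch of what Step 1 could look like for an HF model, using the new function referenced above (the model id, calibration step, and export directory are placeholders, not part of this commit):

```python
# Sketch of Step 1 for an HF model; the model id and paths are hypothetical placeholders.
from transformers import AutoModelForCausalLM

from modelopt.torch.export import export_hf_vllm_fq_checkpoint

model = AutoModelForCausalLM.from_pretrained("my-org/my-qat-model")  # placeholder checkpoint
# ... the model is assumed to already carry ModelOpt fake-quantizers (from QAT or PTQ) ...

# Saves the weights in their original dtype via save_pretrained() and writes
# quant_amax.pth with the calibrated amax values.
export_hf_vllm_fq_checkpoint(model, "exported_model")  # placeholder export dir
```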

modelopt/torch/export/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -19,6 +19,7 @@
 from .model_config import *
 from .model_config_export import *
 from .model_utils import *
+from .plugins import *
 from .transformer_engine import *
 from .unified_export_hf import *
 from .unified_export_megatron import *

modelopt/torch/export/plugins/__init__.py

Lines changed: 4 additions & 0 deletions
@@ -21,3 +21,7 @@
 from .megatron_importer import *
 
 from .hf_spec_export import *
+from .vllm_fakequant_hf import *
+
+with import_plugin("vllm_fakequant_megatron"):
+    from .vllm_fakequant_megatron import *

modelopt/torch/export/plugins/vllm_fakequant.py

Lines changed: 0 additions & 125 deletions
This file was deleted.
modelopt/torch/export/plugins/vllm_fakequant_hf.py

Lines changed: 62 additions & 0 deletions
@@ -0,0 +1,62 @@
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Export HuggingFace model to vLLM fakequant checkpoint."""

from pathlib import Path

import torch
import torch.nn as nn

from modelopt.torch.export.layer_utils import is_quantlinear
from modelopt.torch.quantization.utils import get_quantizer_state_dict

__all__ = ["export_hf_vllm_fq_checkpoint"]


def export_hf_vllm_fq_checkpoint(
    model: nn.Module,
    export_dir: Path | str,
):
    """Exports the torch model weights and amax values separately.

    This function:
    1. Extracts amax values for calibration
    2. Deletes all quantizer parameters from state dict to store only weights in original dtype
    3. Saves the model weights

    Args:
        model: The quantized model to export
        export_dir: Directory to save the amax values

    """
    export_dir = Path(export_dir)
    export_dir.mkdir(parents=True, exist_ok=True)

    amax_dict = {
        name + "._amax": param["_amax"].detach().clone().cpu()
        for name, param in get_quantizer_state_dict(model).items()
        if "_amax" in param
    }

    # remove quantizer from model
    for _, module in model.named_modules():
        if is_quantlinear(module):
            for attr in ["weight_quantizer", "input_quantizer", "output_quantizer"]:
                if hasattr(module, attr):
                    delattr(module, attr)
            module.export()
    torch.save(amax_dict, f"{export_dir}/quant_amax.pth")
    # Save model
    model.save_pretrained(export_dir, state_dict=model.state_dict(), save_modelopt_state=False)
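For context, a minimal sketch of how the artifacts written by `export_hf_vllm_fq_checkpoint` could be inspected (the export directory is a placeholder; the key format follows the dict comprehension in the code above):

```python
# Inspect the amax side-channel produced by export_hf_vllm_fq_checkpoint (placeholder path).
import torch

amax_dict = torch.load("exported_model/quant_amax.pth", map_location="cpu")

# Keys follow "<quantizer name>._amax", matching the dict comprehension in the export function.
for name, amax in list(amax_dict.items())[:5]:
    print(name, tuple(amax.shape))
```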
modelopt/torch/export/plugins/vllm_fakequant_megatron.py

Lines changed: 120 additions & 0 deletions
@@ -0,0 +1,120 @@
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Export Megatron Core Model to HuggingFace vLLM fakequant checkpoint."""

import os
import tempfile
from pathlib import Path

import torch

from modelopt.torch.export.model_config import QUANTIZATION_NONE
from modelopt.torch.export.unified_export_megatron import GPTModelExporter

__all__ = ["export_mcore_gpt_to_hf_vllm_fq"]


def gather_mcore_vllm_fq_quantized_state_dict(
    model, state_dict: dict[str, torch.Tensor], save_directory: str | os.PathLike
):
    """Gather all quantized state dict from all ranks and save them to a file.

    Args:
        state_dict: The state dictionary of the module.
        save_directory: The directory to save the quantized state dict.

    Returns:
        The state dictionary of the module without quantized state.
    """
    amax_state_dict = {
        k: v.detach().clone().cpu() for k, v in state_dict.items() if k.endswith("_amax")
    }

    # Gather all amax dicts to rank 0
    world_size = torch.distributed.get_world_size()
    rank = torch.distributed.get_rank()

    if rank == 0:
        # Rank 0 will collect all amax values
        all_amax_dicts = [None] * world_size
        torch.distributed.gather_object(amax_state_dict, all_amax_dicts, dst=0)

        # Merge all amax dicts into one
        merged_amax_dict = {}
        for amax_dict in all_amax_dicts:
            if amax_dict is not None:
                merged_amax_dict.update(amax_dict)

        print(f"Total amax entries from all ranks: {len(merged_amax_dict.keys())}")
        torch.save(merged_amax_dict, save_directory + "/quant_amax.pth")
    else:
        # Other ranks just send their amax values
        torch.distributed.gather_object(amax_state_dict, None, dst=0)

    torch.distributed.barrier()


class VllmFqGPTModelExporter(GPTModelExporter):
    """VLLM fakequant GPTModel exporter."""

    def save_pretrained(
        self,
        save_directory: str | os.PathLike,
        pretrained_model_name_or_path: str | os.PathLike | None = None,
    ):
        os.makedirs(save_directory, exist_ok=True)
        gather_mcore_vllm_fq_quantized_state_dict(self.model, self.state_dict, save_directory)
        assert not (self.is_multimodal and pretrained_model_name_or_path is not None), (
            "Exporting weights in bf16 and amax values is not supported for multimodal models "
            "when pretrained_model_name_or_path is not None"
        )
        assert not self.export_extra_modules, (
            "Exporting extra modules is not supported for vLLM fakequant"
        )
        super().save_pretrained(save_directory, pretrained_model_name_or_path)

    def _get_quantization_format(self, module: torch.nn.Module):
        return QUANTIZATION_NONE


def export_mcore_gpt_to_hf_vllm_fq(
    model: torch.nn.Module,
    pretrained_model_name_or_path: str | os.PathLike | None = None,
    export_extra_modules: bool = False,
    dtype: torch.dtype = torch.bfloat16,
    export_dir: Path | str = tempfile.gettempdir(),
    moe_router_dtype: torch.dtype | None = None,
):
    """Export Megatron Core GPTModel to unified checkpoint and save to export_dir.

    Args:
        model: The Megatron Core GPTModel instance.
        pretrained_model_name_or_path: Can be either: the *model id* of a
            pretrained model hosted inside a model repo on huggingface.co; or
            a *directory* containing model weights saved using
            [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
        export_extra_modules: If True, export extra modules like medusa_heads or
            eagle_module. Otherwise, only export the base model.
        dtype: The weights data type to export the unquantized layers.
        export_dir: The target export path.
    """
    exporter = VllmFqGPTModelExporter(
        model,
        pretrained_model_name_or_path,
        export_extra_modules=export_extra_modules,
        dtype=dtype,
        moe_router_dtype=moe_router_dtype,
    )
    exporter.save_pretrained(export_dir, pretrained_model_name_or_path)
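A hedged usage sketch for the MCore path (it assumes a quantized Megatron Core GPTModel and an initialized torch.distributed process group; the HF model id, function name, and export directory below are placeholders):

```python
# Sketch: export a quantized MCore GPTModel for vLLM fakequant serving (hypothetical names).
import torch

from modelopt.torch.export import export_mcore_gpt_to_hf_vllm_fq


def export_for_vllm_fakequant(mcore_model: torch.nn.Module, export_dir: str) -> None:
    # Every rank calls this; rank 0 gathers the amax values from all ranks and writes quant_amax.pth.
    export_mcore_gpt_to_hf_vllm_fq(
        mcore_model,
        pretrained_model_name_or_path="my-org/base-hf-model",  # placeholder HF id or local dir
        dtype=torch.bfloat16,
        export_dir=export_dir,  # placeholder path
    )
```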

modelopt/torch/export/unified_export_hf.py

Lines changed: 1 addition & 9 deletions
@@ -59,7 +59,6 @@
 )
 from .model_utils import get_language_model_from_vl, is_multimodal_model
 from .plugins import export_spec_ckpt_config, export_spec_ckpt_state_dict, spec_opt_only
-from .plugins.vllm_fakequant import export_hf_vllm_fq_checkpoint
 from .quant_utils import (
     fuse_prequant_layernorm,
     fuse_prequant_to_linear,
@@ -559,7 +558,6 @@ def export_hf_checkpoint(
     dtype: torch.dtype | None = None,
     export_dir: Path | str = tempfile.gettempdir(),
     save_modelopt_state: bool = False,
-    export_vllm_fq_weights_qstate: bool = False,
 ):
     """Exports the torch model to unified checkpoint and saves to export_dir.
 
@@ -568,8 +566,6 @@ def export_hf_checkpoint(
         dtype: the weights data type to export the unquantized layers or the default model data type if None.
         export_dir: the target export path.
         save_modelopt_state: whether to save the modelopt state_dict.
-        export_vllm_fq_weights_qstate: whether to export the weights and quantization state separately for vLLM
-            fakequant serving.
     """
     export_dir = Path(export_dir)
     export_dir.mkdir(parents=True, exist_ok=True)
@@ -583,11 +579,7 @@
         return
 
     try:
-        if export_vllm_fq_weights_qstate:
-            post_state_dict = export_hf_vllm_fq_checkpoint(model, export_dir)
-            hf_quant_config = None
-        else:
-            post_state_dict, hf_quant_config = _export_hf_checkpoint(model, dtype)
+        post_state_dict, hf_quant_config = _export_hf_checkpoint(model, dtype)
 
         if hf_quant_config is not None:
             # Save hf_quant_config.json for backward compatibility