Skip to content
Merged
Show file tree
Hide file tree
Changes from 56 commits
Commits
Show all changes
63 commits
Select commit Hold shift + click to select a range
bdc5e5d
example p1
brian-dellabetta Jan 13, 2026
cde1c3a
p2
brian-dellabetta Jan 13, 2026
06695a5
p2
brian-dellabetta Jan 14, 2026
a9a567f
use targets
brian-dellabetta Jan 15, 2026
264636a
update quant config
brian-dellabetta Jan 15, 2026
255f803
comments
brian-dellabetta Jan 15, 2026
02bf5ee
script cleanup
brian-dellabetta Jan 15, 2026
22a4758
minor cleanup
brian-dellabetta Jan 16, 2026
bfe4e5c
ignore default values
brian-dellabetta Jan 16, 2026
e713d4b
Merge branch 'main' into bdellabe/example-dsr1-nvfp4-fp8block
brian-dellabetta Jan 16, 2026
b6c9807
stylefixes
brian-dellabetta Jan 16, 2026
a4d4ad9
invert global input/weight scales
brian-dellabetta Jan 16, 2026
5ee4758
fix
brian-dellabetta Jan 18, 2026
64944e0
updates
brian-dellabetta Jan 21, 2026
9f89d29
missing format
brian-dellabetta Jan 22, 2026
d79e0b9
minor touchups
brian-dellabetta Jan 22, 2026
e0e8ccb
comment typo
brian-dellabetta Jan 23, 2026
302330e
merge main
brian-dellabetta Feb 23, 2026
8339433
Processor protocol
brian-dellabetta Feb 23, 2026
2c1f5d2
cleanup
brian-dellabetta Feb 23, 2026
f3e33a5
cleanup
brian-dellabetta Feb 23, 2026
c9c023a
cleanup
brian-dellabetta Feb 24, 2026
0adf115
helper cleanup
brian-dellabetta Feb 24, 2026
7f7663c
bugfix
brian-dellabetta Feb 25, 2026
a54d4cb
Merge branch 'main' into bdellabe/example-dsr1-nvfp4-fp8block
brian-dellabetta Feb 25, 2026
c49f401
fix logic, match_quantizable_tensors
brian-dellabetta Feb 25, 2026
49683b6
Merge branch 'main' into bdellabe/example-dsr1-nvfp4-fp8block
brian-dellabetta Feb 25, 2026
3b667fc
target regex update
brian-dellabetta Feb 27, 2026
5fc016f
refactor to CT entrypoint
brian-dellabetta Mar 2, 2026
179b70a
update create config
brian-dellabetta Mar 2, 2026
69e9a4a
minor cleanup
brian-dellabetta Mar 2, 2026
692bd13
fix overwrite qconfig
brian-dellabetta Mar 2, 2026
2f882ef
revert example
brian-dellabetta Mar 2, 2026
869d85d
Merge branch 'main' into bdellabe/example-dsr1-nvfp4-fp8block
brian-dellabetta Mar 2, 2026
4b47725
refactor from CT changes
brian-dellabetta Mar 3, 2026
3cb89dd
cleanup
brian-dellabetta Mar 3, 2026
0ee7d9b
cleanup
brian-dellabetta Mar 3, 2026
6120b26
post-refactor cleanup
brian-dellabetta Mar 3, 2026
0663bd0
test cosmetics
brian-dellabetta Mar 3, 2026
39f9442
docstrings
brian-dellabetta Mar 3, 2026
be73088
docstring
brian-dellabetta Mar 3, 2026
7e241d0
minor refactor, exec_jobs
brian-dellabetta Mar 4, 2026
a5a1b43
prune find_safetensors_index_file
brian-dellabetta Mar 5, 2026
f4bb2d9
bugfix
brian-dellabetta Mar 5, 2026
6fb2fb6
typo
brian-dellabetta Mar 5, 2026
43b2c36
move similarly named helper to private
brian-dellabetta Mar 5, 2026
75e6478
prune helper
brian-dellabetta Mar 5, 2026
7f4cd5e
Merge branch 'main' into bdellabe/example-dsr1-nvfp4-fp8block
brian-dellabetta Mar 5, 2026
c59baa3
move entrypoints tests to dedicated folder
brian-dellabetta Mar 5, 2026
2041946
move model free validate
brian-dellabetta Mar 5, 2026
a556514
entrypoints tests
brian-dellabetta Mar 5, 2026
b9ce613
cleanup
brian-dellabetta Mar 5, 2026
e025a5f
cleanup
brian-dellabetta Mar 5, 2026
5ae4c63
rename example
brian-dellabetta Mar 5, 2026
96daf7c
Merge branch 'main' into bdellabe/example-dsr1-nvfp4-fp8block
brian-dellabetta Mar 5, 2026
2b1c26d
Merge branch 'main' into bdellabe/example-dsr1-nvfp4-fp8block
brian-dellabetta Mar 6, 2026
d7cba48
reindex_fused_weights
brian-dellabetta Mar 6, 2026
07ccbbe
Merge branch 'main' into bdellabe/example-dsr1-nvfp4-fp8block
brian-dellabetta Mar 9, 2026
dd0be8e
test_calib_deepseekv3_module consistency fix
brian-dellabetta Mar 10, 2026
f520382
Merge branch 'main' into bdellabe/example-dsr1-nvfp4-fp8block
dsikka Mar 10, 2026
1c6874f
failing test fix
brian-dellabetta Mar 10, 2026
1ed8a9d
add not isnan assertion
brian-dellabetta Mar 10, 2026
9c0a8dc
cicd test fix
brian-dellabetta Mar 10, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 54 additions & 0 deletions examples/model_free_ptq/deepseek_r1_nvfp4_fp8_block.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
from compressed_tensors.entrypoints.convert import ModelOptNvfp4Converter
from compressed_tensors.quantization import QuantizationScheme
from compressed_tensors.quantization.quant_scheme import FP8_BLOCK

from llmcompressor import model_free_ptq

MODEL_ID = "nvidia/DeepSeek-R1-NVFP4"
SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8-BLOCK"


# FP8-Block scheme for the model's compatible self_attn Linear layers.
fp8_block_scheme = QuantizationScheme(
    **FP8_BLOCK,
    targets=[
        # Fused layers must share the same quant config.
        # Shape 576x7168 is compatible with block size 128x128:
        #   - self_attn.kv_a_proj_with_mqa
        #   - self_attn.q_a_proj
        "re:.*self_attn.(kv_a_proj_with_mqa|q_a_proj)$",
        # self_attn.kv_b_proj is skipped (already dequantized by MLA).
        # The remaining self_attn layers are targeted:
        #   - self_attn.o_proj
        #   - self_attn.q_b_proj
        "re:.*self_attn.(o_proj|q_b_proj).*",
    ],
)

# Converter that rewrites the checkpoint's modelopt-NVFP4 layers into
# compressed-tensors format before model-free PTQ runs.
nvfp4_converter = ModelOptNvfp4Converter(
    targets=[
        # nvidia/DeepSeek-R1-NVFP4's nvfp4-quantized layers, found by inspection
        # - model.layers.0.mlp.down_proj.weight
        # - model.layers.0.mlp.gate_proj.weight
        # - model.layers.0.mlp.up_proj.weight
        # - model.layers.3.mlp.shared_experts.down_proj.weight
        # - model.layers.3.mlp.shared_experts.gate_proj.weight
        # - model.layers.3.mlp.shared_experts.up_proj.weight
        # - model.layers.3.mlp.experts.0.down_proj.weight
        # - model.layers.3.mlp.experts.0.gate_proj.weight
        # - model.layers.3.mlp.experts.0.up_proj.weight
        # NOTE: gate_up_proj also needs to be targeted, gate/up are fused
        "re:.*mlp.*(gate_up|gate|up|down)_proj$"
    ]
)

# Convert modelopt NVFP4 format to compressed-tensors format, apply FP8-Block
# to the compatible self_attn Linear layers, and save the result to SAVE_DIR.
model_free_ptq(
    model_stub=MODEL_ID,
    save_directory=SAVE_DIR,
    scheme=fp8_block_scheme,
    max_workers=8,
    device="cuda:0",
    converter=nvfp4_converter,
)
53 changes: 27 additions & 26 deletions src/llmcompressor/entrypoints/model_free/__init__.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,30 @@
import os
import shutil
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from typing import Iterable, Optional

import torch
import tqdm
from compressed_tensors.entrypoints.convert import (
Converter,
exec_jobs,
get_checkpoint_files,
is_weights_file,
update_safetensors_index,
)
from compressed_tensors.quantization import QuantizationScheme
from loguru import logger

from llmcompressor.entrypoints.model_free.helpers import gpu_if_available
from llmcompressor.entrypoints.model_free.microscale import (
is_microscale_scheme,
)
from llmcompressor.entrypoints.model_free.model_utils import (
get_checkpoint_files,
is_weights_file,
)
from llmcompressor.entrypoints.model_free.process import (
process_file,
process_file_microscale_scheme,
validate_file,
)
from llmcompressor.entrypoints.model_free.save_utils import (
update_config,
update_safetensors_index,
)
from llmcompressor.entrypoints.model_free.validate import (
validate_safetensors_index,
Expand All @@ -41,6 +41,7 @@ def model_free_ptq(
ignore: Iterable[str] = tuple(),
max_workers: int = 1,
device: Optional[torch.device | str] = None,
converter: Converter | None = None,
):
"""
Quantize a model without the need for a model definition. This function operates on
Expand All @@ -52,6 +53,10 @@ def model_free_ptq(
ignored
:param max_workers: number of worker threads to process files with
:param device: gpu device to accelerate quantization with
:param converter: optional converter to apply to the checkpoint to convert it to
compressed-tensors format before running model-free PTQ
e.g. conversion of some layers from modelopt format to compressed-tensors
See compressed-tensors convert_checkpoint entrypoint for more information
"""
# validate arguments
model_files = get_checkpoint_files(model_stub)
Expand All @@ -70,7 +75,9 @@ def model_free_ptq(
save_path = Path(save_directory) / file_path

if file_path.endswith("safetensors"):
jobs.append((job_fn, resolved_path, save_path, scheme, ignore, device))
jobs.append(
(job_fn, resolved_path, save_path, scheme, ignore, device, converter)
)

else:
if is_weights_file(file_path):
Expand All @@ -79,25 +86,19 @@ def model_free_ptq(
logger.info(f"Copying {file_path} {save_path}")
shutil.copyfile(resolved_path, save_path)

with ThreadPoolExecutor(max_workers) as executor:
# 1. validate quantizable tensors fail fast before long-running quantization
futures = [executor.submit(validate_file, *job[1:]) for job in jobs]
for future in tqdm.tqdm(
as_completed(futures), total=len(futures), desc="Validating"
):
future.result()
# 1. validate quantizable tensors fail fast before long-running quantization
exec_jobs(
[(validate_file, *job[1:]) for job in jobs], max_workers, desc="Validating"
)

# 2-5. quantize and compress weights
total_size = 0
weight_map = dict()
futures = [executor.submit(*job) for job in jobs]
for future in tqdm.tqdm(
as_completed(futures), total=len(futures), desc="Quantizing"
):
_total_size, _weight_map = future.result()
total_size += _total_size
weight_map.update(_weight_map)
# 2-5. quantize and compress weights
total_size = 0
weight_map = dict()
quantize_results = exec_jobs(jobs, max_workers, desc="Quantizing")
for _total_size, _weight_map in quantize_results:
total_size += _total_size
weight_map.update(_weight_map)

# 5. update config and safetensors index
update_config(save_directory, scheme_name, scheme, ignore)
update_config(save_directory, scheme_name, scheme, ignore, converter)
update_safetensors_index(save_directory, total_size, weight_map)
20 changes: 0 additions & 20 deletions src/llmcompressor/entrypoints/model_free/helpers.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,13 @@
import os
import re
from collections import defaultdict
from typing import Mapping, TypeVar

import torch
from compressed_tensors.utils.match import match_name
from loguru import logger
from transformers.file_utils import CONFIG_NAME

__all__ = [
"gpu_if_available",
"find_safetensors_index_path",
"find_config_path",
"find_safetensors_index_file",
"match_names_set_eager",
"MatchedNamesSet",
Expand Down Expand Up @@ -43,22 +39,6 @@ def gpu_if_available(device: torch.device | str | None) -> torch.device:
return torch.device("cpu")


def find_safetensors_index_path(save_directory: str | os.PathLike) -> str | None:
for file_name in os.listdir(save_directory):
if file_name.endswith("safetensors.index.json"):
return os.path.join(save_directory, file_name)

return None


def find_config_path(save_directory: str | os.PathLike) -> str | None:
for file_name in os.listdir(save_directory):
if file_name in (CONFIG_NAME, "params.json"):
return os.path.join(save_directory, file_name)

return None


def find_safetensors_index_file(model_files: dict[str, str]) -> str | None:
for file_path, resolved_path in model_files.items():
if file_path.endswith("safetensors.index.json"):
Expand Down
48 changes: 0 additions & 48 deletions src/llmcompressor/entrypoints/model_free/model_utils.py

This file was deleted.

49 changes: 29 additions & 20 deletions src/llmcompressor/entrypoints/model_free/process.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
import os
from collections import defaultdict
from collections.abc import Iterator, Mapping
from typing import Iterable

import torch
from compressed_tensors.entrypoints.convert import Converter
from compressed_tensors.quantization import QuantizationScheme
from compressed_tensors.utils.match import match_name
from compressed_tensors.utils import match_quantizable_tensors
from safetensors.torch import load_file, save_file
from torch.nn import Module

Expand All @@ -21,21 +21,11 @@
is_microscale_scheme,
)

__all__ = ["validate_file", "process_file", "process_file_microscale_scheme"]


def iter_quantizable_tensors(
tensors: Mapping[str, torch.Tensor],
ignore: Iterable[str],
) -> Iterator[tuple[str, str]]:
for name in list(tensors.keys()):
module_name, param_name = name.rsplit(".", 1)
is_linear_weight = param_name == "weight" and not module_name.endswith("norm")
is_ignored = any(match_name(module_name, ign) for ign in ignore)
if not is_linear_weight or is_ignored:
continue

yield module_name, name
__all__ = [
"validate_file",
"process_file",
"process_file_microscale_scheme",
]


def validate_file(
Expand All @@ -44,6 +34,7 @@ def validate_file(
scheme: QuantizationScheme,
ignore: Iterable[str],
device: str | torch.device,
converter: Converter | None = None,
):
"""
Validate that each quantizable tensor in a safetensors file can be quantized.
Expand All @@ -52,10 +43,15 @@ def validate_file(
:param scheme: quantization scheme to apply to tensors
:param ignore: modules to ignore. Modules ending with "norm" are automatically
ignored
:param converter: optional converter to apply to the checkpoint,
e.g. conversion of some layers from some format to compressed-tensors
"""
tensors = load_file(file_path)

for _, name in iter_quantizable_tensors(tensors, ignore):
if converter is not None:
converter.validate(tensors)

for _, name in match_quantizable_tensors(tensors, ignore, scheme.targets):
validate_weight_for_quantization(tensors[name], scheme, name)


Expand All @@ -65,6 +61,7 @@ def process_file(
scheme: QuantizationScheme,
ignore: Iterable[str],
device: str | torch.device,
converter: Converter | None = None,
) -> tuple[int, dict[str, str]]:
"""
Quantize and compress tensors in a given safetensors file
Expand All @@ -75,11 +72,16 @@ def process_file(
:param ignore: modules to ignore. Modules ending with "norm" are automatically
ignored
:param device: device used to quantize and compress weights
:param converter: optional converter to apply to the checkpoint,
e.g. conversion of some layers from some format to compressed-tensors
"""
assert not is_microscale_scheme(scheme), "Use `_process_file_microscale_scheme`"
tensors = load_file(file_path)

for module_name, name in iter_quantizable_tensors(tensors, ignore):
if converter is not None:
converter.process(tensors)

for module_name, name in match_quantizable_tensors(tensors, ignore, scheme.targets):
validate_weight_for_quantization(tensors[name], scheme, name)

# 1. initialize module with qparams (on device)
Expand Down Expand Up @@ -109,6 +111,7 @@ def process_file_microscale_scheme(
scheme: QuantizationScheme,
ignore: Iterable[str],
device: str | torch.device,
converter: Converter | None = None,
) -> tuple[int, dict[str, str]]:
"""
Quantize and compress tensors in a given safetensors file
Expand All @@ -119,9 +122,15 @@ def process_file_microscale_scheme(
:param ignore: modules to ignore. Modules ending with "norm" are automatically
ignored
:param device: device used to quantize and compress weights
:param converter: optional converter to apply to the checkpoint,
e.g. conversion of some layers from some format to compressed-tensors
"""
assert is_microscale_scheme(scheme), "Use `_process_file` for non-microscale scheme"
tensors = load_file(file_path)

if converter is not None:
converter.process(tensors)

fused_sets, unmatched_sets = get_fused_names(tensors)
assert len(unmatched_sets) <= 0 # should be caught by `validate_safetensors_index`

Expand All @@ -135,7 +144,7 @@ def process_file_microscale_scheme(
}
fused_modules = defaultdict(dict)

for module_name, name in iter_quantizable_tensors(tensors, ignore):
for module_name, name in match_quantizable_tensors(tensors, ignore, scheme.targets):
validate_weight_for_quantization(tensors[name], scheme, name)

# 1. initialize module with qparams (on device)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@

import torch
import tqdm
from compressed_tensors.entrypoints.convert import (
get_checkpoint_files,
is_weights_file,
)
from loguru import logger
from safetensors.torch import load_file, save_file

Expand All @@ -15,10 +19,6 @@
invert_mapping,
)
from llmcompressor.entrypoints.model_free.microscale import get_fused_names
from llmcompressor.entrypoints.model_free.model_utils import (
get_checkpoint_files,
is_weights_file,
)
from llmcompressor.entrypoints.model_free.save_utils import update_safetensors_index


Expand Down
Loading
Loading