-
Notifications
You must be signed in to change notification settings - Fork 76
[convert] Support for DeepSeek-V3.2 and Dequantizing #641
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
40a3b28
e334817
dfbcf30
4c39a6f
10039f3
3075189
5a1ca69
58d0230
f5de237
5a64346
712f9ce
c65eef1
66cdd0f
0ba80e3
96125d6
cb9c123
88b2691
6a2c622
d7590a6
278f177
b852c3c
fea715e
d3ddadf
60ac422
33deb5d
934abd2
8c673b8
40e2430
4d72ba7
cbf7470
fd53387
783a281
acabb1f
65a0e10
da6d332
0b85709
a1ab6c8
2107988
cc8cf45
98681fe
035a911
d9f99ce
58ecae1
b03f20c
55d0560
98764b3
878fa9b
6961d51
27ce7aa
63ab5da
4c86202
489521e
15c2c7f
4cd6962
991e330
71d4b2b
2ad62f6
bbc4427
69ebd26
f062fde
76b2163
8283978
29c9888
87398cc
86588d3
9693b45
8069130
8a21aa3
182b8dc
0ae156b
d184f48
886d08d
4c74306
ba33789
a3247fe
320ce06
9fbce9f
b6f516f
2de9cd7
acc01d0
e57a2c1
70343c5
44e89f2
47c9e36
129a9fe
23af737
4ed0bc1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,29 @@ | ||
| # SPDX-License-Identifier: Apache-2.0 | ||
| # SPDX-FileCopyrightText: Copyright contributors to the vLLM project | ||
|
|
||
| from compressed_tensors.entrypoints.convert import ( | ||
| convert_checkpoint, | ||
| FP8BlockDequantizer, | ||
| ) | ||
|
|
||
| # deepseek-ai/DeepSeek-V3.2 checkpoint has layers that are quantized in the FP8 | ||
| # quant method's FP8_BLOCK scheme. This script will upconvert to bfloat16 so that | ||
| # the model can be compressed in another configuration. | ||
| MODEL_ID = "deepseek-ai/DeepSeek-V3.2" | ||
| SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-bf16" | ||
|
|
||
| # Convert DeepSeek-V3.2 back to dense bfloat16 format | ||
| convert_checkpoint( | ||
| model_stub=MODEL_ID, | ||
| save_directory=SAVE_DIR, | ||
| converter=FP8BlockDequantizer( | ||
| # `deepseek-ai/DeepSeek-V3.2` fp8-block-quantized layers, found by inspection | ||
| targets=[ | ||
| r"re:.*mlp.*\.(gate_up|gate|up|down)_proj$", | ||
| r"re:.*self_attn.*\.(kv_b|o|q_a|q_b)_proj$", | ||
| r"re:.*self_attn.kv_a_proj_with_mqa$", | ||
| r"re:.*self_attn.indexer.(wk|wq_b)$", | ||
| ], | ||
| ), | ||
| max_workers=4, | ||
| ) |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,25 @@ | ||
| # SPDX-License-Identifier: Apache-2.0 | ||
| # SPDX-FileCopyrightText: Copyright contributors to the vLLM project | ||
|
|
||
| from compressed_tensors.entrypoints.convert import ( | ||
| convert_checkpoint, | ||
| FP8BlockDequantizer, | ||
| ) | ||
|
|
||
| MODEL_ID = "qwen-community/Qwen3-4B-FP8" | ||
| SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1].rstrip("-FP8") | ||
|
|
||
| # Convert Qwen3-4B-FP8 back to dense bfloat16 format | ||
| convert_checkpoint( | ||
| model_stub=MODEL_ID, | ||
| save_directory=SAVE_DIR, | ||
| converter=FP8BlockDequantizer( | ||
| # qwen-community/Qwen3-4B-FP8's fp8-block-quantized layers, found by inspection | ||
| targets=[ | ||
| r"re:.*mlp.*\.(gate_up|gate|up|down)_proj$", | ||
| r"re:.*self_attn.*\.(q|k|v|o)_proj$", | ||
| ], | ||
| weight_block_size=[128, 128], | ||
| ), | ||
| max_workers=8, | ||
| ) |
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -7,9 +7,13 @@ | |||||
| from compressed_tensors import __version__ as ct_version | ||||||
| from compressed_tensors.base import COMPRESSION_VERSION_NAME, QUANTIZATION_CONFIG_NAME | ||||||
| from compressed_tensors.entrypoints.convert import Converter | ||||||
| from compressed_tensors.utils.safetensors_load import find_config_path | ||||||
| from compressed_tensors.utils.safetensors_load import ( | ||||||
| InverseWeightMap, | ||||||
| find_config_path, | ||||||
| load_tensors_from_inverse_weight_map, | ||||||
| ) | ||||||
| from loguru import logger | ||||||
| from safetensors.torch import load_file, save_file | ||||||
| from safetensors.torch import save_file | ||||||
|
|
||||||
|
|
||||||
| __all__ = [ | ||||||
|
|
@@ -34,17 +38,20 @@ def write_checkpoint_quantization_config( | |||||
| :param converter: Converter instance whose create_config() produces the | ||||||
| updated quantization config | ||||||
| """ | ||||||
| quant_config = converter.create_config() | ||||||
|
|
||||||
| quant_config_data = quant_config.model_dump() | ||||||
| quant_config_data[COMPRESSION_VERSION_NAME] = ct_version | ||||||
| quant_config_data = None | ||||||
| if (quant_config := converter.create_config()) is not None: | ||||||
| quant_config_data = quant_config.model_dump() | ||||||
| quant_config_data[COMPRESSION_VERSION_NAME] = ct_version | ||||||
|
|
||||||
| config_file_path = find_config_path(save_directory) | ||||||
| if config_file_path is not None: | ||||||
| with open(config_file_path, "r") as file: | ||||||
| config_data = json.load(file) | ||||||
|
|
||||||
| config_data[QUANTIZATION_CONFIG_NAME] = quant_config_data | ||||||
| if quant_config_data is None: | ||||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. qconfig field is not guaranteed to exist.
Suggested change
|
||||||
| del config_data[QUANTIZATION_CONFIG_NAME] | ||||||
| else: | ||||||
| config_data[QUANTIZATION_CONFIG_NAME] = quant_config_data | ||||||
|
|
||||||
| with open(config_file_path, "w") as file: | ||||||
| json.dump(config_data, file, indent=2, sort_keys=True) | ||||||
|
|
@@ -57,35 +64,45 @@ def write_checkpoint_quantization_config( | |||||
|
|
||||||
|
|
||||||
| def validate_file( | ||||||
| file_path: str | os.PathLike, | ||||||
| inverse_weight_map: InverseWeightMap, | ||||||
| converter: Converter, | ||||||
| ): | ||||||
| """ | ||||||
| Validate that each quantizable tensor in a safetensors file can be quantized. | ||||||
|
|
||||||
| :param file_path: safetensors file to validate | ||||||
| :param inverse_weight_map: mapping of resolved source file path -> | ||||||
| list of tensor names to load from that file. Precomputed by | ||||||
| build_inverse_weight_map() in the job-building phase. | ||||||
| Example: {"/path/shard0.safetensors": ["q_proj.weight"], | ||||||
| "/path/shard1.safetensors": ["k_proj.weight", "v_proj.weight"]} | ||||||
| :param converter: converter we wish to apply to the checkpoint, | ||||||
| e.g. conversion of some layers from some format to compressed-tensors | ||||||
| """ | ||||||
| tensors = load_file(file_path) | ||||||
| tensors = load_tensors_from_inverse_weight_map(inverse_weight_map) | ||||||
|
|
||||||
| converter.validate(tensors) | ||||||
|
|
||||||
|
|
||||||
| def convert_file( | ||||||
| file_path: str | os.PathLike, | ||||||
| inverse_weight_map: InverseWeightMap, | ||||||
| save_path: str | os.PathLike, | ||||||
| converter: Converter, | ||||||
| ) -> tuple[int, dict[str, str]]: | ||||||
| """ | ||||||
| Convert tensors in a given safetensors file | ||||||
|
|
||||||
| :param file_path: safetensors file to process | ||||||
| :param inverse_weight_map: mapping of resolved source file path -> | ||||||
| list of tensor names to load from that file. Precomputed by | ||||||
| build_inverse_weight_map() in the job-building phase. | ||||||
| Example: {"/path/shard0.safetensors": ["q_proj.weight"], | ||||||
| "/path/shard1.safetensors": ["k_proj.weight", "v_proj.weight"]} | ||||||
| :param save_path: save path of file with quantized weights | ||||||
| :param converter: converter we wish to apply to the checkpoint, | ||||||
| e.g. conversion of some layers from some format to compressed-tensors | ||||||
| :returns: tuple of (total_size, weight_map), respectively the total size in bytes | ||||||
| of the saved file and dictionary of weight name -> save path | ||||||
| """ | ||||||
| tensors = load_file(file_path) | ||||||
| tensors = load_tensors_from_inverse_weight_map(inverse_weight_map) | ||||||
|
|
||||||
| converter.process(tensors) | ||||||
|
|
||||||
|
|
||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -6,3 +6,4 @@ | |
|
|
||
| from .base import * | ||
| from .modelopt_nvfp4 import * | ||
| from .fp8block_dequantizer import * | ||
| Original file line number | Diff line number | Diff line change | ||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -3,11 +3,15 @@ | |||||||||||
|
|
||||||||||||
| from __future__ import annotations | ||||||||||||
|
|
||||||||||||
| from collections import defaultdict | ||||||||||||
| from typing import TYPE_CHECKING, Protocol | ||||||||||||
|
|
||||||||||||
| import torch | ||||||||||||
| from compressed_tensors.utils.safetensors_load import InverseWeightMap | ||||||||||||
|
|
||||||||||||
|
|
||||||||||||
| __all__ = ["Converter", "build_inverse_weight_maps"] | ||||||||||||
|
|
||||||||||||
| if TYPE_CHECKING: | ||||||||||||
| from compressed_tensors.quantization import QuantizationConfig | ||||||||||||
|
|
||||||||||||
|
|
@@ -42,9 +46,100 @@ def validate(self, tensors: dict[str, torch.Tensor]): | |||||||||||
| """ | ||||||||||||
| pass | ||||||||||||
|
|
||||||||||||
| def create_config(self) -> QuantizationConfig: | ||||||||||||
| def create_config(self) -> QuantizationConfig | None: | ||||||||||||
| """ | ||||||||||||
| Create compressed-tensors QuantizationConfig so that it can be set in the | ||||||||||||
| new model checkpoint's config.json. | ||||||||||||
| If the converter is moving checkpoint to full-precision, have this function | ||||||||||||
| return None, and quantization_config will be removed from config.json | ||||||||||||
|
Comment on lines
+53
to
+54
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||||||
| """ | ||||||||||||
| pass | ||||||||||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Make this an abstract method to force implementers to make a decision here.
Suggested change
|
||||||||||||
|
|
||||||||||||
| def get_dependencies(self, weight_name: str) -> dict[str, bool]: | ||||||||||||
| """ | ||||||||||||
| Given a weight name, return a dictionary of all dependency weight names, so that | ||||||||||||
| weights can be processed correctly and in a parallelized fashion. | ||||||||||||
| If a dependency is optional, the value associated with the key should be False. | ||||||||||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Please give an example of an "optional dependency" to make the concept clear. |
||||||||||||
| If the value is True, it is assumed the weight is required and will error out | ||||||||||||
| during the job build phase if not found. | ||||||||||||
| If there are no dependencies, an empty dict should be returned. | ||||||||||||
|
|
||||||||||||
| :returns: dict[str, bool] {dependency weight name -> whether it is required} | ||||||||||||
| """ | ||||||||||||
| pass | ||||||||||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Either make this an abstract method or return an empty dict. |
||||||||||||
|
|
||||||||||||
|
|
||||||||||||
| def build_inverse_weight_maps( | ||||||||||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This probably goes in a utils file, not the base class file. |
||||||||||||
| weight_map: dict[str, str], | ||||||||||||
| model_files: dict[str, str], | ||||||||||||
| converters: list[Converter], | ||||||||||||
| ) -> dict[str, InverseWeightMap]: | ||||||||||||
| """ | ||||||||||||
| For a given output shard, precompute exactly which tensors to load from | ||||||||||||
| which source files — including required partner tensors from other shards. | ||||||||||||
|
|
||||||||||||
| This is necessary because some converters require that a set of tensors are | ||||||||||||
| accessible in order for them to be processed correctly. | ||||||||||||
|
|
||||||||||||
| :param shard_name: the shard filename this job will process and save | ||||||||||||
| :param weight_map: tensor name -> shard filename (from safetensors.index.json) | ||||||||||||
| :param model_files: shard filename -> resolved absolute path | ||||||||||||
| :return: {resolved_file_path: [tensor_names_to_load]} | ||||||||||||
| """ | ||||||||||||
|
|
||||||||||||
| def get_dependencies_recursive( | ||||||||||||
| weight_name: str, converters: list[Converter], current_deps: dict[str, bool] | ||||||||||||
| ) -> dict[str, bool]: | ||||||||||||
| for converter in converters: | ||||||||||||
| for dep, is_required in converter.get_dependencies(weight_name).items(): | ||||||||||||
| if dep not in current_deps: | ||||||||||||
| current_deps[dep] = is_required | ||||||||||||
| get_dependencies_recursive(dep, converters, current_deps) | ||||||||||||
|
|
||||||||||||
| return current_deps | ||||||||||||
|
|
||||||||||||
| # map of weight name -> ( map of dependency name -> is_required ) | ||||||||||||
| weight_deps_dict: dict[str, set[str]] = defaultdict(set) | ||||||||||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||||||
| for weight_name, weight_shard_name in weight_map.items(): | ||||||||||||
| weight_deps_dict[weight_name] = get_dependencies_recursive( | ||||||||||||
| weight_name, converters, {} | ||||||||||||
| ) | ||||||||||||
| assert ( | ||||||||||||
| weight_name not in weight_deps_dict[weight_name] | ||||||||||||
| ), f"{weight_name} found in dependencies {weight_deps_dict[weight_name]}" | ||||||||||||
|
|
||||||||||||
| # set of all dependencies (i.e. all weight names required by another) | ||||||||||||
| all_dependencies: set[str] = set() | ||||||||||||
| for values in weight_deps_dict.values(): | ||||||||||||
| for value in values: | ||||||||||||
| all_dependencies.add(value) | ||||||||||||
|
Comment on lines
+112
to
+115
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||||||
|
|
||||||||||||
| inverse_weight_maps: dict[str, InverseWeightMap] = defaultdict( | ||||||||||||
| lambda: defaultdict(list) | ||||||||||||
| ) | ||||||||||||
| for weight_name, weight_shard_name in weight_map.items(): | ||||||||||||
| if weight_name in all_dependencies: | ||||||||||||
| # weight is a partner to some other primary tensor, skip it | ||||||||||||
| continue | ||||||||||||
|
|
||||||||||||
| # weight is purely a primary weight, is not a dependency of anything | ||||||||||||
| # add it and all its dependencies | ||||||||||||
| inverse_weight_map: InverseWeightMap = inverse_weight_maps[weight_shard_name] | ||||||||||||
| dependency_weights = weight_deps_dict[weight_name] | ||||||||||||
| for weight_to_add_name, is_required in [ | ||||||||||||
| (weight_name, True), | ||||||||||||
| *dependency_weights.items(), | ||||||||||||
| ]: | ||||||||||||
| if weight_to_add_name not in weight_map: | ||||||||||||
| if is_required: | ||||||||||||
| raise ValueError( | ||||||||||||
| f"Required weight {weight_to_add_name} not found in weight map" | ||||||||||||
| ) | ||||||||||||
| else: | ||||||||||||
| continue | ||||||||||||
| weight_to_add_shard_name = weight_map[weight_to_add_name] | ||||||||||||
| resolved_path = model_files.get(weight_to_add_shard_name) | ||||||||||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||||||
| inverse_weight_map[resolved_path].append(weight_to_add_name) | ||||||||||||
|
|
||||||||||||
| # return dicts, not defaultdicts, to avoid silent errors | ||||||||||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. ty |
||||||||||||
| return {k: dict(v) for k, v in inverse_weight_maps.items()} | ||||||||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment.
The reason will be displayed to describe this comment to others. Learn more.
This will error if there is no index file. Use something like this instead