Skip to content
Merged
Show file tree
Hide file tree
Changes from 56 commits
Commits
Show all changes
63 commits
Select commit Hold shift + click to select a range
bdc5e5d
example p1
brian-dellabetta Jan 13, 2026
cde1c3a
p2
brian-dellabetta Jan 13, 2026
06695a5
p2
brian-dellabetta Jan 14, 2026
a9a567f
use targets
brian-dellabetta Jan 15, 2026
264636a
update quant config
brian-dellabetta Jan 15, 2026
255f803
comments
brian-dellabetta Jan 15, 2026
02bf5ee
script cleanup
brian-dellabetta Jan 15, 2026
22a4758
minor cleanup
brian-dellabetta Jan 16, 2026
bfe4e5c
ignore default values
brian-dellabetta Jan 16, 2026
e713d4b
Merge branch 'main' into bdellabe/example-dsr1-nvfp4-fp8block
brian-dellabetta Jan 16, 2026
b6c9807
stylefixes
brian-dellabetta Jan 16, 2026
a4d4ad9
invert global input/weight scales
brian-dellabetta Jan 16, 2026
5ee4758
fix
brian-dellabetta Jan 18, 2026
64944e0
updates
brian-dellabetta Jan 21, 2026
9f89d29
missing format
brian-dellabetta Jan 22, 2026
d79e0b9
minor touchups
brian-dellabetta Jan 22, 2026
e0e8ccb
comment typo
brian-dellabetta Jan 23, 2026
302330e
merge main
brian-dellabetta Feb 23, 2026
8339433
Processor protocol
brian-dellabetta Feb 23, 2026
2c1f5d2
cleanup
brian-dellabetta Feb 23, 2026
f3e33a5
cleanup
brian-dellabetta Feb 23, 2026
c9c023a
cleanup
brian-dellabetta Feb 24, 2026
0adf115
helper cleanup
brian-dellabetta Feb 24, 2026
7f7663c
bugfix
brian-dellabetta Feb 25, 2026
a54d4cb
Merge branch 'main' into bdellabe/example-dsr1-nvfp4-fp8block
brian-dellabetta Feb 25, 2026
c49f401
fix logic, match_quantizable_tensors
brian-dellabetta Feb 25, 2026
49683b6
Merge branch 'main' into bdellabe/example-dsr1-nvfp4-fp8block
brian-dellabetta Feb 25, 2026
3b667fc
target regex update
brian-dellabetta Feb 27, 2026
5fc016f
refactor to CT entrypoint
brian-dellabetta Mar 2, 2026
179b70a
update create config
brian-dellabetta Mar 2, 2026
69e9a4a
minor cleanup
brian-dellabetta Mar 2, 2026
692bd13
fix overwrite qconfig
brian-dellabetta Mar 2, 2026
2f882ef
revert example
brian-dellabetta Mar 2, 2026
869d85d
Merge branch 'main' into bdellabe/example-dsr1-nvfp4-fp8block
brian-dellabetta Mar 2, 2026
4b47725
refactor from CT changes
brian-dellabetta Mar 3, 2026
3cb89dd
cleanup
brian-dellabetta Mar 3, 2026
0ee7d9b
cleanup
brian-dellabetta Mar 3, 2026
6120b26
post-refactor cleanup
brian-dellabetta Mar 3, 2026
0663bd0
test cosmetics
brian-dellabetta Mar 3, 2026
39f9442
docstrings
brian-dellabetta Mar 3, 2026
be73088
docstring
brian-dellabetta Mar 3, 2026
7e241d0
minor refactor, exec_jobs
brian-dellabetta Mar 4, 2026
a5a1b43
prune find_safetensors_index_file
brian-dellabetta Mar 5, 2026
f4bb2d9
bugfix
brian-dellabetta Mar 5, 2026
6fb2fb6
typo
brian-dellabetta Mar 5, 2026
43b2c36
move similarly named helper to private
brian-dellabetta Mar 5, 2026
75e6478
prune helper
brian-dellabetta Mar 5, 2026
7f4cd5e
Merge branch 'main' into bdellabe/example-dsr1-nvfp4-fp8block
brian-dellabetta Mar 5, 2026
c59baa3
move entrypoints tests to dedicated folder
brian-dellabetta Mar 5, 2026
2041946
move model free validate
brian-dellabetta Mar 5, 2026
a556514
entrypoints tests
brian-dellabetta Mar 5, 2026
b9ce613
cleanup
brian-dellabetta Mar 5, 2026
e025a5f
cleanup
brian-dellabetta Mar 5, 2026
5ae4c63
rename example
brian-dellabetta Mar 5, 2026
96daf7c
Merge branch 'main' into bdellabe/example-dsr1-nvfp4-fp8block
brian-dellabetta Mar 5, 2026
2b1c26d
Merge branch 'main' into bdellabe/example-dsr1-nvfp4-fp8block
brian-dellabetta Mar 6, 2026
d7cba48
reindex_fused_weights
brian-dellabetta Mar 6, 2026
07ccbbe
Merge branch 'main' into bdellabe/example-dsr1-nvfp4-fp8block
brian-dellabetta Mar 9, 2026
dd0be8e
test_calib_deepseekv3_module consistency fix
brian-dellabetta Mar 10, 2026
f520382
Merge branch 'main' into bdellabe/example-dsr1-nvfp4-fp8block
dsikka Mar 10, 2026
1c6874f
failing test fix
brian-dellabetta Mar 10, 2026
1ed8a9d
add not isnan assertion
brian-dellabetta Mar 10, 2026
9c0a8dc
cicd test fix
brian-dellabetta Mar 10, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 54 additions & 0 deletions examples/model_free_ptq/deepseek_r1_nvfp4_fp8_block.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
from compressed_tensors.entrypoints.convert import ModelOptNvfp4Converter
from compressed_tensors.quantization import QuantizationScheme
from compressed_tensors.quantization.quant_scheme import FP8_BLOCK

from llmcompressor import model_free_ptq

MODEL_ID = "nvidia/DeepSeek-R1-NVFP4"
SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8-BLOCK"


# FP8-Block scheme for the model's compatible self_attn Linear layers.
fp8_block_scheme = QuantizationScheme(
    **FP8_BLOCK,
    targets=[
        # Fused layers must share the same quant config.
        # Shape 576x7168 is compatible with block size 128x128:
        #   - self_attn.kv_a_proj_with_mqa
        #   - self_attn.q_a_proj
        "re:.*self_attn.(kv_a_proj_with_mqa|q_a_proj)$",
        # self_attn.kv_b_proj is skipped (already dequantized by MLA).
        # The remaining self_attn layers are targeted:
        #   - self_attn.o_proj
        #   - self_attn.q_b_proj
        "re:.*self_attn.(o_proj|q_b_proj).*",
    ],
)

# Converter that rewrites the checkpoint's modelopt-NVFP4 layers into
# compressed-tensors format before model-free PTQ runs.
nvfp4_converter = ModelOptNvfp4Converter(
    targets=[
        # nvidia/DeepSeek-R1-NVFP4's nvfp4-quantized layers, found by inspection
        # - model.layers.0.mlp.down_proj.weight
        # - model.layers.0.mlp.gate_proj.weight
        # - model.layers.0.mlp.up_proj.weight
        # - model.layers.3.mlp.shared_experts.down_proj.weight
        # - model.layers.3.mlp.shared_experts.gate_proj.weight
        # - model.layers.3.mlp.shared_experts.up_proj.weight
        # - model.layers.3.mlp.experts.0.down_proj.weight
        # - model.layers.3.mlp.experts.0.gate_proj.weight
        # - model.layers.3.mlp.experts.0.up_proj.weight
        # NOTE: gate_up_proj also needs to be targeted, gate/up are fused
        "re:.*mlp.*(gate_up|gate|up|down)_proj$"
    ]
)

# Convert modelopt NVFP4 format to compressed-tensors format, apply FP8-Block
# to the compatible self_attn Linear layers, and save the result to SAVE_DIR.
model_free_ptq(
    model_stub=MODEL_ID,
    save_directory=SAVE_DIR,
    scheme=fp8_block_scheme,
    max_workers=8,
    device="cuda:0",
    converter=nvfp4_converter,
)
53 changes: 27 additions & 26 deletions src/llmcompressor/entrypoints/model_free/__init__.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,30 @@
import os
import shutil
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from typing import Iterable, Optional

import torch
import tqdm
from compressed_tensors.entrypoints.convert import (
Converter,
exec_jobs,
get_checkpoint_files,
is_weights_file,
update_safetensors_index,
)
from compressed_tensors.quantization import QuantizationScheme
from loguru import logger

from llmcompressor.entrypoints.model_free.helpers import gpu_if_available
from llmcompressor.entrypoints.model_free.microscale import (
is_microscale_scheme,
)
from llmcompressor.entrypoints.model_free.model_utils import (
get_checkpoint_files,
is_weights_file,
)
from llmcompressor.entrypoints.model_free.process import (
process_file,
process_file_microscale_scheme,
validate_file,
)
from llmcompressor.entrypoints.model_free.save_utils import (
update_config,
update_safetensors_index,
)
from llmcompressor.entrypoints.model_free.validate import (
validate_safetensors_index,
Expand All @@ -41,6 +41,7 @@ def model_free_ptq(
ignore: Iterable[str] = tuple(),
max_workers: int = 1,
device: Optional[torch.device | str] = None,
converter: Converter | None = None,
):
"""
Quantize a model without the need for a model definition. This function operates on
Expand All @@ -52,6 +53,10 @@ def model_free_ptq(
ignored
:param max_workers: number of worker threads to process files with
:param device: gpu device to accelerate quantization with
:param converter: optional converter to apply to the checkpoint to convert it to
compressed-tensors format before running model-free PTQ
e.g. conversion of some layers from modelopt format to compressed-tensors
See compressed-tensors convert_checkpoint entrypoint for more information
"""
# validate arguments
model_files = get_checkpoint_files(model_stub)
Expand All @@ -70,7 +75,9 @@ def model_free_ptq(
save_path = Path(save_directory) / file_path

if file_path.endswith("safetensors"):
jobs.append((job_fn, resolved_path, save_path, scheme, ignore, device))
jobs.append(
(job_fn, resolved_path, save_path, scheme, ignore, device, converter)
)

else:
if is_weights_file(file_path):
Expand All @@ -79,25 +86,19 @@ def model_free_ptq(
logger.info(f"Copying {file_path} {save_path}")
shutil.copyfile(resolved_path, save_path)

with ThreadPoolExecutor(max_workers) as executor:
# 1. validate quantizable tensors fail fast before long-running quantization
futures = [executor.submit(validate_file, *job[1:]) for job in jobs]
for future in tqdm.tqdm(
as_completed(futures), total=len(futures), desc="Validating"
):
future.result()
# 1. validate quantizable tensors fail fast before long-running quantization
exec_jobs(
[(validate_file, *job[1:]) for job in jobs], max_workers, desc="Validating"
)

# 2-5. quantize and compress weights
total_size = 0
weight_map = dict()
futures = [executor.submit(*job) for job in jobs]
for future in tqdm.tqdm(
as_completed(futures), total=len(futures), desc="Quantizing"
):
_total_size, _weight_map = future.result()
total_size += _total_size
weight_map.update(_weight_map)
# 2-5. quantize and compress weights
total_size = 0
weight_map = dict()
quantize_results = exec_jobs(jobs, max_workers, desc="Quantizing")
for _total_size, _weight_map in quantize_results:
total_size += _total_size
weight_map.update(_weight_map)

# 5. update config and safetensors index
update_config(save_directory, scheme_name, scheme, ignore)
update_config(save_directory, scheme_name, scheme, ignore, converter)
update_safetensors_index(save_directory, total_size, weight_map)
20 changes: 0 additions & 20 deletions src/llmcompressor/entrypoints/model_free/helpers.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,13 @@
import os
import re
from collections import defaultdict
from typing import Mapping, TypeVar

import torch
from compressed_tensors.utils.match import match_name
from loguru import logger
from transformers.file_utils import CONFIG_NAME

__all__ = [
"gpu_if_available",
"find_safetensors_index_path",
"find_config_path",
"find_safetensors_index_file",
"match_names_set_eager",
"MatchedNamesSet",
Expand Down Expand Up @@ -43,22 +39,6 @@ def gpu_if_available(device: torch.device | str | None) -> torch.device:
return torch.device("cpu")


def find_safetensors_index_path(save_directory: str | os.PathLike) -> str | None:
for file_name in os.listdir(save_directory):
if file_name.endswith("safetensors.index.json"):
return os.path.join(save_directory, file_name)

return None


def find_config_path(save_directory: str | os.PathLike) -> str | None:
for file_name in os.listdir(save_directory):
if file_name in (CONFIG_NAME, "params.json"):
return os.path.join(save_directory, file_name)

return None


def find_safetensors_index_file(model_files: dict[str, str]) -> str | None:
for file_path, resolved_path in model_files.items():
if file_path.endswith("safetensors.index.json"):
Expand Down
48 changes: 0 additions & 48 deletions src/llmcompressor/entrypoints/model_free/model_utils.py

This file was deleted.

49 changes: 29 additions & 20 deletions src/llmcompressor/entrypoints/model_free/process.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
import os
from collections import defaultdict
from collections.abc import Iterator, Mapping
from typing import Iterable

import torch
from compressed_tensors.entrypoints.convert import Converter
from compressed_tensors.quantization import QuantizationScheme
from compressed_tensors.utils.match import match_name
from compressed_tensors.utils import match_quantizable_tensors
from safetensors.torch import load_file, save_file
from torch.nn import Module

Expand All @@ -21,21 +21,11 @@
is_microscale_scheme,
)

__all__ = ["validate_file", "process_file", "process_file_microscale_scheme"]


def iter_quantizable_tensors(
tensors: Mapping[str, torch.Tensor],
ignore: Iterable[str],
) -> Iterator[tuple[str, str]]:
for name in list(tensors.keys()):
module_name, param_name = name.rsplit(".", 1)
is_linear_weight = param_name == "weight" and not module_name.endswith("norm")
is_ignored = any(match_name(module_name, ign) for ign in ignore)
if not is_linear_weight or is_ignored:
continue

yield module_name, name
__all__ = [
"validate_file",
"process_file",
"process_file_microscale_scheme",
]


def validate_file(
Expand All @@ -44,6 +34,7 @@ def validate_file(
scheme: QuantizationScheme,
ignore: Iterable[str],
device: str | torch.device,
converter: Converter | None = None,
):
"""
Validate that each quantizable tensor in a safetensors file can be quantized.
Expand All @@ -52,10 +43,15 @@ def validate_file(
:param scheme: quantization scheme to apply to tensors
:param ignore: modules to ignore. Modules ending with "norm" are automatically
ignored
:param converter: optional converter to apply to the checkpoint,
e.g. conversion of some layers from some format to compressed-tensors
"""
tensors = load_file(file_path)

for _, name in iter_quantizable_tensors(tensors, ignore):
if converter is not None:
converter.validate(tensors)

for _, name in match_quantizable_tensors(tensors, ignore, scheme.targets):
validate_weight_for_quantization(tensors[name], scheme, name)


Expand All @@ -65,6 +61,7 @@ def process_file(
scheme: QuantizationScheme,
ignore: Iterable[str],
device: str | torch.device,
converter: Converter | None = None,
) -> tuple[int, dict[str, str]]:
"""
Quantize and compress tensors in a given safetensors file
Expand All @@ -75,11 +72,16 @@ def process_file(
:param ignore: modules to ignore. Modules ending with "norm" are automatically
ignored
:param device: device used to quantize and compress weights
:param converter: optional converter to apply to the checkpoint,
e.g. conversion of some layers from some format to compressed-tensors
"""
assert not is_microscale_scheme(scheme), "Use `_process_file_microscale_scheme`"
tensors = load_file(file_path)

for module_name, name in iter_quantizable_tensors(tensors, ignore):
if converter is not None:
converter.process(tensors)

for module_name, name in match_quantizable_tensors(tensors, ignore, scheme.targets):
validate_weight_for_quantization(tensors[name], scheme, name)

# 1. initialize module with qparams (on device)
Expand Down Expand Up @@ -109,6 +111,7 @@ def process_file_microscale_scheme(
scheme: QuantizationScheme,
ignore: Iterable[str],
device: str | torch.device,
converter: Converter | None = None,
) -> tuple[int, dict[str, str]]:
"""
Quantize and compress tensors in a given safetensors file
Expand All @@ -119,9 +122,15 @@ def process_file_microscale_scheme(
:param ignore: modules to ignore. Modules ending with "norm" are automatically
ignored
:param device: device used to quantize and compress weights
:param converter: optional converter to apply to the checkpoint,
e.g. conversion of some layers from some format to compressed-tensors
"""
assert is_microscale_scheme(scheme), "Use `_process_file` for non-microscale scheme"
tensors = load_file(file_path)

if converter is not None:
converter.process(tensors)

fused_sets, unmatched_sets = get_fused_names(tensors)
assert len(unmatched_sets) <= 0 # should be caught by `validate_safetensors_index`

Expand All @@ -135,7 +144,7 @@ def process_file_microscale_scheme(
}
fused_modules = defaultdict(dict)

for module_name, name in iter_quantizable_tensors(tensors, ignore):
for module_name, name in match_quantizable_tensors(tensors, ignore, scheme.targets):
validate_weight_for_quantization(tensors[name], scheme, name)

# 1. initialize module with qparams (on device)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@

import torch
import tqdm
from compressed_tensors.entrypoints.convert import (
get_checkpoint_files,
is_weights_file,
)
from loguru import logger
from safetensors.torch import load_file, save_file

Expand All @@ -15,10 +19,6 @@
invert_mapping,
)
from llmcompressor.entrypoints.model_free.microscale import get_fused_names
from llmcompressor.entrypoints.model_free.model_utils import (
get_checkpoint_files,
is_weights_file,
)
from llmcompressor.entrypoints.model_free.save_utils import update_safetensors_index


Expand Down
Loading
Loading