Skip to content
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion examples/quantization_w8a8_fp8/qwen3_reranker_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@
scores = outputs.logits[:, -1, :].max(dim=-1).values

for i, (doc, score) in enumerate(zip(documents, scores)):
print(f"Document {i+1} score: {score.item():.4f}")
print(f"Document {i + 1} score: {score.item():.4f}")
print(f" Content: {doc[:80]}...")

print("==========================================")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ def tokenize(sample):
logger.info(f"Calibration: {NUM_CALIBRATION_SAMPLES} samples total")
logger.info(f"Samples/rank: {samples_per_rank}")
logger.info(f"World size: {world_size}")
logger.info(f"Total time: {elapsed:.1f}s ({elapsed/60:.2f} min)")
logger.info(f"Total time: {elapsed:.1f}s ({elapsed / 60:.2f} min)")
logger.info(f"Peak GPU mem: {peak_mem_gb:.2f} GB (rank 0)")
logger.info("=" * 60)

Expand Down
6 changes: 3 additions & 3 deletions src/llmcompressor/core/lifecycle.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,9 +206,9 @@ def event(
if data is not None:
mod_data.append(data)

assert (
event is not None
), f"Event lifecycle did not return an event for {event_type}"
assert event is not None, (
f"Event lifecycle did not return an event for {event_type}"
)

return mod_data

Expand Down
6 changes: 3 additions & 3 deletions src/llmcompressor/datasets/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -422,9 +422,9 @@ def get_rank_partition(split: str, num_samples: int) -> str:
we give each device at least S//D samples and distribute
the remaining samples as evenly as possible across all devices
"""
assert (
"[" not in split
), "Split string should not already contain partitioning brackets"
assert "[" not in split, (
"Split string should not already contain partitioning brackets"
)

start, end = _get_partition_start_end(
num_samples, dist.get_rank(), dist.get_world_size()
Expand Down
6 changes: 3 additions & 3 deletions src/llmcompressor/modeling/gpt_oss.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,9 +120,9 @@ def forward(
This is compatible with the GPT-OSS MoE call pattern:
experts(hidden_states, router_indices, routing_weights)
"""
assert (
routing_weights is not None and router_indices is not None
), "router inputs required"
assert routing_weights is not None and router_indices is not None, (
"router inputs required"
)

# Normalize shapes to [tokens, H], [tokens, top_k], [tokens, E]
if hidden_states.dim() == 3:
Expand Down
6 changes: 3 additions & 3 deletions src/llmcompressor/modifiers/autoround/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -492,9 +492,9 @@ def _mapping_config_to_autoround(self):
)

for scheme in resolved_config.config_groups.values():
assert isinstance(
scheme, QuantizationScheme
), f"Expected QuantizationScheme, got {type(scheme)}"
assert isinstance(scheme, QuantizationScheme), (
f"Expected QuantizationScheme, got {type(scheme)}"
)
quant_scheme = scheme
weight_args = quant_scheme.weights
activation_args = quant_scheme.input_activations
Expand Down
6 changes: 3 additions & 3 deletions src/llmcompressor/modifiers/awq/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -805,9 +805,9 @@ def _compute_best_scale(
}
)

assert (
torch.isnan(best_scales).sum() == 0
), f"Nan found in scales: {best_scales}"
assert torch.isnan(best_scales).sum() == 0, (
f"Nan found in scales: {best_scales}"
)

return best_scales.detach().cpu()

Expand Down
24 changes: 2 additions & 22 deletions src/llmcompressor/pipelines/sequential/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@
from dataclasses import dataclass
from functools import wraps
from types import FunctionType, MethodType
from typing import TYPE_CHECKING, Any, Callable, Optional
from typing import TYPE_CHECKING, Any, Callable

import torch
from accelerate.hooks import remove_hook_from_module
from compressed_tensors.offload import disable_onloading, offload_model
from compressed_tensors.offload import disable_onloading
from compressed_tensors.utils import patch_attr
from compressed_tensors.utils.match import match_named_modules
from loguru import logger
Expand All @@ -22,7 +22,6 @@
from llmcompressor.modifiers import Modifier
from llmcompressor.modifiers.utils.hooks import HooksMixin
from llmcompressor.pipelines.sequential.transformers_helpers import HFTracer
from llmcompressor.utils.dev import get_main_device
from llmcompressor.utils.helpers import calibration_forward_context
from llmcompressor.utils.pytorch.module import get_no_split_params

Expand All @@ -35,7 +34,6 @@
"trace_subgraphs",
"Subgraph",
"get_sequential_targets",
"dispatch_for_sequential",
"handle_sequential_oom",
]

Expand Down Expand Up @@ -516,24 +514,6 @@ def is_ancestor(module: Module) -> bool:
return ancestors


def dispatch_for_sequential(
    model: PreTrainedModel,
    onload_device: Optional[torch.device | str] = None,
    offload_device: Optional[torch.device | str] = None,
) -> PreTrainedModel:
    """
    Prepare a model for calibration with the sequential pipeline by handing it
    to `offload_model` together with the chosen onload and offload devices.

    NOTE(review): how the model is actually placed (and whether existing hooks
    are removed) is decided inside `offload_model`, which is not visible
    here -- confirm against compressed_tensors.offload.

    :param model: model to dispatch
    :param onload_device: device passed to `offload_model` as the onload
        device; when None, the result of `get_main_device()` is used
    :param offload_device: forwarded unchanged to `offload_model`
        (None is passed through as-is)
    :return: whatever `offload_model` returns for the given model
    """
    # Resolve the fallback lazily so get_main_device() is only queried when the
    # caller did not choose an onload device.
    target_device = get_main_device() if onload_device is None else onload_device
    return offload_model(model, target_device, offload_device)


def _get_autowrap_functions() -> tuple[Callable[[Any], Any], ...]:
try:
from transformers.masking_utils import LAYER_PATTERN_TO_MASK_FUNCTION_MAPPING
Expand Down
4 changes: 2 additions & 2 deletions src/llmcompressor/pipelines/sequential/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from typing import TYPE_CHECKING, Iterator

import torch
from compressed_tensors.offload import set_onload_device
from compressed_tensors.utils import disable_offloading
from torch.utils.data.dataloader import DataLoader
from tqdm import tqdm
Expand All @@ -11,7 +12,6 @@
from llmcompressor.pipelines.cache import IntermediatesCache
from llmcompressor.pipelines.registry import CalibrationPipeline
from llmcompressor.pipelines.sequential.helpers import (
dispatch_for_sequential,
get_sequential_targets,
handle_sequential_oom,
trace_subgraphs,
Expand Down Expand Up @@ -89,7 +89,7 @@ def __call__(
# prepare model for sequential onloading
onload_device = get_main_device()
offload_device = torch.device(dataset_args.sequential_offload_device)
dispatch_for_sequential(model, onload_device)
set_onload_device(model, onload_device)

# prepare to trace subgraphs
modifiers = session.lifecycle.recipe.modifiers
Expand Down
6 changes: 3 additions & 3 deletions tests/llmcompressor/datasets/test_length_aware_sampler.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,6 @@ def test_tokens_added_calculation(self):
LengthAwareSampler(dataset, batch_size=2)

debug_calls = [str(c) for c in mock_logger.debug.call_args_list]
assert any(
"added (padding): 150" in c for c in debug_calls
), f"Expected 'added (padding): 150' in {debug_calls}"
assert any("added (padding): 150" in c for c in debug_calls), (
f"Expected 'added (padding): 150' in {debug_calls}"
)
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,6 @@ def test_convert_checkpoint(tmp_path):
data = json.load(f)
keys = data["weight_map"].keys()
for key in keys:
assert any(
key.endswith(suffix) for suffix in allowed_suffixes
), f"Unexpected key found: {key}"
assert any(key.endswith(suffix) for suffix in allowed_suffixes), (
f"Unexpected key found: {key}"
)
6 changes: 3 additions & 3 deletions tests/llmcompressor/modeling/test_calib_deepseek_v3.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,9 +56,9 @@ def hook_fn(i, module, input, output):
_ = moe_layer(sample)

# Assert all experts are used
assert all(
expert_triggered
), f"Not all experts were triggered: {expert_triggered}"
assert all(expert_triggered), (
f"Not all experts were triggered: {expert_triggered}"
)


@requires_gpu
Expand Down
6 changes: 3 additions & 3 deletions tests/llmcompressor/modeling/test_calib_glm4_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,9 +61,9 @@ def hook_fn(i, module, input, output):
_ = moe_layer(sample)

# Assert all experts are used
assert all(
expert_triggered
), f"Not all experts were triggered: {expert_triggered}"
assert all(expert_triggered), (
f"Not all experts were triggered: {expert_triggered}"
)


@requires_gpu
Expand Down
6 changes: 3 additions & 3 deletions tests/llmcompressor/modeling/test_calib_llama4.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,9 @@ def hook_fn(i, module, input, output):
_ = moe_layer(sample)

# Assert all experts are used
assert all(
expert_triggered
), f"Not all experts were triggered: {expert_triggered}"
assert all(expert_triggered), (
f"Not all experts were triggered: {expert_triggered}"
)


@requires_gpu
Expand Down
6 changes: 3 additions & 3 deletions tests/llmcompressor/modeling/test_calib_qwen3.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,9 @@ def hook_fn(i, module, input, output):
_ = moe_layer(sample)

# Assert all experts are used
assert all(
expert_triggered
), f"Not all experts were triggered: {expert_triggered}"
assert all(expert_triggered), (
f"Not all experts were triggered: {expert_triggered}"
)


@requires_gpu
Expand Down
6 changes: 3 additions & 3 deletions tests/llmcompressor/modeling/test_calib_qwen3_next.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,9 +67,9 @@ def hook_fn(i, module, input, output):
_ = moe_layer(sample)

# Assert all experts are used
assert all(
expert_triggered
), f"Not all experts were triggered: {expert_triggered}"
assert all(expert_triggered), (
f"Not all experts were triggered: {expert_triggered}"
)


@requires_gpu
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,9 @@ def test_logarithmic_equalization_is_registered():
mappings=mappings,
)

assert isinstance(
modifier, LogarithmicEqualizationModifier
), "PyTorch LogarithmicEqualizationModifier not registered"
assert isinstance(modifier, LogarithmicEqualizationModifier), (
"PyTorch LogarithmicEqualizationModifier not registered"
)
assert isinstance(modifier, SmoothQuantModifier)
assert modifier.smoothing_strength == smoothing_strength
assert modifier.mappings == mappings
6 changes: 3 additions & 3 deletions tests/llmcompressor/modifiers/pruning/sparsegpt/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,6 @@ def test_sparse_gpt_is_registered():
targets=targets,
)

assert isinstance(
type_, SparseGPTModifier
), "PyTorch SparseGPTModifier not registered"
assert isinstance(type_, SparseGPTModifier), (
"PyTorch SparseGPTModifier not registered"
)
6 changes: 3 additions & 3 deletions tests/llmcompressor/modifiers/pruning/wanda/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,6 @@ def test_wanda_is_registered():
targets=targets,
)

assert isinstance(
type_, WandaPruningModifier
), "PyTorch WandaPruningModifier not registered"
assert isinstance(type_, WandaPruningModifier), (
"PyTorch WandaPruningModifier not registered"
)
Loading
Loading