
Commit 3da59a0

Merge remote-tracking branch 'origin' into kylesayrs/transform_save
2 parents c6abb96 + b163bd9 commit 3da59a0

File tree

17 files changed (+236, -95 lines)


.github/actions/test/action.yml

Lines changed: 32 additions & 0 deletions
@@ -7,6 +7,10 @@ inputs:
   suitename:
     description: "test suite name"
     required: true
+  code_coverage:
+    description: whether to collect code coverage metrics during test run
+    type: boolean
+    default: false
 outputs:
   status:
     description: "final status from test"
@@ -44,9 +48,37 @@ runs:
       run: |
         source ${{ inputs.venv }}/bin/activate
         rm -rf src
+
+        if [[ "${ENABLE_COVERAGE}" == "true" ]]; then
+          echo "::group::Installing code coverage requirements via pip"
+          pip install bashlex https://github.com/neuralmagic/pytest-nm-releng/archive/v0.4.0.tar.gz
+          pip install coverage pytest-cov
+
+          # Adding Code coverage to the tests
+          nmre-generate-coverage-flags --package "compressed_tensors" --output-file ".coverage_flags.sh"
+          source .coverage_flags.sh
+          echo "::endgroup::"
+        fi
+
+        echo "::group::running tests"
+        echo "PYTEST_ADDOPTS set to: ${PYTEST_ADDOPTS}"
+
         SUCCESS=0
         pytest tests --junitxml=test-results/report.xml -o junit_suite_name="${{ inputs.suitename }}" || SUCCESS=$?
         echo "status=${SUCCESS}" >> "$GITHUB_OUTPUT"
+        echo "::endgroup::"
+
+        if [[ "${ENABLE_COVERAGE}" == "true" ]]; then
+          echo "::group::consolidating coverage reports"
+          mkdir -p coverage-results
+          mv .coverage coverage-results/ || echo ".coverage file not found"
+          mv coverage-html coverage-results/ || echo "coverage-html folder not found"
+          mv coverage.json coverage-results/ || echo "coverage.json file not found"
+          echo "::endgroup::"
+        fi
+
         deactivate
         exit ${SUCCESS}
       shell: bash
+      env:
+        ENABLE_COVERAGE: ${{ inputs.code_coverage || false }}

.github/workflows/test.yml

Lines changed: 17 additions & 0 deletions
@@ -25,6 +25,10 @@ on:
       run_id:
         description: run id of the BUILD job that generated the assets
         type: string
+      code_coverage:
+        description: whether to collect code coverage metrics during test run
+        type: boolean
+        default: false
 
   # makes workflow manually callable
   workflow_dispatch:
@@ -51,6 +55,10 @@ on:
       run_id:
         description: run id of the BUILD job that generated the assets
         type: string
+      code_coverage:
+        description: whether to collect code coverage metrics during test run
+        type: boolean
+        default: false
 
 jobs:
 
@@ -124,6 +132,7 @@ jobs:
         with:
           venv: ${{ steps.create_venv.outputs.penv }}
          suitename: test-${{ inputs.python }}-${{ inputs.test_label }}
+          code_coverage: ${{ inputs.code_coverage }}
 
       - name: summary
         uses: neuralmagic/nm-actions/actions/[email protected]
@@ -146,3 +155,11 @@ jobs:
           name: report-${{ inputs.test_label }}.xml
           path: test-results/report.xml
           retention-days: 5
+
+      - name: upload coverage report
+        uses: actions/upload-artifact@v4
+        if: (success() || failure()) && inputs.code_coverage
+        with:
+          name: coverage-results
+          path: coverage-results/*
+          retention-days: 5

src/compressed_tensors/compressors/model_compressors/model_compressor.py

Lines changed: 12 additions & 8 deletions
@@ -42,10 +42,7 @@
     load_pretrained_quantization_parameters,
 )
 from compressed_tensors.quantization.lifecycle import expand_target_names
-from compressed_tensors.quantization.utils import (
-    is_module_quantized,
-    iter_named_leaf_modules,
-)
+from compressed_tensors.quantization.utils import is_module_quantized
 from compressed_tensors.utils import (
     align_module_device,
     delete_offload_parameter,
@@ -393,9 +390,16 @@ def compress_model(self, model: Module):
         )
 
         for prefix, module in tqdm(model.named_modules(), desc="Compressing model"):
+
             if prefix in module_to_scheme or prefix in sparse_compression_targets:
+                module_device = get_execution_device(module).type
+                is_meta = (module_device == "meta")
+
+                exec_device = "meta" if is_meta else "cpu"
+                onloading_device = "meta" if is_meta else module_device
+
                 # in the future, support compression on same device
-                with align_module_device(module, execution_device="cpu"):
+                with align_module_device(module, execution_device=exec_device):
                     state_dict = module.state_dict(prefix=f"{prefix}.")
 
                     # quantization first
@@ -404,6 +408,7 @@ def compress_model(self, model: Module):
                         state_dict,
                         names_to_scheme=module_to_scheme,
                         show_progress=False,
+                        compression_device=exec_device,
                     )
 
                     # sparsity second
@@ -415,15 +420,14 @@ def compress_model(self, model: Module):
                     )
 
                 # remove any existing parameters
-                exec_device = get_execution_device(module)
                 offload_device = get_offloaded_device(module)
                 for name, _ in list(module.named_parameters()):
                     delete_offload_parameter(module, name)
 
                 # replace with compressed parameters
                 for name, value in state_dict.items():
                     name = name.removeprefix(f"{prefix}.")
-                    value = value.to(exec_device)
+                    value = value.to(onloading_device)
                     param = torch.nn.Parameter(value, requires_grad=False)
                     register_offload_parameter(module, name, param, offload_device)
 
@@ -747,7 +751,7 @@ def map_module_to_scheme(model: Module) -> Dict[str, QuantizationScheme]:
     """
     return {
         fix_fsdp_module_name(name): module.quantization_scheme
-        for name, module in iter_named_leaf_modules(model)
+        for name, module in model.named_modules()
         if is_module_quantized(module)
     }
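Note: the following is an illustrative sketch, not code from this commit. It isolates the device-selection logic that compress_model now uses; the library's get_execution_device helper is approximated by reading the first parameter's device, an assumption made only for this example.

import torch

def choose_devices(module: torch.nn.Module) -> tuple:
    # Modules already on the "meta" device are compressed and re-onloaded on
    # "meta"; everything else is compressed on "cpu" and onloaded back to its
    # original device, mirroring the compress_model change above.
    module_device = next(module.parameters()).device.type
    is_meta = module_device == "meta"
    exec_device = "meta" if is_meta else "cpu"
    onloading_device = "meta" if is_meta else module_device
    return exec_device, onloading_device

with torch.device("meta"):
    meta_layer = torch.nn.Linear(8, 8)

print(choose_devices(meta_layer))              # ('meta', 'meta')
print(choose_devices(torch.nn.Linear(8, 8)))   # ('cpu', 'cpu')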

src/compressed_tensors/compressors/quantized_compressors/base.py

Lines changed: 6 additions & 7 deletions
@@ -72,6 +72,7 @@ def compress(
         model_state: Dict[str, Tensor],
         names_to_scheme: Dict[str, QuantizationScheme],
         show_progress: bool = False,
+        compression_device: str = "cpu",
         **kwargs,
     ) -> Dict[str, Tensor]:
         """
@@ -85,7 +86,6 @@ def compress(
         """
         uncompressed_names = list(model_state.keys())
         compressed_dict = {}
-        save_device = "cpu"
 
         # compress values
         desc = "Compressing with quantization"
@@ -104,10 +104,10 @@ def compress(
 
                 # is scale does not exist, then weight cannot be compressed
                 if scale is None:
-                    compressed_dict[name] = value.to(save_device)
+                    compressed_dict[name] = value.to(compression_device)
                     continue
 
-                # compress values on cpu (memory movement too expensive)
+                # compress values on meta if loading from meta otherwise on cpu (memory movement too expensive)
                 module_path = prefix[:-1] if prefix.endswith(".") else prefix
                 quant_args = names_to_scheme[module_path].weights
                 compressed_values = self.compress_weight(
@@ -117,12 +117,12 @@ def compress(
                     global_scale=global_scale,
                     g_idx=g_idx,
                     quantization_args=quant_args,
-                    device="cpu",
+                    device=compression_device,
                 )
 
                 # update state dict
                 for key, value in compressed_values.items():
-                    compressed_dict[prefix + key] = value.to(save_device)
+                    compressed_dict[prefix + key] = value.to(compression_device)
 
             else:
                 # omit saving zero points for symmetric or packed quantization
@@ -133,8 +133,7 @@ def compress(
                 # TODO: does this case actually occur?
                 elif name.endswith("g_idx") and torch.any(value <= -1):
                     continue
-
-                compressed_dict[name] = value.to(save_device)
+                compressed_dict[name] = value.to(compression_device)
 
         return compressed_dict
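A small sketch (assumptions only, not from this commit) of why threading compression_device through compress() is enough to support meta-loaded models: .to("meta") preserves shape and dtype without allocating storage, so the compressed state dict can be assembled without touching real memory. The key name below is hypothetical, used only for illustration.

import torch

weight = torch.empty(128, 128, device="meta")
compression_device = "meta" if weight.is_meta else "cpu"

# hypothetical state-dict key, for illustration only
compressed_dict = {"layer.weight_packed": weight.to(compression_device)}
print(compressed_dict["layer.weight_packed"].device)  # meta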

src/compressed_tensors/compressors/quantized_compressors/pack_quantized.py

Lines changed: 22 additions & 18 deletions
@@ -220,30 +220,34 @@ def pack_to_int32(
     if num_bits < 1:
         raise ValueError(f"num_bits must be at least 1, got {num_bits}")
 
-    # convert to unsigned for packing
+    # Convert to unsigned range for packing, matching quantization offset
     offset = 1 << (num_bits - 1)
     value = (value + offset).to(torch.uint8)
-    value = value.cpu().numpy().astype(np.uint32)
+    device = value.device
+
     pack_factor = 32 // num_bits
 
-    # pad input tensor and initialize packed output
-    packed_size = math.ceil(value.shape[packed_dim] / pack_factor)
-    padding = packed_size * pack_factor - value.shape[packed_dim]
-    value = np.pad(value, pad_width=[(0, 0), (0, padding)], constant_values=0)
+    if packed_dim == 0:
+        value = value.transpose(0, 1)
 
-    # pack values
-    if packed_dim == 1:
-        packed = np.zeros((value.shape[0], packed_size), dtype=np.uint32)
-        for i in range(pack_factor):
-            packed |= value[:, i::pack_factor] << num_bits * i
-    else:
-        packed = np.zeros((packed_size, value.shape[1]), dtype=np.uint32)
-        for i in range(pack_factor):
-            packed |= value[i::pack_factor, :] << num_bits * i
+    rows, cols = value.shape
+    padded_cols = math.ceil(cols / pack_factor) * pack_factor
+    pad_len = padded_cols - cols
+
+    if pad_len > 0:
+        value = torch.nn.functional.pad(value, (0, pad_len))
+
+    num_groups = padded_cols // pack_factor
+
+    # Use int32 here
+    reshaped = value.view(rows, num_groups, pack_factor).to(torch.int32)
+    bit_shifts = torch.arange(pack_factor, device=device, dtype=torch.int32) * num_bits
+    packed = (reshaped << bit_shifts).sum(dim=2, dtype=torch.int32)
+
+    if packed_dim == 0:
+        packed = packed.transpose(0, 1)
 
-    # convert back to signed and torch
-    packed = np.ascontiguousarray(packed).view(np.int32)
-    return torch.from_numpy(packed)
+    return packed
 
 
 def unpack_from_int32(
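As a sanity sketch (not part of the commit), the vectorized shift-and-sum used by the new pack_to_int32 produces the same bit pattern as the removed per-slot OR loop, since each 4-bit value lands in a disjoint slot of the int32:

import torch

num_bits = 4
offset = 1 << (num_bits - 1)      # maps the signed range [-8, 7] onto [0, 15]
pack_factor = 32 // num_bits      # eight 4-bit values per int32

values = torch.tensor([[-8, -1, 0, 1, 2, 3, 6, 7]])   # one already-padded row
unsigned = (values + offset).to(torch.int32)

# vectorized packing, as in the updated implementation
shifts = torch.arange(pack_factor, dtype=torch.int32) * num_bits
packed = (unsigned << shifts).sum(dim=1, dtype=torch.int32)

# the old per-slot loop, re-expressed in torch, gives an identical result
expected = torch.zeros(1, dtype=torch.int32)
for i in range(pack_factor):
    expected |= unsigned[:, i] << (num_bits * i)

assert torch.equal(packed, expected)
print(packed.shape)  # torch.Size([1]): one int32 per row of eight 4-bit values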

src/compressed_tensors/compressors/sparse_compressors/sparse_24_bitmask.py

Lines changed: 19 additions & 5 deletions
@@ -56,8 +56,10 @@ def compress_weight(self, name, value):
         bitmask_tensor = Sparse24BitMaskTensor.from_dense(
             value, self.config.sparsity_structure
         )
-        bitmask_dict = bitmask_tensor.dict(name_prefix=name, device="cpu")
-        return bitmask_dict
+        return bitmask_tensor.dict(
+            name_prefix=name,
+            device="meta" if value.is_meta else "cpu",
+        )
 
     def decompress_weight(self, weight_data):
         data = Sparse24BitMaskTensor.from_compressed_data(**weight_data)
@@ -90,9 +92,14 @@ def from_dense(
         :return: instantiated compressed tensor
         """
         shape = list(tensor.shape)
-        compressed, bitmask = sparse24_bitmask_compress(
-            tensor.cpu(), sparsity_structure=sparsity_structure
-        )
+        if tensor.is_meta:
+            compressed, bitmask = sparse24_bitmask_compress(
+                tensor, sparsity_structure=sparsity_structure
+            )
+        else:
+            compressed, bitmask = sparse24_bitmask_compress(
+                tensor.cpu(), sparsity_structure=sparsity_structure
+            )
         return Sparse24BitMaskTensor(
             shape=shape,
             compressed=compressed,
@@ -169,6 +176,13 @@ def sparse24_bitmask_compress(
         SparsityStructure(sparsity_structure) == SparsityStructure.TWO_FOUR
     ), "Only 2:4 sparsity is supported"
 
+    if tensor.is_meta:
+        num_rows, num_cols = tensor.shape
+        compressed_values = torch.empty((num_rows, num_cols // 2), dtype=tensor.dtype, device="meta")
+        packed_cols = (num_cols + 7) // 8
+        bitmasks_packed = torch.empty((num_rows, packed_cols), dtype=torch.uint8, device="meta")
+        return compressed_values, bitmasks_packed
+
     bytemasks = get_24_bytemasks(tensor=tensor)
 
     if tensor.dtype == FP8_DTYPE:
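For reference, a minimal sketch (not from the commit) of the shape math behind the meta-tensor shortcut above: 2:4 sparsity keeps half of the values in each row, and the bitmask stores one bit per original element, packed into uint8 bytes.

import torch

num_rows, num_cols = 4, 16
compressed_values = torch.empty((num_rows, num_cols // 2), dtype=torch.bfloat16, device="meta")
bitmask = torch.empty((num_rows, (num_cols + 7) // 8), dtype=torch.uint8, device="meta")

print(compressed_values.shape)  # torch.Size([4, 8]): half the elements survive 2:4 pruning
print(bitmask.shape)            # torch.Size([4, 2]): ceil(16 / 8) bytes per row
print(bitmask.is_meta)          # True: only shapes and dtypes are tracked, no memory is allocated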

src/compressed_tensors/quantization/lifecycle/apply.py

Lines changed: 8 additions & 5 deletions
@@ -38,8 +38,6 @@
     KV_CACHE_TARGETS,
     infer_quantization_status,
     is_kv_cache_quant_scheme,
-    iter_named_leaf_modules,
-    iter_named_quantizable_modules,
 )
 from compressed_tensors.utils.helpers import fix_fsdp_module_name, replace_module
 from compressed_tensors.utils.offload import update_parameter_data
@@ -87,7 +85,7 @@ def load_pretrained_quantization_parameters(
     model_path = get_safetensors_folder(model_name_or_path)
     mapping = get_quantization_parameter_to_path_mapping(model_path)
 
-    for name, submodule in iter_named_leaf_modules(model):
+    for name, submodule in model.named_modules():
         if not is_module_quantized(submodule):
             continue
         if submodule.quantization_scheme.input_activations is not None:
@@ -152,7 +150,7 @@ def apply_quantization_config(
     # list of submodules to ignore
     ignored_submodules = defaultdict(list)
     # mark appropriate layers for quantization by setting their quantization schemes
-    for name, submodule in model.named_modules():  # child modules and attention modules
+    for name, submodule in model.named_modules():
         # potentially fix module name to remove FSDP wrapper prefix
         name = fix_fsdp_module_name(name)
         if matches := find_name_or_class_matches(name, submodule, config.ignore):
@@ -283,7 +281,7 @@ def expand_target_names(
     """
     return {
         name
-        for name, module in iter_named_leaf_modules(model)
+        for name, module in model.named_modules()
         if is_target(name, module, targets, ignore)
     }
 
@@ -324,6 +322,11 @@ def find_name_or_class_matches(
     2. matches on regex patterns
     3. matches on module names
     """
+    from compressed_tensors import InternalModule
+
+    if isinstance(module, InternalModule):
+        return []
+
     targets = sorted(targets, key=lambda x: ("re:" in x, x))
     if isinstance(targets, Iterable):
         matches = _find_matches(name, targets) + _find_matches(
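The replacement of the dedicated leaf-module iterators with plain named_modules() plus a filter can be sketched as follows (illustrative stand-in only; the attribute value assigned below is a placeholder marker, not a real QuantizationScheme):

import torch

def iter_quantized_modules(model: torch.nn.Module):
    # walk every submodule and keep only those carrying a quantization_scheme
    for name, submodule in model.named_modules():
        if getattr(submodule, "quantization_scheme", None) is not None:
            yield name, submodule

model = torch.nn.Sequential(torch.nn.Linear(4, 4), torch.nn.ReLU(), torch.nn.Linear(4, 2))
model[0].quantization_scheme = "placeholder-scheme"
print([name for name, _ in iter_quantized_modules(model)])  # ['0']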

src/compressed_tensors/quantization/lifecycle/initialize.py

Lines changed: 1 addition & 1 deletion
@@ -189,7 +189,7 @@ def _initialize_scale_zero_point(
     else:
         # TODO: consider erroring out in the future as if the dtype if not one of these,
         # there is likely bug
-        if scale_dtype not in [torch.float16, torch.bfloat16, torch.float32]:
+        if scale_dtype not in [torch.float16, torch.bfloat16, torch.float32, torch.float64]:
             scale_dtype = torch.float16
         zp_dtype = quantization_args.pytorch_dtype()
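A one-function sketch (an assumption for illustration, not library code) of the widened dtype check above: float64 scales are now kept as-is instead of being silently downcast to float16.

import torch

def resolve_scale_dtype(scale_dtype: torch.dtype) -> torch.dtype:
    # mirrors the condition in _initialize_scale_zero_point above
    if scale_dtype not in [torch.float16, torch.bfloat16, torch.float32, torch.float64]:
        scale_dtype = torch.float16
    return scale_dtype

print(resolve_scale_dtype(torch.float64))  # torch.float64 (previously fell back to float16)
print(resolve_scale_dtype(torch.int8))     # torch.float16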

src/compressed_tensors/quantization/quant_config.py

Lines changed: 1 addition & 5 deletions
@@ -22,9 +22,7 @@
     preset_name_to_scheme,
 )
 from compressed_tensors.quantization.utils import (
-    calculate_compression_ratio,
     is_module_quantized,
-    iter_named_quantizable_modules,
     module_type,
     parse_out_kv_cache_args,
 )
@@ -177,9 +175,7 @@ def from_pretrained(
         quantization_status = None
         ignore = {}
         quantization_type_names = set()
-        for name, submodule in iter_named_quantizable_modules(
-            model, include_children=True, include_attn=True
-        ):
+        for name, submodule in model.named_modules():
             layer_type = module_type(submodule)
             if not is_module_quantized(submodule):
                 if layer_type not in ignore:
