
Commit 3da59a0

Merge remote-tracking branch 'origin' into kylesayrs/transform_save
2 parents c6abb96 + b163bd9 commit 3da59a0

File tree

17 files changed (+236, -95 lines)


.github/actions/test/action.yml

Lines changed: 32 additions & 0 deletions
@@ -7,6 +7,10 @@ inputs:
   suitename:
     description: "test suite name"
     required: true
+  code_coverage:
+    description: whether to collect code coverage metrics during test run
+    type: boolean
+    default: false
 outputs:
   status:
     description: "final status from test"
@@ -44,9 +48,37 @@ runs:
       run: |
         source ${{ inputs.venv }}/bin/activate
         rm -rf src
+
+        if [[ "${ENABLE_COVERAGE}" == "true" ]]; then
+          echo "::group::Installing code coverage requirements via pip"
+          pip install bashlex https://github.com/neuralmagic/pytest-nm-releng/archive/v0.4.0.tar.gz
+          pip install coverage pytest-cov
+
+          # Adding Code coverage to the tests
+          nmre-generate-coverage-flags --package "compressed_tensors" --output-file ".coverage_flags.sh"
+          source .coverage_flags.sh
+          echo "::endgroup::"
+        fi
+
+        echo "::group::running tests"
+        echo "PYTEST_ADDOPTS set to: ${PYTEST_ADDOPTS}"
+
         SUCCESS=0
         pytest tests --junitxml=test-results/report.xml -o junit_suite_name="${{ inputs.suitename }}" || SUCCESS=$?
         echo "status=${SUCCESS}" >> "$GITHUB_OUTPUT"
+        echo "::endgroup::"
+
+        if [[ "${ENABLE_COVERAGE}" == "true" ]]; then
+          echo "::group::consolidating coverage reports"
+          mkdir -p coverage-results
+          mv .coverage coverage-results/ || echo ".coverage file not found"
+          mv coverage-html coverage-results/ || echo "coverage-html folder not found"
+          mv coverage.json coverage-results/ || echo "coverage.json file not found"
+          echo "::endgroup::"
+        fi
+
         deactivate
         exit ${SUCCESS}
       shell: bash
+      env:
+        ENABLE_COVERAGE: ${{ inputs.code_coverage || false }}

.github/workflows/test.yml

Lines changed: 17 additions & 0 deletions
@@ -25,6 +25,10 @@ on:
       run_id:
         description: run id of the BUILD job that generated the assets
         type: string
+      code_coverage:
+        description: whether to collect code coverage metrics during test run
+        type: boolean
+        default: false
 
   # makes workflow manually callable
   workflow_dispatch:
@@ -51,6 +55,10 @@ on:
       run_id:
         description: run id of the BUILD job that generated the assets
         type: string
+      code_coverage:
+        description: whether to collect code coverage metrics during test run
+        type: boolean
+        default: false
 
 jobs:
 
@@ -124,6 +132,7 @@ jobs:
         with:
           venv: ${{ steps.create_venv.outputs.penv }}
          suitename: test-${{ inputs.python }}-${{ inputs.test_label }}
+          code_coverage: ${{ inputs.code_coverage }}
 
       - name: summary
         uses: neuralmagic/nm-actions/actions/[email protected]
@@ -146,3 +155,11 @@ jobs:
           name: report-${{ inputs.test_label }}.xml
           path: test-results/report.xml
           retention-days: 5
+
+      - name: upload coverage report
+        uses: actions/upload-artifact@v4
+        if: (success() || failure()) && inputs.code_coverage
+        with:
+          name: coverage-results
+          path: coverage-results/*
+          retention-days: 5

src/compressed_tensors/compressors/model_compressors/model_compressor.py

Lines changed: 12 additions & 8 deletions
@@ -42,10 +42,7 @@
     load_pretrained_quantization_parameters,
 )
 from compressed_tensors.quantization.lifecycle import expand_target_names
-from compressed_tensors.quantization.utils import (
-    is_module_quantized,
-    iter_named_leaf_modules,
-)
+from compressed_tensors.quantization.utils import is_module_quantized
 from compressed_tensors.utils import (
     align_module_device,
     delete_offload_parameter,
@@ -393,9 +390,16 @@ def compress_model(self, model: Module):
         )
 
         for prefix, module in tqdm(model.named_modules(), desc="Compressing model"):
+
             if prefix in module_to_scheme or prefix in sparse_compression_targets:
+                module_device = get_execution_device(module).type
+                is_meta = (module_device == "meta")
+
+                exec_device = "meta" if is_meta else "cpu"
+                onloading_device = "meta" if is_meta else module_device
+
                 # in the future, support compression on same device
-                with align_module_device(module, execution_device="cpu"):
+                with align_module_device(module, execution_device=exec_device):
                     state_dict = module.state_dict(prefix=f"{prefix}.")
 
                     # quantization first
@@ -404,6 +408,7 @@ def compress_model(self, model: Module):
                         state_dict,
                         names_to_scheme=module_to_scheme,
                         show_progress=False,
+                        compression_device=exec_device,
                     )
 
                     # sparsity second
@@ -415,15 +420,14 @@ def compress_model(self, model: Module):
                     )
 
                 # remove any existing parameters
-                exec_device = get_execution_device(module)
                 offload_device = get_offloaded_device(module)
                 for name, _ in list(module.named_parameters()):
                     delete_offload_parameter(module, name)
 
                 # replace with compressed parameters
                 for name, value in state_dict.items():
                     name = name.removeprefix(f"{prefix}.")
-                    value = value.to(exec_device)
+                    value = value.to(onloading_device)
                     param = torch.nn.Parameter(value, requires_grad=False)
                     register_offload_parameter(module, name, param, offload_device)
 
@@ -747,7 +751,7 @@ def map_module_to_scheme(model: Module) -> Dict[str, QuantizationScheme]:
     """
     return {
         fix_fsdp_module_name(name): module.quantization_scheme
-        for name, module in iter_named_leaf_modules(model)
+        for name, module in model.named_modules()
         if is_module_quantized(module)
     }
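Note: the following is an illustrative sketch, not code from this commit. It isolates the device-selection logic that compress_model now uses; the library's get_execution_device helper is approximated by reading the first parameter's device, an assumption made only for this example.

import torch

def choose_devices(module: torch.nn.Module) -> tuple:
    # Modules already on the "meta" device are compressed and re-onloaded on
    # "meta"; everything else is compressed on "cpu" and onloaded back to its
    # original device, mirroring the compress_model change above.
    module_device = next(module.parameters()).device.type
    is_meta = module_device == "meta"
    exec_device = "meta" if is_meta else "cpu"
    onloading_device = "meta" if is_meta else module_device
    return exec_device, onloading_device

with torch.device("meta"):
    meta_layer = torch.nn.Linear(8, 8)

print(choose_devices(meta_layer))              # ('meta', 'meta')
print(choose_devices(torch.nn.Linear(8, 8)))   # ('cpu', 'cpu')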

src/compressed_tensors/compressors/quantized_compressors/base.py

Lines changed: 6 additions & 7 deletions
@@ -72,6 +72,7 @@ def compress(
         model_state: Dict[str, Tensor],
         names_to_scheme: Dict[str, QuantizationScheme],
         show_progress: bool = False,
+        compression_device: str = "cpu",
         **kwargs,
     ) -> Dict[str, Tensor]:
         """
@@ -85,7 +86,6 @@ def compress(
         """
         uncompressed_names = list(model_state.keys())
         compressed_dict = {}
-        save_device = "cpu"
 
         # compress values
         desc = "Compressing with quantization"
@@ -104,10 +104,10 @@ def compress(
 
                 # is scale does not exist, then weight cannot be compressed
                 if scale is None:
-                    compressed_dict[name] = value.to(save_device)
+                    compressed_dict[name] = value.to(compression_device)
                     continue
 
-                # compress values on cpu (memory movement too expensive)
+                # compress values on meta if loading from meta otherwise on cpu (memory movement too expensive)
                 module_path = prefix[:-1] if prefix.endswith(".") else prefix
                 quant_args = names_to_scheme[module_path].weights
                 compressed_values = self.compress_weight(
@@ -117,12 +117,12 @@ def compress(
                     global_scale=global_scale,
                     g_idx=g_idx,
                     quantization_args=quant_args,
-                    device="cpu",
+                    device=compression_device,
                 )
 
                 # update state dict
                 for key, value in compressed_values.items():
-                    compressed_dict[prefix + key] = value.to(save_device)
+                    compressed_dict[prefix + key] = value.to(compression_device)
 
             else:
                 # omit saving zero points for symmetric or packed quantization
@@ -133,8 +133,7 @@ def compress(
                 # TODO: does this case actually occur?
                 elif name.endswith("g_idx") and torch.any(value <= -1):
                     continue
-
-                compressed_dict[name] = value.to(save_device)
+                compressed_dict[name] = value.to(compression_device)
 
         return compressed_dict
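A small sketch (assumptions only, not from this commit) of why threading compression_device through compress() is enough to support meta-loaded models: .to("meta") preserves shape and dtype without allocating storage, so the compressed state dict can be assembled without touching real memory. The key name below is hypothetical, used only for illustration.

import torch

weight = torch.empty(128, 128, device="meta")
compression_device = "meta" if weight.is_meta else "cpu"

# hypothetical state-dict key, for illustration only
compressed_dict = {"layer.weight_packed": weight.to(compression_device)}
print(compressed_dict["layer.weight_packed"].device)  # meta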

src/compressed_tensors/compressors/quantized_compressors/pack_quantized.py

Lines changed: 22 additions & 18 deletions
@@ -220,30 +220,34 @@ def pack_to_int32(
     if num_bits < 1:
         raise ValueError(f"num_bits must be at least 1, got {num_bits}")
 
-    # convert to unsigned for packing
+    # Convert to unsigned range for packing, matching quantization offset
     offset = 1 << (num_bits - 1)
     value = (value + offset).to(torch.uint8)
-    value = value.cpu().numpy().astype(np.uint32)
+    device = value.device
+
     pack_factor = 32 // num_bits
 
-    # pad input tensor and initialize packed output
-    packed_size = math.ceil(value.shape[packed_dim] / pack_factor)
-    padding = packed_size * pack_factor - value.shape[packed_dim]
-    value = np.pad(value, pad_width=[(0, 0), (0, padding)], constant_values=0)
+    if packed_dim == 0:
+        value = value.transpose(0, 1)
 
-    # pack values
-    if packed_dim == 1:
-        packed = np.zeros((value.shape[0], packed_size), dtype=np.uint32)
-        for i in range(pack_factor):
-            packed |= value[:, i::pack_factor] << num_bits * i
-    else:
-        packed = np.zeros((packed_size, value.shape[1]), dtype=np.uint32)
-        for i in range(pack_factor):
-            packed |= value[i::pack_factor, :] << num_bits * i
+    rows, cols = value.shape
+    padded_cols = math.ceil(cols / pack_factor) * pack_factor
+    pad_len = padded_cols - cols
+
+    if pad_len > 0:
+        value = torch.nn.functional.pad(value, (0, pad_len))
+
+    num_groups = padded_cols // pack_factor
+
+    # Use int32 here
+    reshaped = value.view(rows, num_groups, pack_factor).to(torch.int32)
+    bit_shifts = torch.arange(pack_factor, device=device, dtype=torch.int32) * num_bits
+    packed = (reshaped << bit_shifts).sum(dim=2, dtype=torch.int32)
+
+    if packed_dim == 0:
+        packed = packed.transpose(0, 1)
 
-    # convert back to signed and torch
-    packed = np.ascontiguousarray(packed).view(np.int32)
-    return torch.from_numpy(packed)
+    return packed
 
 
 def unpack_from_int32(
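As a sanity sketch (not part of the commit), the vectorized shift-and-sum used by the new pack_to_int32 produces the same bit pattern as the removed per-slot OR loop, since each 4-bit value lands in a disjoint slot of the int32:

import torch

num_bits = 4
offset = 1 << (num_bits - 1)      # maps the signed range [-8, 7] onto [0, 15]
pack_factor = 32 // num_bits      # eight 4-bit values per int32

values = torch.tensor([[-8, -1, 0, 1, 2, 3, 6, 7]])   # one already-padded row
unsigned = (values + offset).to(torch.int32)

# vectorized packing, as in the updated implementation
shifts = torch.arange(pack_factor, dtype=torch.int32) * num_bits
packed = (unsigned << shifts).sum(dim=1, dtype=torch.int32)

# the old per-slot loop, re-expressed in torch, gives an identical result
expected = torch.zeros(1, dtype=torch.int32)
for i in range(pack_factor):
    expected |= unsigned[:, i] << (num_bits * i)

assert torch.equal(packed, expected)
print(packed.shape)  # torch.Size([1]): one int32 per row of eight 4-bit values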

src/compressed_tensors/compressors/sparse_compressors/sparse_24_bitmask.py

Lines changed: 19 additions & 5 deletions
@@ -56,8 +56,10 @@ def compress_weight(self, name, value):
         bitmask_tensor = Sparse24BitMaskTensor.from_dense(
             value, self.config.sparsity_structure
         )
-        bitmask_dict = bitmask_tensor.dict(name_prefix=name, device="cpu")
-        return bitmask_dict
+        return bitmask_tensor.dict(
+            name_prefix=name,
+            device="meta" if value.is_meta else "cpu",
+        )
 
     def decompress_weight(self, weight_data):
         data = Sparse24BitMaskTensor.from_compressed_data(**weight_data)
@@ -90,9 +92,14 @@ def from_dense(
         :return: instantiated compressed tensor
         """
         shape = list(tensor.shape)
-        compressed, bitmask = sparse24_bitmask_compress(
-            tensor.cpu(), sparsity_structure=sparsity_structure
-        )
+        if tensor.is_meta:
+            compressed, bitmask = sparse24_bitmask_compress(
+                tensor, sparsity_structure=sparsity_structure
+            )
+        else:
+            compressed, bitmask = sparse24_bitmask_compress(
+                tensor.cpu(), sparsity_structure=sparsity_structure
+            )
         return Sparse24BitMaskTensor(
             shape=shape,
             compressed=compressed,
@@ -169,6 +176,13 @@ def sparse24_bitmask_compress(
         SparsityStructure(sparsity_structure) == SparsityStructure.TWO_FOUR
     ), "Only 2:4 sparsity is supported"
 
+    if tensor.is_meta:
+        num_rows, num_cols = tensor.shape
+        compressed_values = torch.empty((num_rows, num_cols // 2), dtype=tensor.dtype, device="meta")
+        packed_cols = (num_cols + 7) // 8
+        bitmasks_packed = torch.empty((num_rows, packed_cols), dtype=torch.uint8, device="meta")
+        return compressed_values, bitmasks_packed
+
     bytemasks = get_24_bytemasks(tensor=tensor)
 
     if tensor.dtype == FP8_DTYPE:
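For reference, a minimal sketch (not from the commit) of the shape math behind the meta-tensor shortcut above: 2:4 sparsity keeps half of the values in each row, and the bitmask stores one bit per original element, packed into uint8 bytes.

import torch

num_rows, num_cols = 4, 16
compressed_values = torch.empty((num_rows, num_cols // 2), dtype=torch.bfloat16, device="meta")
bitmask = torch.empty((num_rows, (num_cols + 7) // 8), dtype=torch.uint8, device="meta")

print(compressed_values.shape)  # torch.Size([4, 8]): half the elements survive 2:4 pruning
print(bitmask.shape)            # torch.Size([4, 2]): ceil(16 / 8) bytes per row
print(bitmask.is_meta)          # True: only shapes and dtypes are tracked, no memory is allocated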

src/compressed_tensors/quantization/lifecycle/apply.py

Lines changed: 8 additions & 5 deletions
@@ -38,8 +38,6 @@
     KV_CACHE_TARGETS,
     infer_quantization_status,
     is_kv_cache_quant_scheme,
-    iter_named_leaf_modules,
-    iter_named_quantizable_modules,
 )
 from compressed_tensors.utils.helpers import fix_fsdp_module_name, replace_module
 from compressed_tensors.utils.offload import update_parameter_data
@@ -87,7 +85,7 @@ def load_pretrained_quantization_parameters(
     model_path = get_safetensors_folder(model_name_or_path)
     mapping = get_quantization_parameter_to_path_mapping(model_path)
 
-    for name, submodule in iter_named_leaf_modules(model):
+    for name, submodule in model.named_modules():
         if not is_module_quantized(submodule):
             continue
         if submodule.quantization_scheme.input_activations is not None:
@@ -152,7 +150,7 @@ def apply_quantization_config(
     # list of submodules to ignore
     ignored_submodules = defaultdict(list)
     # mark appropriate layers for quantization by setting their quantization schemes
-    for name, submodule in model.named_modules():  # child modules and attention modules
+    for name, submodule in model.named_modules():
         # potentially fix module name to remove FSDP wrapper prefix
         name = fix_fsdp_module_name(name)
         if matches := find_name_or_class_matches(name, submodule, config.ignore):
@@ -283,7 +281,7 @@ def expand_target_names(
     """
     return {
         name
-        for name, module in iter_named_leaf_modules(model)
+        for name, module in model.named_modules()
         if is_target(name, module, targets, ignore)
     }
 
@@ -324,6 +322,11 @@ def find_name_or_class_matches(
     2. matches on regex patterns
     3. matches on module names
     """
+    from compressed_tensors import InternalModule
+
+    if isinstance(module, InternalModule):
+        return []
+
     targets = sorted(targets, key=lambda x: ("re:" in x, x))
     if isinstance(targets, Iterable):
         matches = _find_matches(name, targets) + _find_matches(
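The replacement of the dedicated leaf-module iterators with plain named_modules() plus a filter can be sketched as follows (illustrative stand-in only; the attribute value assigned below is a placeholder marker, not a real QuantizationScheme):

import torch

def iter_quantized_modules(model: torch.nn.Module):
    # walk every submodule and keep only those carrying a quantization_scheme
    for name, submodule in model.named_modules():
        if getattr(submodule, "quantization_scheme", None) is not None:
            yield name, submodule

model = torch.nn.Sequential(torch.nn.Linear(4, 4), torch.nn.ReLU(), torch.nn.Linear(4, 2))
model[0].quantization_scheme = "placeholder-scheme"
print([name for name, _ in iter_quantized_modules(model)])  # ['0']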

src/compressed_tensors/quantization/lifecycle/initialize.py

Lines changed: 1 addition & 1 deletion
@@ -189,7 +189,7 @@ def _initialize_scale_zero_point(
     else:
         # TODO: consider erroring out in the future as if the dtype if not one of these,
         # there is likely bug
-        if scale_dtype not in [torch.float16, torch.bfloat16, torch.float32]:
+        if scale_dtype not in [torch.float16, torch.bfloat16, torch.float32, torch.float64]:
             scale_dtype = torch.float16
         zp_dtype = quantization_args.pytorch_dtype()
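A one-function sketch (an assumption for illustration, not library code) of the widened dtype check above: float64 scales are now kept as-is instead of being silently downcast to float16.

import torch

def resolve_scale_dtype(scale_dtype: torch.dtype) -> torch.dtype:
    # mirrors the condition in _initialize_scale_zero_point above
    if scale_dtype not in [torch.float16, torch.bfloat16, torch.float32, torch.float64]:
        scale_dtype = torch.float16
    return scale_dtype

print(resolve_scale_dtype(torch.float64))  # torch.float64 (previously fell back to float16)
print(resolve_scale_dtype(torch.int8))     # torch.float16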

src/compressed_tensors/quantization/quant_config.py

Lines changed: 1 addition & 5 deletions
@@ -22,9 +22,7 @@
     preset_name_to_scheme,
 )
 from compressed_tensors.quantization.utils import (
-    calculate_compression_ratio,
     is_module_quantized,
-    iter_named_quantizable_modules,
     module_type,
     parse_out_kv_cache_args,
 )
@@ -177,9 +175,7 @@ def from_pretrained(
         quantization_status = None
         ignore = {}
         quantization_type_names = set()
-        for name, submodule in iter_named_quantizable_modules(
-            model, include_children=True, include_attn=True
-        ):
+        for name, submodule in model.named_modules():
             layer_type = module_type(submodule)
             if not is_module_quantized(submodule):
                 if layer_type not in ignore:
