Commit 85419e2

Merge remote-tracking branch 'origin' into kylesayrs/transform_save

2 parents 4085613 + 5478b43

File tree: 31 files changed (+1328, −173 lines)

.github/actions/test/action.yml

Lines changed: 5 additions & 5 deletions
@@ -69,11 +69,11 @@ runs:
        echo "::endgroup::"

        if [[ "${ENABLE_COVERAGE}" == "true" ]]; then
-         echo "::group::consolidating coverage reports"
-         mkdir -p coverage-results
-         mv .coverage coverage-results/ || echo ".coverage file not found"
-         mv coverage-html coverage-results/ || echo "coverage-html folder not found"
-         mv coverage.json coverage-results/ || echo "coverage.json file not found"
+         echo "::group::check coverage reports"
+         if [ ! -d coverage-html ]; then
+           echo "ERROR: coverage-html folder not found"
+           exit 1
+         fi
          echo "::endgroup::"
        fi

.github/workflows/build-test.yml

Lines changed: 2 additions & 1 deletion
@@ -25,7 +25,7 @@ on:

       # test related parameters
       test_configs:
-        description: "python, label, timeout"
+        description: "python, label, timeout, etc"
         type: string
         required: true

@@ -53,6 +53,7 @@ jobs:
       python: ${{ matrix.test_config.python }}
       timeout: ${{ matrix.test_config.timeout }}
       whl: ${{ needs.BUILD.outputs.whl }}
+      code_coverage: ${{ matrix.test_config.code_coverage || false }}
       secrets: inherit

   UPLOAD:

.github/workflows/test.yml

Lines changed: 18 additions & 4 deletions
@@ -70,6 +70,10 @@ jobs:
     permissions:
       contents: 'read'
       id-token: 'write'
+      pages: 'write'
+    environment:
+      name: github-pages
+      url: ${{ steps.coverage.outputs.page_url }}

     steps:

@@ -134,6 +138,11 @@ jobs:
           suitename: test-${{ inputs.python }}-${{ inputs.test_label }}
           code_coverage: ${{ inputs.code_coverage }}

+      - name: extra info for summary
+        if: ${{ inputs.code_coverage }}
+        run: |
+          echo "EXTRA='Code Coverage: https://neuralmagic.github.io/compressed-tensors/'" >> $GITHUB_ENV
+
       - name: summary
         uses: neuralmagic/nm-actions/actions/[email protected]
         if: success() || failure()

@@ -143,6 +152,7 @@ jobs:
           python: ${{ inputs.python }}
           whl: ${{ inputs.whl }}
           test_status: ${{ steps.test.outputs.status }}
+          extra: ${{ env.EXTRA }}

       - name: copy results to GCP
         run: |

@@ -157,9 +167,13 @@ jobs:
           retention-days: 5

       - name: upload coverage report
-        uses: actions/upload-artifact@v4
-        if: (success() || failure()) && inputs.code_coverage
+        uses: actions/upload-pages-artifact@v3
+        if: ${{ inputs.code_coverage }}
         with:
-          name: coverage-results
-          path: coverage-results/*
+          path: coverage-html
           retention-days: 5
+
+      - name: deploy to Github Pages
+        id: coverage
+        uses: actions/deploy-pages@v4
+        if: ${{ inputs.code_coverage }}

.github/workflows/trigger-all.yml

Lines changed: 1 addition & 1 deletion
@@ -32,7 +32,7 @@ jobs:
       wf_category: ${{ inputs.wf_category || 'NIGHTLY' }}
       gitref: ${{ inputs.gitref || 'main' }}
       push_to_pypi: ${{ (github.event.schedule == '30 0 * * *') || inputs.push_to_pypi || false }}
-      test_configs: '[{"python":"3.11.4","label":"ubuntu-24.04","timeout":"40"},
+      test_configs: '[{"python":"3.11.4","label":"ubuntu-24.04","timeout":"40","code_coverage":true},
         {"python":"3.10.12","label":"ubuntu-22.04","timeout":"40"},
         {"python":"3.9.17","label":"k8s-h100-solo","timeout":"40"},
         {"python":"3.12.6","label":"k8s-a100-duo","timeout":"40"}]'

src/compressed_tensors/compressors/model_compressors/model_compressor.py

Lines changed: 18 additions & 8 deletions
@@ -392,15 +392,18 @@ def compress_model(self, model: Module):
         for prefix, module in tqdm(model.named_modules(), desc="Compressing model"):

             if prefix in module_to_scheme or prefix in sparse_compression_targets:
-                module_device = get_execution_device(module).type
-                is_meta = (module_device == "meta")
+                module_device = get_execution_device(module)
+                is_meta = module_device.type == "meta"

                 exec_device = "meta" if is_meta else "cpu"
                 onloading_device = "meta" if is_meta else module_device

                 # in the future, support compression on same device
                 with align_module_device(module, execution_device=exec_device):
-                    state_dict = module.state_dict(prefix=f"{prefix}.")
+                    state_dict = {
+                        f"{prefix}.{name}": param
+                        for name, param in module.named_parameters(recurse=False)
+                    }

                 # quantization first
                 if prefix in module_to_scheme:
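For context on the state_dict change: torch's state_dict() recurses into submodules (and returns detached tensors), while named_parameters(recurse=False) yields only the module's own Parameter objects. A minimal standalone sketch with toy modules (not the compressed-tensors API):

    import torch
    from torch import nn

    parent = nn.Linear(4, 4)
    parent.child = nn.Linear(4, 4)  # attaching a Module registers it as a submodule

    # state_dict() recurses: the child's tensors are swept in too
    print(sorted(parent.state_dict(prefix="layer.").keys()))
    # ['layer.bias', 'layer.child.bias', 'layer.child.weight', 'layer.weight']

    # named_parameters(recurse=False) stays local to this module
    print(sorted(f"layer.{n}" for n, _ in parent.named_parameters(recurse=False)))
    # ['layer.bias', 'layer.weight']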
@@ -421,7 +424,7 @@ def compress_model(self, model: Module):

                 # remove any existing parameters
                 offload_device = get_offloaded_device(module)
-                for name, _ in list(module.named_parameters()):
+                for name, _ in list(module.named_parameters(recurse=False)):
                     delete_offload_parameter(module, name)

                 # replace with compressed parameters

@@ -458,7 +461,10 @@ def decompress_model(self, model: Module):
             if prefix in module_to_scheme or prefix in sparse_compression_targets:
                 # in the future, support decompression on same device
                 with align_module_device(module, execution_device="cpu"):
-                    state_dict = module.state_dict(prefix=f"{prefix}.")
+                    state_dict = {
+                        f"{prefix}.{name}": param
+                        for name, param in module.named_parameters(recurse=False)
+                    }

                 # sparsity first
                 if prefix in sparse_compression_targets:

@@ -483,7 +489,7 @@ def decompress_model(self, model: Module):
                 # remove any existing parameters
                 exec_device = get_execution_device(module)
                 offload_device = get_offloaded_device(module)
-                for name, _ in list(module.named_parameters()):
+                for name, _ in list(module.named_parameters(recurse=False)):
                     delete_offload_parameter(module, name)

                 # replace with decompressed parameters

@@ -747,12 +753,16 @@ def _replace_weights(self, dense_weight_generator, model: Module):

 def map_module_to_scheme(model: Module) -> Dict[str, QuantizationScheme]:
     """
-    Returns a dictionary which maps quantized module names to their quantization schemes
+    Returns a dictionary which maps quantized module names to their quantization
+    schemes. Only includes modules with weight quantization
     """
     return {
         fix_fsdp_module_name(name): module.quantization_scheme
         for name, module in model.named_modules()
-        if is_module_quantized(module)
+        if (
+            hasattr(module, "quantization_scheme")
+            and module.quantization_scheme.weights is not None
+        )
     }
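The new predicate drops modules whose scheme has no weight quantization. A toy illustration of that filter, with SimpleNamespace standing in for the real QuantizationScheme:

    from types import SimpleNamespace
    from torch import nn

    model = nn.Sequential(nn.Linear(8, 8), nn.Linear(8, 8), nn.ReLU())
    # hypothetical schemes: weights=None means activation-only quantization
    model[0].quantization_scheme = SimpleNamespace(weights="W4A16", input_activations=None)
    model[1].quantization_scheme = SimpleNamespace(weights=None, input_activations="A8")

    mapping = {
        name: module.quantization_scheme
        for name, module in model.named_modules()
        if (
            hasattr(module, "quantization_scheme")
            and module.quantization_scheme.weights is not None
        )
    }
    print(list(mapping))  # ['0'] -- the activation-only module '1' is skipped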

src/compressed_tensors/compressors/sparse_compressors/sparse_24_bitmask.py

Lines changed: 6 additions & 2 deletions
@@ -178,9 +178,13 @@ def sparse24_bitmask_compress(

     if tensor.is_meta:
         num_rows, num_cols = tensor.shape
-        compressed_values = torch.empty((num_rows, num_cols // 2), dtype=tensor.dtype, device="meta")
+        compressed_values = torch.empty(
+            (num_rows, num_cols // 2), dtype=tensor.dtype, device="meta"
+        )
         packed_cols = (num_cols + 7) // 8
-        bitmasks_packed = torch.empty((num_rows, packed_cols), dtype=torch.uint8, device="meta")
+        bitmasks_packed = torch.empty(
+            (num_rows, packed_cols), dtype=torch.uint8, device="meta"
+        )
         return compressed_values, bitmasks_packed

     bytemasks = get_24_bytemasks(tensor=tensor)
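The meta-device fast path only needs the output shapes to be right: 2:4 sparsity keeps 2 of every 4 values, so half the columns survive, and the bitmask stores one bit per column packed 8 to a uint8 byte. A quick check of that arithmetic (hypothetical sizes):

    import torch

    weight = torch.empty((128, 4096), dtype=torch.bfloat16, device="meta")
    num_rows, num_cols = weight.shape

    # 2:4 sparsity -> half the columns remain
    compressed_values = torch.empty(
        (num_rows, num_cols // 2), dtype=weight.dtype, device="meta"
    )
    # ceil division packs 8 mask bits per byte
    packed_cols = (num_cols + 7) // 8
    bitmask = torch.empty((num_rows, packed_cols), dtype=torch.uint8, device="meta")

    print(compressed_values.shape, bitmask.shape)
    # torch.Size([128, 2048]) torch.Size([128, 512])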

src/compressed_tensors/quantization/lifecycle/forward.py

Lines changed: 69 additions & 2 deletions
@@ -111,11 +111,22 @@ def dequantize(
     elif scale.ndim == 2:
         if scale.shape[1] == 1:
             args = QuantizationArgs(strategy=QuantizationStrategy.CHANNEL)
-        else:
+        # Scale height matches input or is 1 -> group quantization across columns
+        #
+        # Example 1: scale.shape[0] == 1
+        #   x_q: (4, 8), scale: (1, 4) -> 2 columns per group
+        #
+        # Example 2: scale.shape[0] == x_q.shape[0]
+        #   x_q: (4, 8), scale: (4, 4) -> 2 elements per group (per row)
+        elif (scale.shape[0] == 1) or (scale.shape[0] == x_q.shape[0]):
             group_size = int(x_q.shape[1] / scale.shape[1])
             args = QuantizationArgs(
                 strategy=QuantizationStrategy.GROUP, group_size=group_size
             )
+        else:
+            args = QuantizationArgs(
+                strategy=QuantizationStrategy.BLOCK, block_structure=scale.shape
+            )
     else:
         raise ValueError(
             f"Could not infer a quantization strategy from scale with {scale.ndim} "
@@ -189,7 +200,63 @@ def _process_quantization(
     q_min, q_max = calculate_range(args, x.device)
     group_size = args.group_size

-    if args.strategy in (QuantizationStrategy.GROUP, QuantizationStrategy.TENSOR_GROUP):
+    # blockwise FP8: quantize per 2D block, supports block_structure for static block quant
+    if args.strategy == QuantizationStrategy.BLOCK:
+        original_shape = x.shape
+        rows, cols = x.shape[-2], x.shape[-1]
+        block_height, block_width = args.block_structure
+
+        # Ensure exact division (tensor dimensions must be divisible by block size)
+        if rows % block_height != 0:
+            raise ValueError(
+                f"Tensor height {rows} is not divisible by block_height {block_height}. "
+                f"Block quantization requires exact division."
+            )
+        if cols % block_width != 0:
+            raise ValueError(
+                f"Tensor width {cols} is not divisible by block_width {block_width}. "
+                f"Block quantization requires exact division."
+            )
+
+        # reshape into blocks and transpose to make each block contiguous
+        num_rows_blocks = rows // block_height
+        num_cols_blocks = cols // block_width
+        x_blocks = x.reshape(
+            num_rows_blocks,
+            block_height,
+            num_cols_blocks,
+            block_width,
+        ).transpose(1, 2)
+
+        # expand scale/zero_point for blocks
+        sb = scale.unsqueeze(-1).unsqueeze(-1)
+        zb = zero_point.unsqueeze(-1).unsqueeze(-1) if zero_point is not None else None
+        if do_quantize:
+            # quantize blocks
+            x_blocks = _quantize(
+                x=x_blocks,
+                scale=sb,
+                zero_point=zb,
+                q_min=q_min,
+                q_max=q_max,
+                args=args,
+                dtype=dtype,
+                global_scale=global_scale,
+            )
+        if do_dequantize:
+            # dequantize blocks
+            x_blocks = _dequantize(
+                x_q=x_blocks,
+                scale=sb,
+                zero_point=zb,
+                global_scale=global_scale,
+            )
+        # restore original shape
+        output = x_blocks.transpose(1, 2).reshape(original_shape)
+    elif args.strategy in (
+        QuantizationStrategy.GROUP,
+        QuantizationStrategy.TENSOR_GROUP,
+    ):
         n_dims = x.shape
         if len(n_dims) > 2:
             x = x.squeeze(0)
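A minimal sketch of the reshape/transpose round trip this hunk relies on, with plain scale division standing in for _quantize/_dequantize (hypothetical values; the real helpers also handle zero points, clamping, and dtype casts):

    import torch

    x = torch.arange(32.0).reshape(4, 8)            # rows=4, cols=8
    block_height, block_width = 2, 4                # -> 2 x 2 grid of blocks
    scale = torch.tensor([[0.5, 1.0], [2.0, 4.0]])  # one scale per block

    # (4, 8) -> (2, 2, 2, 4): dims are (row_block, col_block, block_height, block_width)
    x_blocks = x.reshape(2, block_height, 2, block_width).transpose(1, 2)
    sb = scale.unsqueeze(-1).unsqueeze(-1)          # (2, 2, 1, 1), broadcasts per block

    x_q = torch.round(x_blocks / sb)                # stand-in for _quantize
    x_dq = x_q * sb                                 # stand-in for _dequantize

    out = x_dq.transpose(1, 2).reshape(4, 8)        # restore original layout
    assert out.shape == x.shape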

src/compressed_tensors/quantization/lifecycle/initialize.py

Lines changed: 37 additions & 2 deletions
@@ -15,6 +15,7 @@

 import logging
 import math
+import warnings
 from enum import Enum
 from typing import List, Optional

@@ -172,14 +173,43 @@ def _initialize_scale_zero_point(

     if base_name == "weight" and weight_shape is not None:
         if quantization_args.strategy == QuantizationStrategy.CHANNEL:
-            # (output_channels, 1)
+            # (output_channels, 1) - only for weights
             expected_shape = (weight_shape[0], 1)
         elif quantization_args.strategy in (
             QuantizationStrategy.TENSOR_GROUP,
             QuantizationStrategy.GROUP,
         ):
+            # GROUP/TENSOR_GROUP for both weights and activations
             num_groups = math.ceil(weight_shape[1] / quantization_args.group_size)
             expected_shape = (weight_shape[0], max(num_groups, 1))
+        elif quantization_args.strategy == QuantizationStrategy.BLOCK:
+            # For block quantization, scale shape should match number of blocks - only for weights
+            if quantization_args.block_structure is None:
+                raise ValueError(
+                    "Block quantization requires block_structure to be specified"
+                )
+            block_height, block_width = quantization_args.block_structure
+            rows, cols = weight_shape[-2], weight_shape[-1]
+            num_rows_blocks = math.ceil(rows / block_height)
+            num_cols_blocks = math.ceil(cols / block_width)
+
+            # Warn if dimensions don't divide evenly
+            if rows % block_height != 0 or cols % block_width != 0:
+                warnings.warn(
+                    f"Block quantization: tensor shape {weight_shape} does not divide evenly "
+                    f"by block structure {quantization_args.block_structure}. "
+                    f"Some blocks will be incomplete which may affect quantization quality.",
+                    UserWarning,
+                )
+
+            expected_shape = (num_rows_blocks, num_cols_blocks)
+    elif quantization_args.strategy == QuantizationStrategy.BLOCK:
+        warnings.warn(
+            f"BLOCK quantization not supported for {base_name} activations. "
+            f"Falling back to tensor-level quantization.",
+            UserWarning,
+        )
+        expected_shape = 1

     # 3. Identify quantization scale and zp dtype
     scale_dtype = scale_dtype if scale_dtype is not None else module.weight.dtype
@@ -189,7 +219,12 @@ def _initialize_scale_zero_point(
     else:
         # TODO: consider erroring out in the future as if the dtype if not one of these,
         # there is likely bug
-        if scale_dtype not in [torch.float16, torch.bfloat16, torch.float32, torch.float64]:
+        if scale_dtype not in [
+            torch.float16,
+            torch.bfloat16,
+            torch.float32,
+            torch.float64,
+        ]:
             scale_dtype = torch.float16
         zp_dtype = quantization_args.pytorch_dtype()