
Commit 023159a

add hf unified ckpt export for sparse attention

Signed-off-by: Kai Xu <[email protected]>
Parent: 9c15dbc

File tree: 10 files changed, +1099 -11 lines

examples/llm_sparse_attention/README.md

Lines changed: 398 additions & 0 deletions
Large diffs are not rendered by default.

Lines changed: 4 additions & 0 deletions

@@ -0,0 +1,4 @@
+accelerate
+datasets
+transformers
+
modelopt/torch/export/unified_export_hf.py

Lines changed: 128 additions & 0 deletions

@@ -338,6 +338,129 @@ def _export_quantized_weight(
     sub_module.register_buffer(quantizer_attrs.weight_scale, weight_scale)
 
 
+def _get_sparse_attention_config(model: nn.Module) -> dict[str, Any]:
+    """Extract sparse attention configuration from model for export.
+
+    Args:
+        model: Model with sparse attention modules
+
+    Returns:
+        Dictionary with sparse attention config in format:
+        {
+            "config_groups": {
+                "group_0": {
+                    "sparse_algo": "softmax_skip",
+                    "threshold": 1e-4,  # only if not calibrated
+                    "targets": ["LlamaAttention"]
+                }
+            },
+            "threshold_scale_factor": 0.001234,  # global, if calibrated
+            "target_sparsity": 0.5,  # global, if calibrated
+            "producer": {"name": "modelopt", "version": "..."}
+        }
+    """
+    from modelopt import __version__
+    from modelopt.torch.sparsity.attention_sparsity.nn.sparse_attention import SparseAttentionModule
+
+    # Collect all enabled sparse attention modules
+    sparse_modules = []
+    for name, module in model.named_modules():
+        if isinstance(module, SparseAttentionModule) and module.is_enabled:
+            sparse_modules.append((name, module))
+
+    if not sparse_modules:
+        return {}
+
+    sparse_config = {
+        "config_groups": {},
+        "producer": {
+            "name": "modelopt",
+            "version": __version__,
+        },
+    }
+
+    # Check first module for global calibration parameters
+    # (all modules share the same calibration parameters)
+    first_module = sparse_modules[0][1]
+    method_instance = first_module._sparse_method_instance
+    threshold_scale_factor = getattr(method_instance, "threshold_scale_factor", None)
+
+    if threshold_scale_factor is not None:
+        # Model was calibrated: add global calibration parameters
+        sparse_config["threshold_scale_factor"] = float(threshold_scale_factor)
+
+        target_sparsity = getattr(method_instance, "target_sparsity", None)
+        if target_sparsity is not None:
+            sparse_config["target_sparsity"] = float(target_sparsity)
+
+    # Group modules by configuration
+    # Key: (sparse_algo, threshold_repr), Value: group config with a set of target class names
+    config_to_targets = {}
+
+    for name, module in sparse_modules:
+        method_instance = module._sparse_method_instance
+
+        # Extract sparse algorithm name from method name,
+        # e.g., "flash_softmax_skip" -> "softmax_skip"
+        method_name = method_instance.name
+        if method_name.startswith("flash_"):
+            sparse_algo = method_name[6:]  # Remove "flash_" prefix
+        else:
+            sparse_algo = method_name
+
+        # Get the module's original class name for targets,
+        # i.e., the class name before SparseAttentionModule wrapping
+        original_cls = module.get_original_cls_by_level(level=0)
+        target_class_name = original_cls.__name__
+
+        # Build config key for grouping
+        if threshold_scale_factor is None:
+            # Not calibrated: include threshold in grouping
+            threshold_config = getattr(method_instance, "threshold_config", None)
+            if isinstance(threshold_config, dict):
+                # Convert dict to tuple for hashable key
+                threshold_repr = tuple(sorted(threshold_config.items()))
+            else:
+                threshold_repr = threshold_config
+        else:
+            # Calibrated: no threshold in per-layer config
+            threshold_repr = None
+
+        config_key = (sparse_algo, threshold_repr)
+
+        if config_key not in config_to_targets:
+            config_to_targets[config_key] = {
+                "sparse_algo": sparse_algo,
+                "threshold_config": threshold_config if threshold_scale_factor is None else None,
+                "targets": set(),
+            }
+
+        config_to_targets[config_key]["targets"].add(target_class_name)
+
+    # Convert grouped configs to config_groups format
+    for group_idx, ((sparse_algo, threshold_repr), group_data) in enumerate(
+        config_to_targets.items()
+    ):
+        group_name = f"group_{group_idx}"
+        group_config = {
+            "sparse_algo": group_data["sparse_algo"],
+            "targets": sorted(group_data["targets"]),
+        }
+
+        # Add threshold only if not calibrated
+        if group_data["threshold_config"] is not None:
+            threshold_config = group_data["threshold_config"]
+            if isinstance(threshold_config, dict):
+                # Convert to JSON-serializable format
+                group_config["threshold"] = {k: float(v) for k, v in threshold_config.items()}
+            else:
+                group_config["threshold"] = float(threshold_config)
+
+        sparse_config["config_groups"][group_name] = group_config
+
+    return sparse_config
+
+
 def _export_hf_checkpoint(
     model: nn.Module, dtype: torch.dtype | None = None
 ) -> tuple[dict[str, Any], dict[str, Any]]:
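
For reference, here is a minimal sketch of the dictionary this helper returns for an uncalibrated Llama-style model whose attention layers share a single softmax_skip threshold (values are illustrative, following the docstring above; a calibrated model would instead drop the per-group "threshold" and carry top-level "threshold_scale_factor" and "target_sparsity" keys):

    # Illustrative output of _get_sparse_attention_config(); the version string
    # is whatever modelopt.__version__ reports at export time.
    {
        "config_groups": {
            "group_0": {
                "sparse_algo": "softmax_skip",
                "threshold": 1e-4,  # present only because no calibration ran
                "targets": ["LlamaAttention"],
            }
        },
        "producer": {"name": "modelopt", "version": "..."},
    }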
@@ -543,6 +666,11 @@ def export_hf_checkpoint(
 
     config_data["quantization_config"] = hf_quant_config
 
+    # Add sparse attention config if model has sparse attention
+    sparse_attention_config = _get_sparse_attention_config(model)
+    if sparse_attention_config:
+        config_data["sparse_attention_config"] = sparse_attention_config
+
     with open(original_config, "w") as file:
         json.dump(config_data, file, indent=4)
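
A usage sketch for the exporter side (hedged: the export_dir and dtype arguments are assumed from the _export_hf_checkpoint signature above and common ModelOpt export usage, and the model is assumed to already contain enabled sparse attention modules):

    import torch
    from modelopt.torch.export import export_hf_checkpoint

    # Writes the HF checkpoint; when enabled sparse attention modules are
    # present, the saved config.json also gains a "sparse_attention_config" entry.
    export_hf_checkpoint(model, dtype=torch.bfloat16, export_dir="sparse_attn_export")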

modelopt/torch/sparsity/attention_sparsity/calibration/calibrate.py

Lines changed: 1 addition & 0 deletions

@@ -172,5 +172,6 @@ def calibrate_sparse_attention(
 
     for module_name, module in sparse_modules:
         module._sparse_method_instance.threshold_scale_factor = scale_factor
+        module._sparse_method_instance.target_sparsity = calib_config.target_sparse_ratio
 
     return {"calibration_results": {name: calibration_result for name, _ in sparse_modules}}
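
Because both values now live on the method instance, they can be read back after calibration; a small inspection sketch (import path as in the export diff above):

    from modelopt.torch.sparsity.attention_sparsity.nn.sparse_attention import (
        SparseAttentionModule,
    )

    # All calibrated modules share the same global pair of values.
    for name, module in model.named_modules():
        if isinstance(module, SparseAttentionModule) and module.is_enabled:
            inst = module._sparse_method_instance
            print(name, inst.threshold_scale_factor, inst.target_sparsity)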

modelopt/torch/sparsity/attention_sparsity/calibration/calibrator.py

Lines changed: 2 additions & 2 deletions

@@ -140,9 +140,9 @@ def calibrate(self, model: nn.Module, forward_loop: Callable) -> dict[str, Any]:
 
         print(f"Collected statistics for {len(self.sparsity_results)} samples")
 
-        # Stage 2: Find optimal threshold for each sample and compute 'a'
+        # Stage 2: Find optimal threshold for each sample and compute scale factor
         print(
-            f"\nStage 2: Finding 'a' parameter for target sparsity {self.target_sparse_ratio:.2f}"
+            f"\nStage 2: Finding threshold scale factor for target sparsity {self.target_sparse_ratio:.2f}"
         )
 
         # Find optimal threshold for each sample

modelopt/torch/sparsity/attention_sparsity/config.py

Lines changed: 3 additions & 3 deletions

@@ -279,9 +279,9 @@ def validate_num_length_bins(cls, v):
         "backend": "pytorch",  # Only pytorch backend supported
         "enable": True,
         "calibration": {
-            "target_sparse_ratio": 0.5,
-            "samples": 120,
-            "max_seqlen": 8192,
+            "target_sparse_ratio": 0.3,
+            "samples": 12,
+            "max_seqlen": 1024,
         },
     },
     "default": {"enable": False},

modelopt/torch/sparsity/attention_sparsity/conversion.py

Lines changed: 21 additions & 2 deletions

@@ -213,12 +213,23 @@ def restore_sparse_attention_state(model: nn.Module, state_dict: dict[str, Any])
         module._method = module_state["method"]
         if "method_config" in module_state:
             # Restore config attributes
+            # Separate method instance attributes from module attributes
+            method_instance_attrs = {"threshold_scale_factor", "target_sparsity"}
+
             for key, val in module_state["method_config"].items():
-                setattr(module, f"_{key}", val)
+                if key not in method_instance_attrs:
+                    # Set on module
+                    setattr(module, f"_{key}", val)
 
         # Re-setup with restored config
         module._setup()
 
+        # Restore method instance attributes after _setup
+        if "method_config" in module_state:
+            for key, val in module_state["method_config"].items():
+                if key in {"threshold_scale_factor", "target_sparsity"}:
+                    setattr(module._sparse_method_instance, key, val)
+
 
 def update_sparse_attention_metadata(
     model: nn.Module, config: SparseAttentionConfig, metadata: MetadataDict
@@ -243,8 +254,16 @@ def update_sparse_attention_metadata(
             if k.startswith("_") and k not in ("_method", "_enabled", "_sparse_method_instance")
         }
 
+        # Also collect calibration-related attributes from method instance
+        method_instance = module._sparse_method_instance
+        for attr in ["threshold_scale_factor", "target_sparsity"]:
+            if hasattr(method_instance, attr):
+                val = getattr(method_instance, attr)
+                if val is not None:
+                    method_config[attr] = val
+
         module_state = {
-            "method": module._sparse_method_instance.name,
+            "method": method_instance.name,
             "method_config": method_config,
        }
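
Taken together, a saved module_state entry for a calibrated module would look roughly like this (values illustrative; plain keys are restored onto the module as "_<key>", while the two calibration keys are set on the method instance after _setup()):

    module_state = {
        "method": "flash_softmax_skip",
        "method_config": {
            "threshold": 1e-4,                   # restored as module._threshold
            "threshold_scale_factor": 0.001234,  # restored on the method instance
            "target_sparsity": 0.5,              # restored on the method instance
        },
    }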

tests/_test_utils/torch_sparsity/sparse_attention_common.py

Lines changed: 4 additions & 4 deletions

@@ -93,13 +93,13 @@ def get_input(cls, d_model=128, seq_len=10, batch_size=2):
 # Test configurations
 FLASH_SOFTMAX_SKIP_DEFAULT_CFG = {
     "method": "flash_softmax_skip",
-    "sparse_cfg": {"*attention*": {"threshold": 1e-4, "br": 128, "bc": 128, "enable": True}},
+    "sparse_cfg": {"*attn*": {"threshold": 1e-4, "br": 128, "bc": 128, "enable": True}},
 }
 
 FLASH_SOFTMAX_SKIP_PHASE_AWARE_CFG = {
     "method": "flash_softmax_skip",
     "sparse_cfg": {
-        "*attention*": {
+        "*attn*": {
             "threshold": {"prefill": 1e-3, "decode": 1e-5},
             "br": 128,
             "bc": 128,
@@ -112,7 +112,7 @@ def get_input(cls, d_model=128, seq_len=10, batch_size=2):
     "method": "flash_softmax_skip",
     "collect_stats": True,
     "sparse_cfg": {
-        "*attention*": {
+        "*attn*": {
             "threshold": 1e-4,
             "br": 128,
             "bc": 128,
@@ -125,7 +125,7 @@ def get_input(cls, d_model=128, seq_len=10, batch_size=2):
 FLASH_SOFTMAX_SKIP_CALIBRATION_CFG = {
     "method": "flash_softmax_skip",
     "sparse_cfg": {
-        "*attention*": {
+        "*attn*": {
             "br": 128,
             "bc": 128,
             "enable": True,