# Sparse Attention for Large Language Models

This example demonstrates how to apply sparse attention optimization to Large Language Models (LLMs) using TensorRT-Model-Optimizer's attention sparsity module.

## Overview

Sparse attention reduces the computational complexity of the attention mechanism by selectively computing only the most important attention scores. This can significantly speed up inference and reduce memory usage, especially for long sequences.

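As a purely conceptual illustration (not ModelOpt's internal implementation), the softmax-skip idea can be sketched in a few lines of PyTorch: post-softmax attention scores below a threshold are treated as zero, so their contributions can be skipped:

```python
# Conceptual sketch of threshold-based attention sparsity (softmax skip).
# Illustration only: ModelOpt applies this inside the model's attention modules.
import torch

def softmax_skip_attention(q, k, v, threshold=1e-4):
    scale = q.shape[-1] ** -0.5
    probs = torch.softmax(q @ k.transpose(-2, -1) * scale, dim=-1)
    keep = probs >= threshold  # keep only the "important" scores
    sparsity = 1.0 - keep.float().mean().item()
    return (probs * keep) @ v, sparsity

q, k, v = (torch.randn(1, 8, 1024, 64) for _ in range(3))
out, sparsity = softmax_skip_attention(q, k, v)
print(f"Fraction of attention scores skipped: {sparsity:.1%}")
```
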
## Features

- **Sparse Attention Method**:
  - Softmax Skip: Threshold-based masking for efficient attention computation
  - Extensible architecture: Easy to add new sparse attention methods in the future
- **Calibration Support**: Automatically find optimal sparsity parameters
- **HuggingFace Integration**: Works with any HuggingFace transformer model
- **Composable**: Can be combined with quantization and other optimizations

## Installation

```bash
pip install nvidia-modelopt transformers torch
```

## Quick Start

```python
import modelopt.torch.sparsity as mts  # Similar to mtq for quantization
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-8B")
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-8B")

# Define sparse attention config
sparse_config = {
    "method": "softmax_skip",
    "sparse_cfg": {
        "*attn*": {"threshold": 1e-4, "enable": True},
        "default": {"enable": False},
    },
}

# Apply sparse attention
sparse_model = mts.attention_sparsity.sparsify(model, config=sparse_config)

# Use the model as usual
input_ids = tokenizer("Sparse attention helps with long contexts because", return_tensors="pt").input_ids
output = sparse_model.generate(input_ids, max_new_tokens=100)
```

### Command Line Usage

The `hf_spar_attn.py` script supports two modes:

```bash
# Basic mode: Apply sparse attention and test generation
python hf_spar_attn.py --mode basic --model_name Qwen/Qwen3-8B --show_stats

# Export mode: Export to unified HuggingFace checkpoint
python hf_spar_attn.py --mode export --model_name Qwen/Qwen3-8B --export_dir ./sparse_model
```

## Examples

### 1. Basic Usage (--mode basic)

Apply sparse attention to a model and test generation quality:

```bash
python hf_spar_attn.py --mode basic \
    --model_name Qwen/Qwen3-8B \
    --sparse_attn softmax_skip \
    --show_stats \
    --benchmark
```

Options for basic mode:

- `--show_stats`: Display sparsity statistics for each attention layer
- `--benchmark`: Compare performance before and after sparse attention
- `--show_memory`: Display GPU memory usage

### 2. Export Model (--mode export)

Export the sparse attention model to the unified HuggingFace checkpoint format for deployment:

```bash
python hf_spar_attn.py --mode export \
    --model_name Qwen/Qwen3-8B \
    --sparse_attn softmax_skip \
    --export_dir ./sparse_model
```

The exported model will contain:

- Model weights with sparse attention applied
- `config.json` with a `sparse_attention_config` section
- Tokenizer files

### Exported Config Format

The `config.json` includes a `sparse_attention_config` section using the `config_groups` pattern (similar to `quantization_config`):

**For calibrated models:**

```json
{
  "sparse_attention_config": {
    "config_groups": {
      "group_0": {
        "sparse_algo": "softmax_skip",
        "targets": ["LlamaAttention"]
      }
    },
    "threshold_scale_factor": 437.7,
    "target_sparsity": 0.3,
    "producer": {
      "name": "modelopt",
      "version": "0.37.0"
    }
  }
}
```

**For non-calibrated models:**

```json
{
  "sparse_attention_config": {
    "config_groups": {
      "group_0": {
        "sparse_algo": "softmax_skip",
        "threshold": 0.0001,
        "targets": ["LlamaAttention"]
      }
    },
    "producer": {
      "name": "modelopt",
      "version": "0.37.0"
    }
  }
}
```

This format enables inference engines to reconstruct the sparse attention configuration from the checkpoint.

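As a quick sanity check after export, the embedded section can be read back from the checkpoint's `config.json` with only the standard library (the field names follow the examples above; the `./sparse_model` path matches the export command earlier in this README):

```python
# Inspect the sparse attention section of an exported checkpoint's config.json.
import json
from pathlib import Path

cfg = json.loads(Path("./sparse_model/config.json").read_text())
sparse_cfg = cfg.get("sparse_attention_config", {})

for name, group in sparse_cfg.get("config_groups", {}).items():
    print(name, group.get("sparse_algo"), group.get("targets"), group.get("threshold"))

# Present only for calibrated checkpoints
if "threshold_scale_factor" in sparse_cfg:
    print("calibrated:", sparse_cfg["threshold_scale_factor"], sparse_cfg.get("target_sparsity"))
```
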
## Configuration Options

### Pre-defined Configuration

ModelOpt provides a unified configuration that supports both simple and phase-aware thresholds:

```python
import modelopt.torch.sparsity as mts

# The default config supports phase-aware thresholds
SOFTMAX_SKIP_CFG = {
    "method": "softmax_skip",
    "sparse_cfg": {
        "*attn*": {
            "threshold": {
                "prefill": 1e-3,  # More aggressive during prefill
                "decode": 1e-5,  # Conservative during decode
            },
            "enable": True,
        },
        "default": {"enable": False},
    },
}

# Use the config
model = mts.attention_sparsity.sparsify(model, config=SOFTMAX_SKIP_CFG)
```
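
The intuition behind the two phases: prefill processes the whole prompt in a single pass, where a more aggressive threshold is usually tolerable, while decode generates one token at a time and is typically more sensitive. Conceptually, resolving a phase-aware threshold is just a per-phase lookup; here is a minimal standalone sketch of that idea (not ModelOpt's internal code):

```python
# Minimal sketch: resolve a threshold that may be a single float or a per-phase dict.
def resolve_threshold(threshold_cfg, phase):
    if isinstance(threshold_cfg, dict):
        return threshold_cfg[phase]  # e.g. {"prefill": 1e-3, "decode": 1e-5}
    return threshold_cfg  # a single float applies to all phases

assert resolve_threshold({"prefill": 1e-3, "decode": 1e-5}, "decode") == 1e-5
assert resolve_threshold(1e-4, "prefill") == 1e-4
```
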

### Custom Configuration

You can create custom configurations with simple or phase-aware thresholds:

```python
# Simple threshold (same for all phases)
simple_config = {
    "method": "softmax_skip",
    "sparse_cfg": {
        "*attn*": {
            "threshold": 1e-4,  # Single threshold for all phases
            "enable": True,
        },
        "default": {"enable": False},
    },
}

# Phase-aware threshold
phase_aware_config = {
    "method": "softmax_skip",
    "sparse_cfg": {
        "*attn*": {
            "threshold": {
                "prefill": 1e-3,  # Prefill phase
                "decode": 1e-5,  # Decode phase
            },
            "enable": True,
        },
        "default": {"enable": False},
    },
}
```

### Adding Custom Methods

The architecture is designed to make it easy to add new sparse attention methods. Refer to the [`FlashSoftmaxSkipMethod`](../../modelopt/torch/sparsity/attention_sparsity/methods/flash_softmax_skip.py) source code for an example of how a method is implemented.

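As a rough, hypothetical sketch of what a new method boils down to (the class and method names below are illustrative assumptions, not the real ModelOpt interface; the linked source file defines the actual one), the core of a method is a rule that decides which attention scores to keep:

```python
# Hypothetical sketch only -- see flash_softmax_skip.py for the real interface.
import torch

class TopKSkipMethod:
    """Illustrative custom method: keep only the top-k scores per query."""

    def __init__(self, k: int = 64):
        self.k = k

    def sparsify_probs(self, probs: torch.Tensor) -> torch.Tensor:
        # probs: (..., q_len, kv_len) post-softmax attention probabilities
        k = min(self.k, probs.shape[-1])
        topk = torch.topk(probs, k, dim=-1)
        mask = torch.zeros_like(probs).scatter_(-1, topk.indices, 1.0)
        return probs * mask
```
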
### Pattern-Based Configuration

Apply different configurations to different layers:

```python
config = {
    "sparse_cfg": {
        "*layers.[0-12].*attention*": {"enable": True, "threshold": 1e-3},  # More aggressive for early layers
        "*layers.[13-24].*attention*": {"enable": True, "threshold": 1e-4},  # Conservative for later layers
    }
}
```

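Before committing to a pattern, it can help to preview which modules it actually selects. The sketch below assumes patterns are Unix-style wildcards matched against module names (an assumption; check the ModelOpt documentation for the exact matching rules). Note that a bracket expression such as `[0-12]` is a character class matching a single character (0, 1, or 2), not the range 0 through 12:

```python
# Preview which module names a wildcard pattern would select (assumes fnmatch-style matching).
from fnmatch import fnmatch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-8B")
pattern = "*layers.[0-12].*attn*"  # [0-12] matches a single character: 0, 1, or 2
matched = [name for name, _ in model.named_modules() if fnmatch(name, pattern)]
print(f"{len(matched)} modules match, e.g. {matched[:3]}")
```
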

## Performance Considerations

1. **Threshold Tuning**:
   - Lower thresholds (e.g., 1e-5) preserve more accuracy but yield less sparsity
   - Higher thresholds (e.g., 1e-3) provide more sparsity but may impact accuracy
   - Use calibration to find optimal values (see the standalone sweep after this list for intuition)

2. **Memory Usage**:
   - Sparse attention reduces peak memory usage during inference
   - Especially beneficial for long sequences (>1024 tokens)

3. **Model Compatibility**:
   - Works best with models using the eager attention implementation
   - Compatible with all HuggingFace transformer models

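For intuition on the threshold trade-off from item 1, the standalone sweep below measures what fraction of post-softmax scores fall under different thresholds on random inputs; it is illustrative only and is not ModelOpt's calibration procedure:

```python
# Illustrative threshold sweep on random attention scores (not ModelOpt calibration).
import torch

torch.manual_seed(0)
scores = torch.randn(1, 8, 512, 512)  # (batch, heads, q_len, kv_len)
probs = torch.softmax(scores, dim=-1)

for threshold in (1e-5, 1e-4, 1e-3):
    sparsity = (probs < threshold).float().mean().item()
    print(f"threshold={threshold:.0e} -> {sparsity:.1%} of scores below threshold")
```
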
## References

- [TensorRT-Model-Optimizer Documentation](https://nvidia.github.io/TensorRT-Model-Optimizer)
- [Sparse Attention Papers Collection](https://github.com/topics/sparse-attention)