Commit 02182f8

Address review feedback
Signed-off-by: Kai Xu <[email protected]>
1 parent 1910fc6 commit 02182f8

8 files changed: +6 −224 lines

examples/llm_sparsity/attention_sparsity/hf_sa.py

Lines changed: 2 additions & 2 deletions
@@ -295,8 +295,8 @@ def main(args):
         "--backend",
         type=str,
         default="pytorch",
-        choices=["pytorch", "triton"],
-        help="Backend to use for sparse attention computation (default: pytorch)",
+        choices=["pytorch"],
+        help="Backend for sparse attention (default: pytorch). More backends coming soon.",
     )
 
     # Sequence length arguments
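A quick illustration of what the narrowed choice means for callers of hf_sa.py; the parser below is a standalone sketch for illustration, not the example script itself:

import argparse

# Standalone sketch of the narrowed option: only the "pytorch" backend is
# accepted now, so "--backend triton" fails argument parsing instead of
# reaching the example code.
parser = argparse.ArgumentParser(description="illustrative parser only")
parser.add_argument(
    "--backend",
    type=str,
    default="pytorch",
    choices=["pytorch"],
    help="Backend for sparse attention (default: pytorch). More backends coming soon.",
)

args = parser.parse_args(["--backend", "pytorch"])
print(args.backend)  # -> pytorch
# parser.parse_args(["--backend", "triton"])  # would now exit: invalid choice: 'triton'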

modelopt/torch/sparsity/attention_sparsity/conversion.py

Lines changed: 0 additions & 61 deletions
@@ -299,64 +299,3 @@ def enable_sparse_attention(model: nn.Module, wildcard_or_filter_func: str | Cal
 
         if matched:
             module.enable()
-
-
-def print_sparse_attention_summary(model: nn.Module):
-    """Print summary of sparse attention modules in the model.
-
-    Similar to mtq.print_quant_summary for API consistency.
-
-    Args:
-        model: Model with sparse attention applied
-
-    Prints:
-        - Total sparse attention modules
-        - Enabled vs disabled count
-        - Method distribution
-        - Configuration summary by module
-
-    Example:
-        >>> import modelopt.torch.sparsity.attention_sparsity as sparse_attn
-        >>> model = sparse_attn.sparsify(model, config)
-        >>> sparse_attn.print_sparse_attention_summary(model)
-    """
-    sparse_modules = []
-    for name, module in model.named_modules():
-        if isinstance(module, SparseAttentionModule):
-            sparse_modules.append((name, module))
-
-    if not sparse_modules:
-        print("No sparse attention modules found in model")
-        return
-
-    enabled_count = sum(1 for _, m in sparse_modules if m.is_enabled)
-    disabled_count = len(sparse_modules) - enabled_count
-
-    # Count methods
-    method_counts = {}
-    for _, module in sparse_modules:
-        method = getattr(module, "_method", "unknown")
-        method_counts[method] = method_counts.get(method, 0) + 1
-
-    print(f"Total sparse attention modules: {len(sparse_modules)}")
-    print(f"Enabled: {enabled_count}")
-    print(f"Disabled: {disabled_count}")
-
-    if method_counts:
-        print("\nMethods:")
-        for method, count in sorted(method_counts.items()):
-            print(f"{method}: {count}")
-
-    for name, module in sparse_modules:
-        method = getattr(module, "_method", "unknown")
-        threshold = getattr(module, "_threshold", "N/A")
-
-        # Format threshold nicely
-        if isinstance(threshold, dict):
-            threshold_str = str(threshold)
-        elif isinstance(threshold, float):
-            threshold_str = f"{threshold:.2e}"
-        else:
-            threshold_str = str(threshold)
-
-        print(f"{name}: Method: {method}, Threshold: {threshold_str}")

modelopt/torch/sparsity/attention_sparsity/plugins/huggingface.py

Lines changed: 2 additions & 8 deletions
@@ -15,17 +15,13 @@
 
 """Dynamic sparse attention registration for HuggingFace models."""
 
-import logging
-
 import torch.nn as nn
 import transformers
 
 from modelopt.torch.opt.dynamic import DynamicModule
 
 from ..sparse_attention import SparseAttentionModule, SparseAttentionRegistry
 
-logger = logging.getLogger(__name__)
-
 
 class _GenericSparseAttention(SparseAttentionModule):
     """Generic sparse attention that works with any HF attention module.
@@ -94,12 +90,10 @@ def register_sparse_attention_on_the_fly(model: nn.Module) -> bool:
             SparseAttentionRegistry.register({module_type: type_name})(_GenericSparseAttention)
             attention_types.add(module_type)
             registered_count += 1
-            logger.info(f"Registered {type_name} for sparse attention optimization")
+            print(f"Registered {type_name} for sparse attention optimization")
 
     if registered_count > 0:
-        logger.info(
-            f"Dynamically registered {registered_count} attention module types for sparsity"
-        )
+        print(f"Dynamically registered {registered_count} attention module types for sparsity")
 
     return registered_count > 0
 
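For context, a hedged usage sketch of the plugin after the logging change; the tiny LLaMA model below is purely illustrative, and the printed lines correspond to the print calls introduced above:

import transformers

from modelopt.torch.sparsity.attention_sparsity.plugins.huggingface import (
    register_sparse_attention_on_the_fly,
)

# Illustrative tiny model; any HF model whose attention classes are not yet in
# SparseAttentionRegistry would exercise the same code path.
config = transformers.LlamaConfig(
    hidden_size=32, intermediate_size=64, num_hidden_layers=2, num_attention_heads=4
)
model = transformers.LlamaForCausalLM(config)

# Prints one line per newly registered attention class, e.g.
#   Registered LlamaAttention for sparse attention optimization
#   Dynamically registered 1 attention module types for sparsity
registered_any = register_sparse_attention_on_the_fly(model)
print(registered_any)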

tests/examples/llm_sparsity/attention_sparsity/test_attention_sparsity.py

Lines changed: 0 additions & 2 deletions
@@ -17,7 +17,6 @@
 
 import pytest
 from _test_utils.examples.run_command import extend_cmd_parts, run_example_command
-from _test_utils.torch.misc import minimum_gpu
 
 
 def run_attention_sparsity_command(*, model: str, method: str = "skip_softmax", **kwargs):
@@ -42,7 +41,6 @@ def run_attention_sparsity_command(*, model: str, method: str = "skip_softmax",
     run_example_command(cmd_parts, "llm_sparsity/attention_sparsity")
 
 
-@minimum_gpu(1)
 @pytest.mark.parametrize("method", ["skip_softmax"])
 def test_attention_sparsity(tiny_llama_path, tmp_path, method):
     """Test sparse attention with TinyLlama."""

tests/gpu/torch/sparsity/attention_sparsity/test_attention_sparsity_gpu.py

Lines changed: 0 additions & 3 deletions
@@ -29,9 +29,6 @@
 
 import modelopt.torch.sparsity.attention_sparsity as sparse_attn
 
-# Skip all tests if GPU is not available
-pytestmark = pytest.mark.skipif(not torch.cuda.is_available(), reason="GPU not available")
-
 
 class TestAttentionSparsityGPU:
     """GPU tests for attention sparsity."""

tests/gpu/torch/sparsity/attention_sparsity/test_integration_gpu.py

Lines changed: 2 additions & 5 deletions
@@ -24,9 +24,6 @@
 from modelopt.torch.sparsity.attention_sparsity import SparseAttentionConfig
 from modelopt.torch.sparsity.attention_sparsity.sparse_attention import SparseAttentionModule
 
-# Skip all tests if GPU is not available
-pytestmark = pytest.mark.skipif(not torch.cuda.is_available(), reason="GPU not available")
-
 
 @pytest.fixture(scope="module")
 def tiny_llama_dir(tmp_path_factory):
@@ -35,8 +32,8 @@ def tiny_llama_dir(tmp_path_factory):
         tmp_path_factory.mktemp("tiny_llama"),
         with_tokenizer=True,
         num_hidden_layers=2,  # Minimal layers for fast testing
-        hidden_size=512,
-        intermediate_size=1024,
+        hidden_size=32,
+        intermediate_size=64,
     )
 
 
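For a sense of how small the shrunken fixture model is, a rough standalone equivalent built with transformers; the head counts and vocab size are assumptions, since the fixture's own helper is not part of this diff:

import transformers

# Rough equivalent of the shrunken fixture (2 layers, hidden_size=32,
# intermediate_size=64); head counts and vocab size are assumed values.
tiny_config = transformers.LlamaConfig(
    num_hidden_layers=2,
    hidden_size=32,
    intermediate_size=64,
    num_attention_heads=4,
    num_key_value_heads=4,
    vocab_size=32000,
)
tiny_model = transformers.LlamaForCausalLM(tiny_config)
# Roughly two million parameters under these assumptions, dominated by the
# embeddings, so the integration test loads and runs quickly.
print(sum(p.numel() for p in tiny_model.parameters()))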

tests/unit/torch/sparsity/attention_sparsity/test_sparse_attention_config.py

Lines changed: 0 additions & 129 deletions
This file was deleted.

tests/unit/torch/sparsity/attention_sparsity/test_sparse_attention_conversion.py

Lines changed: 0 additions & 14 deletions
@@ -31,7 +31,6 @@
 from modelopt.torch.sparsity.attention_sparsity.conversion import (
     disable_sparse_attention,
     enable_sparse_attention,
-    print_sparse_attention_summary,
 )
 from modelopt.torch.sparsity.attention_sparsity.sparse_attention import SparseAttentionModule
 
@@ -171,19 +170,6 @@ def test_disable_enable_functions(self):
             if isinstance(module, SparseAttentionModule):
                 assert module.is_enabled
 
-    def test_print_sparse_attention_summary(self, capsys):
-        """Test print_sparse_attention_summary function."""
-        model = SimpleAttentionModel()
-        model = sparse_attn.sparsify(model, FLASH_SKIP_SOFTMAX_DEFAULT_CFG)
-
-        # Print summary
-        print_sparse_attention_summary(model)
-
-        # Capture output
-        captured = capsys.readouterr()
-        assert "Total sparse attention modules:" in captured.out
-        assert "Enabled:" in captured.out
-
     def test_restore_sparse_attention_model(self):
         """Test save/restore via modelopt_state."""
         # Create and sparsify original model
