
Commit aab74c1

[Kernel] Remove all syncs from STA & VSA kernels (#517)
1 parent f89d869 commit aab74c1

12 files changed: +250 -139 lines changed


.github/workflows/pr-test.yml

Lines changed: 97 additions & 26 deletions
@@ -14,13 +14,9 @@ on:
       - ".github/workflows/pr-test.yml"
       - "pyproject.toml"
       - "docker/Dockerfile.python3.12"
+      - "csrc/**"
   workflow_dispatch:
     inputs:
-      custom_image:
-        description: "Custom image from this repository (default: fastvideo-dev:py3.12-latest)"
-        required: false
-        default: "fastvideo-dev:py3.12-latest"
-        type: string
       run_encoder_test:
         description: "Run encoder-test"
         required: false
@@ -56,6 +52,16 @@ on:
         required: false
         default: false
         type: boolean
+      run_precision_test_STA:
+        description: "Run precision-test-STA"
+        required: false
+        default: false
+        type: boolean
+      run_precision_test_VSA:
+        description: "Run precision-test-VSA"
+        required: false
+        default: false
+        type: boolean
       run_nightly_test:
         description: "Run nightly-test"
         required: false
@@ -65,6 +71,7 @@ on:
 env:
   PYTHONUNBUFFERED: "1"

+
 concurrency:
   group: pr-test-${{ github.ref }}
   cancel-in-progress: true
@@ -84,44 +91,69 @@ jobs:
       training-test: ${{ steps.filter.outputs.training-test }}
       training-test-VSA: ${{ steps.filter.outputs.training-test-VSA }}
       inference-test-STA: ${{ steps.filter.outputs.inference-test-STA }}
+      precision-test-STA: ${{ steps.filter.outputs.precision-test-STA }}
+      precision-test-VSA: ${{ steps.filter.outputs.precision-test-VSA }}
     steps:
       - uses: actions/checkout@v4
       - uses: dorny/paths-filter@v3
         id: filter
         with:
           filters: |
+            # Define reusable path patterns
+            common-paths: &common-paths
+              - 'pyproject.toml'
+              - 'docker/Dockerfile.python3.12'
+            sta-kernel-paths: &sta-kernel-paths
+              - 'csrc/attn/st_attn/**'
+              - 'csrc/attn/setup_sta.py'
+              - 'csrc/attn/config_sta.py'
+              - 'csrc/attn/st_attn.cpp'
+            vsa-kernel-paths: &vsa-kernel-paths
+              - 'csrc/attn/vsa/**'
+              - 'csrc/attn/tk/**'
+              - 'csrc/attn/setup_vsa.py'
+              - 'csrc/attn/config_vsa.py'
+              - 'csrc/attn/vsa.cpp'
+            vsa-paths: &vsa-paths
+              - 'fastvideo/v1/**'
+              - *common-paths
+              - *vsa-kernel-paths
+
+            # Actual tests
             encoder-test:
               - 'fastvideo/v1/models/encoders/**'
               - 'fastvideo/v1/models/loaders/**'
               - 'fastvideo/v1/tests/encoders/**'
-              - 'pyproject.toml'
-              - 'docker/Dockerfile.python3.12'
+              - *common-paths
             vae-test:
               - 'fastvideo/v1/models/vaes/**'
               - 'fastvideo/v1/models/loaders/**'
               - 'fastvideo/v1/tests/vaes/**'
-              - 'pyproject.toml'
-              - 'docker/Dockerfile.python3.12'
+              - *common-paths
             transformer-test:
               - 'fastvideo/v1/models/dits/**'
               - 'fastvideo/v1/models/loaders/**'
               - 'fastvideo/v1/tests/transformers/**'
               - 'fastvideo/v1/layers/**'
               - 'fastvideo/v1/attention/**'
-              - 'pyproject.toml'
-              - 'docker/Dockerfile.python3.12'
+              - *common-paths
             training-test:
               - 'fastvideo/v1/**'
-              - 'pyproject.toml'
-              - 'docker/Dockerfile.python3.12'
+              - *common-paths
            training-test-VSA:
               - 'fastvideo/v1/**'
-              - 'pyproject.toml'
-              - 'docker/Dockerfile.python3.12'
+              - *common-paths
+              - *vsa-kernel-paths
             inference-test-STA:
               - 'fastvideo/v1/**'
-              - 'pyproject.toml'
-              - 'docker/Dockerfile.python3.12'
+              - *common-paths
+              - *sta-kernel-paths
+            precision-test-STA:
+              - *common-paths
+              - *sta-kernel-paths
+            precision-test-VSA:
+              - *common-paths
+              - *vsa-kernel-paths

   encoder-test:
     needs: change-filter
@@ -134,7 +166,7 @@ jobs:
       gpu_type: "NVIDIA A40"
       gpu_count: 1
       volume_size: 100
-      image: "ghcr.io/${{ github.repository }}/${{ github.event.inputs.custom_image || 'fastvideo-dev:py3.12-latest' }}"
+      image: "ghcr.io/${{ github.repository }}/fastvideo-dev:py3.12-latest"
       test_command: "uv pip install -e .[test] && pytest ./fastvideo/v1/tests/encoders -s"
       timeout_minutes: 30
     secrets:
@@ -152,7 +184,7 @@ jobs:
       gpu_type: "NVIDIA A40"
       gpu_count: 1
       volume_size: 100
-      image: "ghcr.io/${{ github.repository }}/${{ github.event.inputs.custom_image || 'fastvideo-dev:py3.12-latest' }}"
+      image: "ghcr.io/${{ github.repository }}/fastvideo-dev:py3.12-latest"
       test_command: "uv pip install -e .[test] && pytest ./fastvideo/v1/tests/vaes -s"
       timeout_minutes: 30
     secrets:
@@ -170,7 +202,7 @@ jobs:
       gpu_type: "NVIDIA L40S"
       gpu_count: 1
       volume_size: 100
-      image: "ghcr.io/${{ github.repository }}/${{ github.event.inputs.custom_image || 'fastvideo-dev:py3.12-latest' }}"
+      image: "ghcr.io/${{ github.repository }}/fastvideo-dev:py3.12-latest"
       test_command: "uv pip install -e .[test] && pytest ./fastvideo/v1/tests/transformers -s"
       timeout_minutes: 30
     secrets:
@@ -216,7 +248,7 @@ jobs:
       gpu_count: 4
       volume_size: 100
       disk_size: 100
-      image: "ghcr.io/${{ github.repository }}/${{ github.event.inputs.custom_image || 'fastvideo-dev:py3.12-latest' }}"
+      image: "ghcr.io/${{ github.repository }}/fastvideo-dev:py3.12-latest"
       test_command: "wandb login $WANDB_API_KEY && uv pip install -e .[test] && pytest ./fastvideo/v1/tests/training/Vanilla -srP"
       timeout_minutes: 30
     secrets:
@@ -236,7 +268,7 @@ jobs:
       gpu_count: 1
       volume_size: 100
       disk_size: 100
-      image: "ghcr.io/${{ github.repository }}/${{ github.event.inputs.custom_image || 'fastvideo-dev:py3.12-latest' }}"
+      image: "ghcr.io/${{ github.repository }}/fastvideo-dev:py3.12-latest"
       test_command: "wandb login $WANDB_API_KEY && uv pip install -e .[test] && pytest ./fastvideo/v1/tests/training/VSA -srP"
       timeout_minutes: 30
     secrets:
@@ -256,13 +288,51 @@ jobs:
       gpu_count: 1
       volume_size: 100
       disk_size: 100
-      image: "ghcr.io/${{ github.repository }}/${{ github.event.inputs.custom_image || 'fastvideo-dev:py3.12-latest' }}"
+      image: "ghcr.io/${{ github.repository }}/fastvideo-dev:py3.12-latest"
       test_command: "uv pip install -e .[test] && pytest ./fastvideo/v1/tests/inference/STA -srP"
       timeout_minutes: 30
     secrets:
       RUNPOD_API_KEY: ${{ secrets.RUNPOD_API_KEY }}
       RUNPOD_PRIVATE_KEY: ${{ secrets.RUNPOD_PRIVATE_KEY }}

+  precision-test-STA:
+    needs: change-filter
+    if: >-
+      (github.event_name != 'workflow_dispatch' && github.event.pull_request.draft == false) ||
+      (github.event_name == 'workflow_dispatch' && github.event.inputs.run_precision_test_STA == 'true')
+    uses: ./.github/workflows/runpod-test.yml
+    with:
+      job_id: "precision-test-STA"
+      gpu_type: "NVIDIA H100 NVL"
+      gpu_count: 1
+      volume_size: 100
+      disk_size: 100
+      image: "ghcr.io/${{ github.repository }}/fastvideo-dev:py3.12-latest"
+      test_command: "uv pip install -e .[test] && python csrc/attn/tests/test_sta.py"
+      timeout_minutes: 30
+    secrets:
+      RUNPOD_API_KEY: ${{ secrets.RUNPOD_API_KEY }}
+      RUNPOD_PRIVATE_KEY: ${{ secrets.RUNPOD_PRIVATE_KEY }}
+
+  precision-test-VSA:
+    needs: change-filter
+    if: >-
+      (github.event_name != 'workflow_dispatch' && github.event.pull_request.draft == false) ||
+      (github.event_name == 'workflow_dispatch' && github.event.inputs.run_precision_test_VSA == 'true')
+    uses: ./.github/workflows/runpod-test.yml
+    with:
+      job_id: "precision-test-VSA"
+      gpu_type: "NVIDIA H100 NVL"
+      gpu_count: 1
+      volume_size: 100
+      disk_size: 100
+      image: "ghcr.io/${{ github.repository }}/fastvideo-dev:py3.12-latest"
+      test_command: "uv pip install -e .[test] && python csrc/attn/tests/test_block_sparse.py"
+      timeout_minutes: 30
+    secrets:
+      RUNPOD_API_KEY: ${{ secrets.RUNPOD_API_KEY }}
+      RUNPOD_PRIVATE_KEY: ${{ secrets.RUNPOD_PRIVATE_KEY }}
+
   nightly-test:
     if: >-
       (github.event_name == 'workflow_dispatch' && github.event.inputs.run_nightly_test == 'true')
@@ -273,7 +343,7 @@ jobs:
       gpu_count: 4
       volume_size: 100
       disk_size: 100
-      image: "ghcr.io/${{ github.repository }}/${{ github.event.inputs.custom_image || 'fastvideo-dev:py3.12-latest' }}"
+      image: "ghcr.io/${{ github.repository }}/fastvideo-dev:py3.12-latest"
       test_command: "wandb login $WANDB_API_KEY && uv pip install -e .[test] && pytest ./fastvideo/v1/tests/nightly/test_e2e_overfit_single_sample.py -vs"
       timeout_minutes: 30
     secrets:
@@ -282,7 +352,8 @@ jobs:
       WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}

   runpod-cleanup:
-    needs: [encoder-test, vae-test, transformer-test, ssim-test] # Add other jobs to this list as you create them
+    # Add other jobs to this list as you create them
+    needs: [encoder-test, vae-test, transformer-test, ssim-test, training-test, training-test-VSA, inference-test-STA, precision-test-STA, precision-test-VSA]
     if: ${{ always() && ((github.event_name != 'workflow_dispatch' && github.event.pull_request.draft == false) || github.event_name == 'workflow_dispatch') }}
     runs-on: ubuntu-latest
     steps:
@@ -299,7 +370,7 @@ jobs:

       - name: Cleanup all RunPod instances
         env:
-          JOB_IDS: '["encoder-test", "vae-test", "transformer-test", "ssim-test-py3.10", "ssim-test-py3.11", "ssim-test-py3.12"]'
+          JOB_IDS: '["encoder-test", "vae-test", "transformer-test", "ssim-test-py3.10", "ssim-test-py3.11", "ssim-test-py3.12", "training-test", "training-test-VSA", "inference-test-STA", "precision-test-STA", "precision-test-VSA"]'
           RUNPOD_API_KEY: ${{ secrets.RUNPOD_API_KEY }}
           GITHUB_RUN_ID: ${{ github.run_id }}
         run: python .github/scripts/runpod_cleanup.py
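The `filters:` refactor above leans on plain YAML anchors (`&common-paths`) and aliases (`*common-paths`) so that `pyproject.toml` and the Dockerfile are no longer repeated in every filter. As a minimal sketch (not part of the commit), parsing a trimmed-down filters block with PyYAML shows what an alias expands to; dorny/paths-filter is expected to accept the nested list this produces, as its anchor-based examples suggest.

```python
# Illustration only: how the anchors/aliases in the `filters: |` block expand.
# Requires PyYAML (`pip install pyyaml`).
import yaml

filters_yaml = """
common-paths: &common-paths
  - 'pyproject.toml'
  - 'docker/Dockerfile.python3.12'
training-test-VSA:
  - 'fastvideo/v1/**'
  - *common-paths
"""

parsed = yaml.safe_load(filters_yaml)
# The alias expands to a nested list inside the filter's path list.
print(parsed["training-test-VSA"])
# ['fastvideo/v1/**', ['pyproject.toml', 'docker/Dockerfile.python3.12']]
```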

csrc/attn/README.md

Lines changed: 8 additions & 2 deletions
@@ -4,7 +4,7 @@


 ## Installation
-We test our code on Pytorch 2.5.0 and CUDA>=12.4. Currently we only have implementation on H100.
+We test our code on Pytorch 2.5.0 and CUDA>=12.4. Currently we only support H100/H200, because ThunderKittens uses TMA but doesn't support Blackwell yet.
 First, install C++20 for ThunderKittens:
 ```bash
 sudo apt update
@@ -53,8 +53,14 @@ out = sliding_tile_attention(q, k, v, window_size, 0, False)

 ## Test
 ```bash
-python test/test_sta.py
+python tests/test_sta.py # test STA
+python tests/test_block_sparse.py # test VSA
 ```
+## Benchmark
+```bash
+python benchmarks/bench_sta.py
+```
+

 ## How Does STA Work?
 We give a demo for 2D STA with window size (6,6) operating on a (10, 10) image.
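As a rough companion to that README demo, here is a toy NumPy sketch of the tile-level mask a 2D STA window induces, assuming (2, 2) pixel tiles so the (6, 6) window spans 3x3 tiles; the border handling (shifting the window so it stays inside the image) is an assumption, not necessarily the kernel's exact policy.

```python
# Toy illustration only, not the STA kernel: tile-level mask for a (10, 10)
# image with a (6, 6) window, assuming (2, 2) pixel tiles (so a 5x5 tile grid
# and a 3x3-tile window).
import numpy as np

H = W = 10   # image size in pixels
T = 2        # assumed tile size -> 5x5 grid of tiles
WIN = 3      # window size in tiles (6x6 pixels)
grid = H // T

def window_start(c, grid, win):
    # Center the window on tile c, then shift it so it stays inside the grid
    # (assumed border policy).
    return min(max(c - win // 2, 0), grid - win)

mask = np.zeros((grid * grid, grid * grid), dtype=bool)
for qi in range(grid):
    for qj in range(grid):
        i0 = window_start(qi, grid, WIN)
        j0 = window_start(qj, grid, WIN)
        for ki in range(i0, i0 + WIN):
            for kj in range(j0, j0 + WIN):
                mask[qi * grid + qj, ki * grid + kj] = True

# Every query tile attends to exactly WIN*WIN = 9 key tiles, i.e. dense
# blocks rather than ragged diagonals, which is what makes STA GPU-friendly.
print(mask.sum(axis=1))  # -> all 9
```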

csrc/attn/bench/bench_sta.py renamed to csrc/attn/benchmarks/bench_sta.py

Lines changed: 32 additions & 36 deletions
@@ -5,6 +5,7 @@
 import numpy as np
 import torch
 from st_attn import sliding_tile_attention
+from triton.testing import do_bench


 def flops(batch, seqlen, nheads, headdim, causal, mode="fwd"):
@@ -13,55 +14,48 @@ def flops(batch, seqlen, nheads, headdim, causal, mode="fwd"):
     return f if mode == "fwd" else (2.5 * f if mode == "bwd" else 3.5 * f)


-def efficiency(flop, time):
-    flop = flop / 1e12
-    time = time / 1e6
-    return flop / time
+def compute_TFLOPS(flops, ms):
+    flops = flops / 1e12
+    ms = ms / 1e3
+    return flops / ms


 def benchmark_attention(configurations):
     results = {'fwd': defaultdict(list), 'bwd': defaultdict(list)}

-    for B, H, N, D, causal in configurations:
+    for B, H, N, D, causal, dit_seq_shape, window_size in configurations:
         print("=" * 60)
         print(f"Timing forward and backward pass for B={B}, H={H}, N={N}, D={D}, causal={causal}")

         q = torch.randn(B, H, N, D, dtype=torch.bfloat16, device='cuda', requires_grad=False).contiguous()
         k = torch.randn(B, H, N, D, dtype=torch.bfloat16, device='cuda', requires_grad=False).contiguous()
         v = torch.randn(B, H, N, D, dtype=torch.bfloat16, device='cuda', requires_grad=False).contiguous()

-        grad_output = torch.randn_like(q, requires_grad=False).contiguous()
+        # grad_output = torch.randn_like(q, requires_grad=False).contiguous()
+        # qg = torch.zeros_like(q, requires_grad=False, dtype=torch.float).contiguous()
+        # kg = torch.zeros_like(k, requires_grad=False, dtype=torch.float).contiguous()
+        # vg = torch.zeros_like(v, requires_grad=False, dtype=torch.float).contiguous()

-        qg = torch.zeros_like(q, requires_grad=False, dtype=torch.float).contiguous()
-        kg = torch.zeros_like(k, requires_grad=False, dtype=torch.float).contiguous()
-        vg = torch.zeros_like(v, requires_grad=False, dtype=torch.float).contiguous()

-        # Prepare for timing forward pass
-        start_events_fwd = [torch.cuda.Event(enable_timing=True) for _ in range(10)]
-        end_events_fwd = [torch.cuda.Event(enable_timing=True) for _ in range(10)]
-
-        torch.cuda.empty_cache()
-        torch.cuda.synchronize()
-
-        # Warmup for forward pass
-        for _ in range(10):
-            o = sliding_tile_attention(q, k, v, [[3, 6, 10]] * 24, 0, False, '18x48x80')
+        # # Warmup for forward pass
+        # for _ in range(10):
+        #     o = sliding_tile_attention(q, k, v, [[3, 6, 10]] * 24, 0, False, dit_seq_shape)

-        # Time the forward pass
-        for i in range(10):
-            start_events_fwd[i].record()
-            o = sliding_tile_attention(q, k, v, [[3, 6, 10]] * 24, 0, False, '18x48x80')
-            end_events_fwd[i].record()
+        # # Time the forward pass
+        # for i in range(10):
+        #     start_events_fwd[i].record()
+        #     o = sliding_tile_attention(q, k, v, [[3, 6, 10]] * 24, 0, False, dit_seq_shape)
+        #     end_events_fwd[i].record()
+        ms = do_bench(lambda: sliding_tile_attention(q, k, v, [window_size] * 24, 0, False, dit_seq_shape))

-        torch.cuda.synchronize()
-        times_fwd = [s.elapsed_time(e) for s, e in zip(start_events_fwd, end_events_fwd)]
-        time_us_fwd = np.mean(times_fwd) * 1000
+        # times_fwd = [s.elapsed_time(e) for s, e in zip(start_events_fwd, end_events_fwd)]
+        # time_us_fwd = np.mean(times_fwd) * 1000

-        tflops_fwd = efficiency(flops(B, N, H, D, causal, 'fwd'), time_us_fwd)
+        tflops_fwd = compute_TFLOPS(flops(B, N, H, D, causal, 'fwd'), ms)
         results['fwd'][(D, causal)].append((N, tflops_fwd))

-        print(f"Average time for forward pass in us: {time_us_fwd:.2f}")
-        print(f"Average efficiency for forward pass in TFLOPS: {tflops_fwd}")
+        print(f"Average time for forward pass (ms): {ms:.2f}")
+        print(f"Average TFLOPS: {tflops_fwd}")
         print("-" * 60)

         # torch.cuda.empty_cache()
@@ -85,15 +79,14 @@ def benchmark_attention(configurations):
         # times_bwd = [s.elapsed_time(e) for s, e in zip(start_events_bwd, end_events_bwd)]
         # time_us_bwd = np.mean(times_bwd) * 1000

-        # tflops_bwd = efficiency(flops(B, N, H, D, causal, 'bwd'), time_us_bwd)
+        # tflops_bwd = compute_TFLOPS(flops(B, N, H, D, causal, 'bwd'), ms)
         # results['bwd'][(D, causal)].append((N, tflops_bwd))

-        # print(f"Average time for backward pass in us: {time_us_bwd:.2f}")
-        # print(f"Average efficiency for backward pass in TFLOPS: {tflops_bwd}")
-        print("=" * 60)
+        # print(f"Average time for backward pass(ms): {ms:.2f}")
+        # print(f"Average TFLOPS: {tflops_bwd}")
+        # print("=" * 60)

         torch.cuda.empty_cache()
-        torch.cuda.synchronize()

     return results

@@ -124,7 +117,10 @@ def plot_results(results):

 # Example list of configurations to test
 configurations = [
-    (2, 24, 69120, 128, False),
+    (2, 24, 69120, 128, False, '18x48x80', [3, 6, 10]),
+    (2, 24, 69120, 128, True, '18x48x80', [3, 6, 10]),
+    (2, 24, 82944, 128, False, '36x48x48', [3, 3, 6]), # Stepvideo
+    (2, 24, 82944, 128, True, '36x48x48', [3, 3, 6]),
     # (16, 16, 768*16, 128, False),
     # (16, 16, 768*2, 128, False),
     # (16, 16, 768*4, 128, False),
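In the updated benchmark, the hand-rolled CUDA events and `torch.cuda.synchronize()` calls give way to `triton.testing.do_bench`, which handles warmup, event timing, and synchronization itself and returns a time in milliseconds; `compute_TFLOPS` then converts that to throughput. A standalone sketch of the same pattern is below; the matmul workload and its shapes are placeholders, not the STA call.

```python
# Minimal sketch of the do_bench + TFLOPS pattern used by the benchmark.
# Requires a CUDA GPU and triton; the matmul here is an arbitrary stand-in.
import torch
from triton.testing import do_bench

M = N = K = 4096
a = torch.randn(M, K, device="cuda", dtype=torch.bfloat16)
b = torch.randn(K, N, device="cuda", dtype=torch.bfloat16)

# do_bench runs warmup iterations and times the callable with CUDA events,
# returning milliseconds, so no explicit torch.cuda.synchronize() is needed.
ms = do_bench(lambda: a @ b)

# Same conversion as compute_TFLOPS: (flops / 1e12) / (ms / 1e3).
matmul_flops = 2 * M * N * K
tflops = (matmul_flops / 1e12) / (ms / 1e3)
print(f"{ms:.3f} ms, {tflops:.1f} TFLOPS")
```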
File renamed without changes.
