Skip to content

Commit 6f8b52e

Browse files
cyx-6 and yzh119 authored
Add ruff to pre-commit (#1201)
<!-- .github/pull_request_template.md --> ## 📌 Description Add ruff to pre-commit and reformat code to pass ruff rules. <!-- What does this PR do? Briefly describe the changes and why they’re needed. --> ## 🔍 Related Issues <!-- Link any related issues here --> ## 🚀 Pull Request Checklist Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete. ### ✅ Pre-commit Checks - [x] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method). - [x] I have installed the hooks with `pre-commit install`. - [x] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues. > If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/). ## 🧪 Tests - [x] Tests have been added or updated as needed. - [x] All tests are passing (`unittest`, etc.). ## Reviewer Notes <!-- Optional: anything you'd like reviewers to focus on, concerns, etc. --> --------- Co-authored-by: Yaxing Cai <[email protected]> Co-authored-by: Zihao Ye <[email protected]>
1 parent 10cba70 commit 6f8b52e

File tree

99 files changed

+877
-811
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

99 files changed

+877
-811
lines changed

.pre-commit-config.yaml

Lines changed: 11 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -35,18 +35,6 @@ repos:
3535
- id: remove-crlf
3636

3737
# Formatters
38-
- repo: https://github.com/psf/black-pre-commit-mirror
39-
rev: 24.8.0
40-
hooks:
41-
- id: black
42-
exclude: flashinfer/tuning_configs/.*\.py
43-
44-
- repo: https://github.com/pycqa/isort
45-
rev: 5.13.2
46-
hooks:
47-
- id: isort
48-
args: ["--profile=black"] # <-- this one
49-
5038
- repo: https://github.com/pre-commit/mirrors-clang-format
5139
rev: v19.1.1
5240
hooks:
@@ -56,6 +44,16 @@ repos:
5644
(?x)^(3rdparty/.* flashinfer/jit/aot_config.py)$
5745
5846
- repo: https://github.com/pre-commit/mirrors-mypy
59-
rev: '' # Use the sha / tag you want to point at
47+
rev: 'v1.17.1' # Use the sha / tag you want to point at
6048
hooks:
6149
- id: mypy
50+
51+
- repo: https://github.com/astral-sh/ruff-pre-commit
52+
# Ruff version.
53+
rev: v0.12.8
54+
hooks:
55+
# Run the linter.
56+
- id: ruff-check
57+
# Run the formatter.
58+
- id: ruff-format
59+
types_or: [ python, pyi ]

benchmarks/bench_append_paged_kv_cache.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import argparse
22
import dataclasses
3-
from typing import Tuple, cast
3+
from typing import Tuple
44

55
import numpy as np
66
import torch
@@ -139,10 +139,10 @@ def fn() -> None:
139139
print(
140140
f"model: {model_name:8}",
141141
f"seqlens: {seqlens!r:{seqlen_strlen}}",
142-
f"convert: {convert_latency_ms*1e3:2.0f}us",
143-
f"1layer: {latency_ms*1e3:2.0f}us",
144-
f"{model.num_layers}layers: {all_layers_latency_ms*1e3:3.0f}us",
145-
f"throughput: {throughput*1e-9:8.3f}GB/s",
142+
f"convert: {convert_latency_ms * 1e3:2.0f}us",
143+
f"1layer: {latency_ms * 1e3:2.0f}us",
144+
f"{model.num_layers}layers: {all_layers_latency_ms * 1e3:3.0f}us",
145+
f"throughput: {throughput * 1e-9:8.3f}GB/s",
146146
)
147147
print("---")
148148

benchmarks/bench_append_paged_mla_kv_cache.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import argparse
22
import dataclasses
3-
from typing import Tuple, cast
3+
from typing import Tuple
44

55
import numpy as np
66
import torch
@@ -122,10 +122,10 @@ def fn() -> None:
122122
print(
123123
f"model: {model_name:8}",
124124
f"seqlens: {seqlens!r:{seqlen_strlen}}",
125-
f"convert: {convert_latency_ms*1e3:2.0f}us",
126-
f"1layer: {latency_ms*1e3:2.0f}us",
127-
f"{model.num_layers}layers: {all_layers_latency_ms*1e3:3.0f}us",
128-
f"throughput: {throughput*1e-9:8.3f}GB/s",
125+
f"convert: {convert_latency_ms * 1e3:2.0f}us",
126+
f"1layer: {latency_ms * 1e3:2.0f}us",
127+
f"{model.num_layers}layers: {all_layers_latency_ms * 1e3:3.0f}us",
128+
f"throughput: {throughput * 1e-9:8.3f}GB/s",
129129
)
130130
print("---")
131131

benchmarks/bench_batch_attention.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,6 @@ def main() -> None:
145145
sweep["num_kv_heads"],
146146
sweep["num_qo_heads"],
147147
):
148-
149148
ms_old, ms_new, mem_MB, bw_old, bw_new = run_bench(
150149
kv_lens,
151150
qo_lens,

benchmarks/bench_batch_decode.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ def bench_batch_decode(
7575
f"batch_size={batch_size}, seq_len={seq_len}, num_qo_heads={num_qo_heads}, num_kv_heads={num_kv_heads}, head_dim={head_dim}, page_block_size={page_block_size}, q_dtype={q_dtype}, kv_dtype={kv_dtype}"
7676
)
7777
print(f"execution time: {ms}ms")
78-
print(f"memory bandwidth: {io / ms / 1024 / 1024 :.2f} GB/s")
78+
print(f"memory bandwidth: {io / ms / 1024 / 1024:.2f} GB/s")
7979

8080

8181
if __name__ == "__main__":

benchmarks/bench_blackwell_attention.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ def bench_fmha_blackwell(
6161
q_data_type=dtype,
6262
kv_data_type=dtype,
6363
)
64-
o = wrapper.run(q, k, v)
64+
_o = wrapper.run(q, k, v)
6565
measurements = bench_gpu_time(
6666
lambda: wrapper.run(q, k, v),
6767
dry_run_time_ms=100,

benchmarks/bench_cutlass_fused_moe.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,6 @@ def bench_cutlass_fused_moe(
8484
n = intermediate_size
8585
k = hidden_size
8686
otype = torch.bfloat16
87-
wtype = torch.float8_e4m3fn
8887
w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=otype) / 10
8988
w1_cutlass = torch.cat((w1[:, n:, :], w1[:, :n, :]), dim=1).contiguous()
9089

benchmarks/bench_fused_add_rmsnorm.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import argparse
2-
from typing import cast
32

43
import numpy as np
54
import torch
@@ -54,8 +53,8 @@ def fn() -> None:
5453
f"batch_size: {batch_size:3},",
5554
f"hidden_size: {hidden_size:5},",
5655
f"dtype: {dtype_str:8},",
57-
f"latency: {latency_ms*1e3:2.0f}us,",
58-
f"throughput: {throughput*1e-9:7.3f}GB/s",
56+
f"latency: {latency_ms * 1e3:2.0f}us,",
57+
f"throughput: {throughput * 1e-9:7.3f}GB/s",
5958
)
6059

6160
print("---")

benchmarks/bench_mixed_attention.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -213,8 +213,7 @@ def run_bench(
213213
for idx, (p_q_lens, p_kv_lens, d_q_len, d_kv_len) in enumerate(
214214
zip(p_q_configs, p_kv_configs, d_q_len_configs, d_kv_len_configs)
215215
):
216-
217-
print(f"===== Benchmark {idx+1}: (kv_len, qo_len) set =====")
216+
print(f"===== Benchmark {idx + 1}: (kv_len, qo_len) set =====")
218217
run_bench(
219218
p_q_lens,
220219
p_kv_lens,

benchmarks/bench_renorm.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ def main():
4949
io = (probs.numel() * probs.element_size()) * 2
5050
bandwidth = io * 1e-6 / ms
5151
print(
52-
f"vocab_size: {vocab_size}, batch_size: {batch_size}, distrib: {distrib.__name__}, p: {p}, duration: {ms*1e3:.2f} us, effective bandwidth: {bandwidth:.2f} GB/s"
52+
f"vocab_size: {vocab_size}, batch_size: {batch_size}, distrib: {distrib.__name__}, p: {p}, duration: {ms * 1e3:.2f} us, effective bandwidth: {bandwidth:.2f} GB/s"
5353
)
5454

5555
print("---")
@@ -75,7 +75,7 @@ def main():
7575
io = (probs.numel() * probs.element_size()) * 2
7676
bandwidth = io * 1e-6 / ms
7777
print(
78-
f"vocab_size: {vocab_size}, batch_size: {batch_size}, distrib: {distrib.__name__}, k: {k}, duration: {ms*1e3:.2f} us, effective bandwidth: {bandwidth:.2f} GB/s"
78+
f"vocab_size: {vocab_size}, batch_size: {batch_size}, distrib: {distrib.__name__}, k: {k}, duration: {ms * 1e3:.2f} us, effective bandwidth: {bandwidth:.2f} GB/s"
7979
)
8080

8181
print("---")
@@ -100,7 +100,7 @@ def main():
100100
io = (logits.numel() * logits.element_size()) * 2
101101
bandwidth = io * 1e-6 / ms
102102
print(
103-
f"vocab_size: {vocab_size}, batch_size: {batch_size}, distrib: {distrib.__name__}, k: {k}, duration: {ms*1e3:.2f} us, effective bandwidth: {bandwidth:.2f} GB/s"
103+
f"vocab_size: {vocab_size}, batch_size: {batch_size}, distrib: {distrib.__name__}, k: {k}, duration: {ms * 1e3:.2f} us, effective bandwidth: {bandwidth:.2f} GB/s"
104104
)
105105

106106

0 commit comments

Comments (0)