
Commit 85d75ca

Add mypy to pre-commit (#1179)
1 parent 73702b7 commit 85d75ca

34 files changed: +180 −196 lines

.pre-commit-config.yaml

Lines changed: 5 additions & 0 deletions
@@ -54,3 +54,8 @@ repos:
       types_or: [c++, c, cuda]
       exclude: |
         (?x)^(3rdparty/.* flashinfer/jit/aot_config.py)$
+
+  - repo: https://github.com/pre-commit/mirrors-mypy
+    rev: ''  # Use the sha / tag you want to point at
+    hooks:
+      - id: mypy
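
The new hook runs mypy on the repository's Python files at each commit. Note that rev: '' is the placeholder from the mirrors-mypy README and is normally pinned to a concrete tag (for example via pre-commit autoupdate) before the hook can run; once configured, the whole tree can be checked with pre-commit run mypy --all-files. As a minimal sketch of the class of bug this catches statically (a hypothetical snippet, not code from this commit; the flashinfer/cudnn/prefill.py hunk below fixes exactly this pattern):

    from typing import Tuple

    def head_geometry(shape: Tuple[int, int, int]) -> int:
        # mypy: error: Too many values to unpack (2 expected, 3 provided)
        heads, dim = shape
        return heads * dim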

benchmarks/bench_mixed_attention.py

Lines changed: 2 additions & 2 deletions
@@ -185,8 +185,8 @@ def run_bench(
     full_kv_len = np.random.randint(2000, 16000, size=bsz)
     p_q_lens = []
     p_kv_lens = []
-    d_q_len = []
-    d_kv_len = []
+    d_q_lens = []
+    d_kv_lens = []
 
     for i in range(bsz):
         if i % stride == 0:

ci/scripts/jenkins/git_utils.py

Lines changed: 9 additions & 12 deletions
@@ -30,13 +30,13 @@
 
 def compress_query(query: str) -> str:
     query = query.replace("\n", "")
-    query = re.sub("\s+", " ", query)
+    query = re.sub(r"\s+", " ", query)
     return query
 
 
 def post(url: str, body: Optional[Any] = None, auth: Optional[Tuple[str, str]] = None):
     logging.info(f"Requesting POST to {url} with {body}")
-    headers = {}
+    headers: Dict[Any, Any] = {}
     req = request.Request(url, headers=headers, method="POST")
     if auth is not None:
         auth_str = base64.b64encode(f"{auth[0]}:{auth[1]}".encode())
@@ -46,9 +46,8 @@ def post(url: str, body: Optional[Any] = None, auth: Optional[Tuple[str, str]] =
         body = ""
 
     req.add_header("Content-Type", "application/json; charset=utf-8")
-    data = json.dumps(body)
-    data = data.encode("utf-8")
-    req.add_header("Content-Length", len(data))
+    data = json.dumps(body).encode("utf-8")
+    req.add_header("Content-Length", str(len(data)))
 
     with request.urlopen(req, data) as response:
         return response.read()
@@ -119,9 +118,8 @@ def _request(
         logging.info(f"Requesting {method} to {full_url} with {body}")
         req = request.Request(full_url, headers=self.headers(), method=method.upper())
         req.add_header("Content-Type", "application/json; charset=utf-8")
-        data = json.dumps(body)
-        data = data.encode("utf-8")
-        req.add_header("Content-Length", len(data))
+        data = json.dumps(body).encode("utf-8")
+        req.add_header("Content-Length", str(len(data)))
 
         try:
             with request.urlopen(req, data) as response:
@@ -206,12 +204,11 @@ def find_ccs(body: str) -> List[str]:
     matches = re.findall(r"(cc( @[-A-Za-z0-9]+)+)", body, flags=re.MULTILINE)
     matches = [full for full, last in matches]
 
-    reviewers = []
+    reviewers = set()
     for match in matches:
         if match.startswith("cc "):
             match = match.replace("cc ", "")
         users = [x.strip() for x in match.split("@")]
-        reviewers += users
+        reviewers.update(users)
 
-    reviewers = set(x for x in reviewers if x != "")
-    return list(reviewers)
+    return [x for x in reviewers if x != ""]
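
Two of these fixes are worth a closer look. In a non-raw string literal "\s" is an invalid escape sequence that newer Pythons warn about, so the pattern becomes a raw string; a quick sketch:

    import re

    # Non-raw "\s+" draws "SyntaxWarning: invalid escape sequence '\s'" on
    # recent Python versions; the raw string states exactly what re sees.
    print(re.sub(r"\s+", " ", "a \t b\n  c"))  # -> "a b c"

And typeshed declares Request.add_header(key: str, val: str), so the Content-Length value is stringified explicitly; http.client converts int header values at send time anyway, which is why the old code happened to work at runtime. The find_ccs rewrite deduplicates reviewers in a set while collecting rather than at the end, with unchanged results.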

docs/conf.py

Lines changed: 3 additions & 6 deletions
@@ -2,6 +2,7 @@
 import sys
 import warnings
 from pathlib import Path
+from typing import Any, List
 
 # import tlcpack_sphinx_addon
 # Configuration file for the Sphinx documentation builder.
@@ -64,16 +65,12 @@
 
 html_theme = "furo"  # "sphinx_rtd_theme"
 
-templates_path = []
+templates_path: List[Any] = []
 
-html_static_path = []
+html_static_path = ["_static"]
 
 html_theme_options = {
     "logo_only": True,
-}
-
-html_static_path = ["_static"]
-html_theme_options = {
     "light_logo": "FlashInfer-white-background.png",
     "dark_logo": "FlashInfer-black-background.png",
 }
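
Two different mypy complaints drive this hunk. The empty templates_path literal needs an annotation because mypy cannot infer an element type for a bare [], and the duplicated html_static_path / html_theme_options definitions are merged so each name is assigned once; re-assigning html_theme_options with a dict of a different value type would otherwise be rejected as an incompatible assignment. A minimal sketch of the first error:

    from typing import Any, List

    # Unannotated:   paths = []
    # mypy reports:  error: Need type annotation for "paths"  [var-annotated]
    paths: List[Any] = []

Merging the dicts also fixes a real documentation bug: the second html_theme_options assignment used to silently discard the logo_only setting.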

flashinfer/artifacts.py

Lines changed: 1 addition & 1 deletion
@@ -19,7 +19,7 @@
 import time
 from concurrent.futures import ThreadPoolExecutor, as_completed
 
-import requests
+import requests  # type: ignore[import-untyped]
 
 from .jit.core import logger
 from .jit.cubin_loader import FLASHINFER_CUBINS_REPOSITORY, get_cubin
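
requests ships without bundled type information, so under the new hook a bare import fails with "Library stubs not installed for 'requests'" (error code import-untyped). The inline ignore is the lightweight fix; installing the types-requests stubs package would be the alternative if checking of the requests API itself were wanted.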

flashinfer/autotuner.py

Lines changed: 14 additions & 10 deletions
@@ -583,20 +583,22 @@ def _optimization_profiles(
 
         generated_profiles: List[OptimizationProfile] = []
 
-        dynamic_dims = []
+        dynamic_dims: List[Tuple[Any, ...]] = []
 
         for spec in tuning_config.dynamic_tensor_specs:
            assert inspect.isfunction(spec.gen_tuning_buckets) or isinstance(
                spec.gen_tuning_buckets, (list, tuple)
            ), "The given dynamic dimension must provide a opt value generation function or a list of opt values"
            if inspect.isfunction(spec.gen_tuning_buckets):
                opt_shapes = spec.gen_tuning_buckets(
-                   base_profile.shapes[spec.input_idx][spec.dim_idx].val
+                   base_profile.shapes[spec.input_idx][spec.dim_idx]._opt()
                )
            else:
                opt_shapes = spec.gen_tuning_buckets
-           opt_shapes_max = tuple(opt_shapes[1:]) + (float("inf"),)
-           opt_shapes_max = {v1: v2 for v1, v2 in zip(opt_shapes, opt_shapes_max)}
+           opt_shapes_max = {
+               v1: v2
+               for v1, v2 in zip(opt_shapes, tuple(opt_shapes[1:]) + (float("inf"),))
+           }
            dynamic_dims.append(
                (spec.input_idx, spec.dim_idx, opt_shapes_max, opt_shapes)
            )
@@ -617,10 +619,12 @@ def _optimization_profiles(
            )
 
            # Adjust the profile to satisfy the constraints
-           for spec in tuning_config.constraint_specs:
-               min_value = opt_value = max_value = spec.infer_shape(p.get_opt_shapes())
-               p.shapes[spec.input_idx][spec.dim_idx] = DynamicDim(
-                   min_value, opt_value, max_value
+           for constraint_spec in tuning_config.constraint_specs:
+               min_value = opt_value = max_value = constraint_spec.infer_shape(
+                   p.get_opt_shapes()
+               )
+               p.shapes[constraint_spec.input_idx][constraint_spec.dim_idx] = (
+                   DynamicDim(min_value, opt_value, max_value)
                )
            generated_profiles.append(p)
            logger.debug(f"[Autotuner]: generated profile: {p}")
@@ -651,8 +655,8 @@ def _find_nearest_profile(
        )
 
        # associated dimensions dependent on other free dynamic dimensions, so assign -1 in the profile
-       for spec in tuning_config.constraint_specs:
-           base_profile[spec.input_idx][spec.dim_idx] = -1
+       for constraint_spec in tuning_config.constraint_specs:
+           base_profile[constraint_spec.input_idx][constraint_spec.dim_idx] = -1
 
        return tuple(tuple(shape) for shape in base_profile)

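The spec to constraint_spec rename is a typing fix rather than a cosmetic one: dynamic_tensor_specs and constraint_specs yield different types, and reusing one loop variable for both makes mypy bind the name to the first element type and reject the second loop. The same reasoning applies to opt_shapes_max, which was first a tuple and then rebound to a dict; the rewrite builds the dict in a single expression. A minimal sketch of the loop-variable error (hypothetical names):

    from typing import List

    def total(ints: List[int], names: List[str]) -> int:
        acc = 0
        for item in ints:
            acc += item
        # Reusing "item" below would yield:
        #   error: Incompatible types in assignment (expression has type
        #   "str", variable has type "int")
        for name in names:
            acc += len(name)
        return acc
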
flashinfer/comm/mnnvl.py

Lines changed: 10 additions & 9 deletions
@@ -19,7 +19,7 @@
 import platform
 import sys
 from dataclasses import dataclass
-from typing import List, Optional
+from typing import Any, Dict, List, Optional
 
 import torch
 from cuda import cuda
@@ -132,16 +132,17 @@ def alloc_and_copy_to_cuda(host_ptr_array: List[int]) -> Optional[int]:
 
 
 if IS_BUILDING_DOCS:
+    # Mock classes for building docs
 
-    class MpiComm:
+    class MpiComm:  # type: ignore[no-redef]
        @classmethod
        def set_mpi_comm(cls, new_comm):
            pass
 
        def __getattr__(self, name):
            return None
 
-    class MnnvlMemory:
+    class MnnvlMemory:  # type: ignore[no-redef]
        initialized: bool = False
 
        current_mem_offset: int = 0
@@ -159,8 +160,8 @@ class MnnvlMemory:
 
        dev_id: int = None
 
-       allocated_map = {}
-       address_refcnt = {}
+       allocated_map: Dict[int, Any] = {}
+       address_refcnt: Dict[int, Any] = {}
 
        def __init__(self, mapping: Mapping, size: int):
            pass
@@ -211,7 +212,7 @@ def supports_mnnvl() -> bool:
    import pynvml
    from mpi4py import MPI
 
-   class MpiComm:
+   class MpiComm:  # type: ignore[no-redef]
        _comm: MPI.Intracomm = MPI.COMM_WORLD
 
        @classmethod
@@ -221,7 +222,7 @@ def set_mpi_comm(cls, new_comm: MPI.Intracomm):
        def __getattr__(self, name):
            return getattr(self._comm, name)
 
-   class MnnvlMemory:
+   class MnnvlMemory:  # type: ignore[no-redef]
        initialized: bool = False
 
        current_mem_offset: int = 0
@@ -239,8 +240,8 @@ class MnnvlMemory:
 
        dev_id: int = None
 
-       allocated_map = {}
-       address_refcnt = {}
+       allocated_map: Dict[int, Any] = {}
+       address_refcnt: Dict[int, Any] = {}
 
        def __init__(self, mapping: Mapping, size: int):
            self.mapping = mapping
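
MpiComm and MnnvlMemory are intentionally defined twice in this module: once as lightweight mocks when IS_BUILDING_DOCS is set, and once with the real MPI-backed implementations. mypy reports the second definition of each name as an error, so the redefinitions are acknowledged explicitly with type: ignore[no-redef]. A minimal sketch of the pattern (hypothetical names):

    BUILDING_DOCS = False

    if BUILDING_DOCS:

        class Memory:
            def size(self) -> int:
                return 0

    else:

        class Memory:  # type: ignore[no-redef]
            def size(self) -> int:
                return 4096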

flashinfer/comm/trtllm_ar.py

Lines changed: 9 additions & 6 deletions
@@ -16,6 +16,7 @@
 
 import functools
 import logging
+from ctypes import c_void_p
 from dataclasses import dataclass
 from types import SimpleNamespace
 from typing import Any, Dict, List, Optional, Tuple, Union
@@ -417,7 +418,7 @@ def trtllm_create_ipc_workspace_for_all_reduce(
     max_token_num: int,
     hidden_dim,
     group: Optional[ProcessGroup] = None,
-) -> List[int]:
+) -> List[List[int]]:
     """
     Parameters:
     - rank: the rank of the current process.
@@ -492,7 +493,7 @@ def trtllm_create_ipc_workspace_for_all_reduce(
 
 
 def trtllm_destroy_ipc_workspace_for_all_reduce(
-    workspace: List[int], group: Optional[ProcessGroup] = None
+    workspace: List[List[int]], group: Optional[ProcessGroup] = None
 ) -> None:
     """
     Note:
@@ -518,7 +519,7 @@ def trtllm_create_ipc_workspace_for_all_reduce_fusion(
     hidden_dim,
     use_fp32_lamport: bool = False,
     group: Optional[ProcessGroup] = None,
-) -> List[int]:
+) -> Tuple[List[List[int]], torch.Tensor]:
     """
     Parameters:
     - tp_rank: the rank of the current process.
@@ -564,7 +565,7 @@ def trtllm_create_ipc_workspace_for_all_reduce_fusion(
     # we should init 3 buffers for all reduce fusion:
     # [buffer_size, flag_size, lamport_buffer_size]
 
-    ipc_handles = list()
+    ipc_handles: List[List[int]] = list()
     for size in [buffer_size, flag_size, lamport_buffer_size]:
         # todo(review): confirm we need this alignment
         # all sizes should be aligned to 1LU << 21 bytes (2MB)
@@ -609,7 +610,9 @@ def trtllm_create_ipc_workspace_for_all_reduce_fusion(
     cudart.cudaMemset(flag_ptr, 0, 5 * 4)
     # Set flag_ptr[3] = lamport_comm_size
     lamport_comm_size_bytes = lamport_comm_size.to_bytes(4, byteorder="little")
-    cudart.cudaMemcpy(flag_ptr.value + 3 * 4, lamport_comm_size_bytes, 4)
+    cudart.cudaMemcpy(
+        c_void_p(flag_ptr.value + 3 * 4), c_void_p(lamport_comm_size_bytes), 4
+    )
     print("set flag_ptr[3] = lamport_comm_size: ", lamport_comm_size)
     # add flag_ptr to workspace
     workspace.append(flag_ptr.value)
@@ -628,7 +631,7 @@ def trtllm_create_ipc_workspace_for_all_reduce_fusion(
 
 
 def trtllm_destroy_ipc_workspace_for_all_reduce_fusion(
-    workspace: List[int], group: Optional[ProcessGroup] = None
+    workspace: List[List[int]], group: Optional[ProcessGroup] = None
 ) -> None:
     """
     Parameters:
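
The signature changes record what the code already returned: each element appended to the workspace is itself a list of per-rank IPC pointers, so the element type is List[int] rather than int, and the fusion variant returns the metadata tensor alongside the handles. The cudaMemcpy change wraps the raw integer address in ctypes.c_void_p, presumably to match the pointer-like parameter types the ctypes-level CUDA binding declares. A small sketch of the wrapping (the address is illustrative):

    from ctypes import c_void_p

    flag_addr = 0x7F00DEAD0000         # device address held as a plain int
    ptr = c_void_p(flag_addr + 3 * 4)  # pointer-sized wrapper, offset applied
    print(hex(ptr.value))              # -> 0x7f00dead000c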

flashinfer/cudnn/prefill.py

Lines changed: 1 addition & 5 deletions
@@ -143,11 +143,7 @@ def _build_prefill_graph(
     if q.dim() == 3:
         h_qo, d_qk = q.shape[1], q.shape[2]
     elif q.dim() == 4:
-        h_qo, d_qk = (
-            q.shape[1],
-            q.shape[2],
-            q.shape[3],
-        )
+        h_qo, d_qk = q.shape[2], q.shape[3]
     else:
         raise ValueError(f"Invalid query tensor shape: {q.shape}")

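This hunk is the clearest payoff of the new hook: the 4-D branch unpacked a three-element tuple into two names, which mypy flags statically and which would have raised "ValueError: too many values to unpack" at runtime the first time a 4-D query reached it. Assuming the [batch, seq, heads, head_dim] layout the indices suggest, the corrected branch behaves like this sketch:

    import torch

    q = torch.zeros(2, 128, 8, 64)   # hypothetical [batch, seq, heads, head_dim]
    h_qo, d_qk = q.shape[2], q.shape[3]
    print(h_qo, d_qk)                # -> 8 64
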
flashinfer/decode.py

Lines changed: 12 additions & 12 deletions
@@ -1791,6 +1791,18 @@ def run(
 
 
 class TrtllmGenDecodeModule:
+    def __init__(self) -> None:
+        self._sm_count: Optional[int] = None
+        self._mod = trtllm_gen_fmha_module()
+        self._op = self._mod.build_and_load()
+        from flashinfer.jit.cubin_loader import (
+            setup_cubin_loader,
+            setup_metainfo_loader,
+        )
+
+        setup_cubin_loader(self._mod.get_library_path())
+        setup_metainfo_loader(self._mod.get_library_path())
+
     def _paged_run(
         self,
         query: torch.Tensor,
@@ -1836,18 +1848,6 @@ def _paged_run(
     def _plan(self, *args, **kwargs):
         pass
 
-    def __init__(self):
-        self._sm_count: Optional[int] = None
-        self._mod = trtllm_gen_fmha_module()
-        self._op = self._mod.build_and_load()
-        from flashinfer.jit.cubin_loader import (
-            setup_cubin_loader,
-            setup_metainfo_loader,
-        )
-
-        setup_cubin_loader(self._mod.get_library_path())
-        setup_metainfo_loader(self._mod.get_library_path())
-
 
 @functools.cache
 def get_trtllm_gen_decode_module(*args):
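
Moving __init__ to the head of TrtllmGenDecodeModule is conventional ordering, but the added -> None is what matters to the checker: by default mypy skips the body of a function that carries no annotations at all, so the previously unannotated __init__ went unchecked. A minimal sketch:

    class Example:
        # With no annotations mypy would skip this body entirely;
        # "-> None" opts it in to checking.
        def __init__(self) -> None:
            self._sm_count: int = 0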
