move module generation function to jit module

hypdeb · hypdeb · commit 25abf2024f9f · 2025-10-04T10:50:19.000Z
diff --git a/flashinfer/jit/gemm/core.py b/flashinfer/jit/gemm/core.py
@@ -493,3 +493,31 @@ def gen_gemm_sm90_module() -> JitSpec:
         source_paths,
         extra_cuda_cflags=sm90a_nvcc_flags,
     )
+
+
+def gen_trtllm_low_latency_gemm_module() -> JitSpec:
+    include_path = f"{ArtifactPath.TRTLLM_GEN_GEMM}/include"
+    header_name = "flashinferMetaInfo"
+
+    # use `get_cubin` to get "flashinferMetaInfo.h"
+    metainfo = get_cubin(
+        f"{include_path}/{header_name}.h",
+        MetaInfoHash.TRTLLM_GEN_GEMM,
+    )
+    # make sure "flashinferMetaInfo.h" is downloaded or cached
+    assert metainfo, f"{header_name}.h not found"
+    return gen_jit_spec(
+        "trtllm_gemm",
+        [
+            jit_env.FLASHINFER_CSRC_DIR / "trtllm_low_latency_gemm_runner.cu",
+        ],
+        extra_cuda_cflags=[
+            "-DTLLM_GEN_EXPORT_INTERFACE",
+            "-DTLLM_ENABLE_CUDA",
+            f'-DTLLM_GEN_GEMM_CUBIN_PATH=\\"{ArtifactPath.TRTLLM_GEN_GEMM}\\"',
+        ]
+        + sm100a_nvcc_flags,
+        # link "include" sub-directory in cache
+        extra_include_paths=[jit_env.FLASHINFER_CUBIN_DIR / include_path],
+        extra_ldflags=["-lcuda"],
+    )
diff --git a/flashinfer/trtllm_low_latency_gemm.py b/flashinfer/trtllm_low_latency_gemm.py
@@ -23,9 +23,9 @@
     convert_to_block_layout,
     get_w2_permute_indices_with_cache,
 )
+from flashinfer.jit.gemm.core import gen_trtllm_low_latency_gemm_module
 import torch
 
-from flashinfer.artifacts import ArtifactPath, MetaInfoHash
 from flashinfer.autotuner import (
     AutoTuner,
     TuningConfig,
@@ -38,40 +38,10 @@
     get_last_power_of_2_num_tokens_buckets,
     last_positive_power_of_2,
 )
-from flashinfer.jit import setup_cubin_loader, JitSpec, gen_jit_spec, sm100a_nvcc_flags
-from flashinfer.jit import env as jit_env
-from flashinfer.jit.cubin_loader import get_cubin
+from flashinfer.jit import setup_cubin_loader
 from flashinfer.utils import _get_cache_buf
 
 
-def gen_trtllm_low_latency_gemm_module() -> JitSpec:
-    include_path = f"{ArtifactPath.TRTLLM_GEN_GEMM}/include"
-    header_name = "flashinferMetaInfo"
-
-    # use `get_cubin` to get "flashinferMetaInfo.h"
-    metainfo = get_cubin(
-        f"{include_path}/{header_name}.h",
-        MetaInfoHash.TRTLLM_GEN_GEMM,
-    )
-    # make sure "flashinferMetaInfo.h" is downloaded or cached
-    assert metainfo, f"{header_name}.h not found"
-    return gen_jit_spec(
-        "trtllm_gemm",
-        [
-            jit_env.FLASHINFER_CSRC_DIR / "trtllm_low_latency_gemm_runner.cu",
-        ],
-        extra_cuda_cflags=[
-            "-DTLLM_GEN_EXPORT_INTERFACE",
-            "-DTLLM_ENABLE_CUDA",
-            f'-DTLLM_GEN_GEMM_CUBIN_PATH=\\"{ArtifactPath.TRTLLM_GEN_GEMM}\\"',
-        ]
-        + sm100a_nvcc_flags,
-        # link "include" sub-directory in cache
-        extra_include_paths=[jit_env.FLASHINFER_CUBIN_DIR / include_path],
-        extra_ldflags=["-lcuda"],
-    )
-
-
 @functools.cache
 def get_trtllm_low_latency_gemm_module():
     mod = gen_trtllm_low_latency_gemm_module()