
Commit 3e8f69a

revert init upd
1 parent 2fe5331 commit 3e8f69a


12 files changed: +44, -19,904 lines

flashinfer/decode.py

Lines changed: 1 addition & 7 deletions
@@ -33,7 +33,6 @@
     get_batch_prefill_uri,
     get_single_decode_uri,
     setup_cubin_loader,
-    setup_metainfo_loader,
     trtllm_gen_fmha_module,
 )
 from .page import get_seq_lens
@@ -304,7 +303,6 @@ def get_trtllm_gen_fmha_module():
     mod = trtllm_gen_fmha_module()
     op = mod.build_and_load()
     setup_cubin_loader(mod.get_library_path())
-    setup_metainfo_loader(mod.get_library_path())
     return op


@@ -1833,13 +1831,9 @@ def __init__(self):
         self._sm_count: Optional[int] = None
         self._mod = trtllm_gen_fmha_module()
         self._op = self._mod.build_and_load()
-        from flashinfer.jit.cubin_loader import (
-            setup_cubin_loader,
-            setup_metainfo_loader,
-        )
+        from flashinfer.jit.cubin_loader import setup_cubin_loader

         setup_cubin_loader(self._mod.get_library_path())
-        setup_metainfo_loader(self._mod.get_library_path())


 @functools.cache
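Note: together with the matching prefill.py change below, this leaves a simpler load path for the trtllm-gen FMHA module: build the JIT spec, load it, and register only the cubin loader. A minimal sketch, assuming the same import block decode.py uses (the wrapper name here is hypothetical):

    # Sketch of the load path after this revert; load_trtllm_gen_fmha is a
    # hypothetical name, the calls mirror get_trtllm_gen_fmha_module() above.
    from flashinfer.jit import setup_cubin_loader, trtllm_gen_fmha_module  # assumed import path

    def load_trtllm_gen_fmha():
        mod = trtllm_gen_fmha_module()              # JIT spec for the trtllm-gen FMHA kernels
        op = mod.build_and_load()                   # compile (or reuse a cached build) and load
        setup_cubin_loader(mod.get_library_path())  # register only the cubin callback
        return op                                   # no setup_metainfo_loader call any more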

flashinfer/fused_moe.py

Lines changed: 17 additions & 0 deletions
@@ -39,6 +39,7 @@
 from .jit import JitSpec
 from .jit import env as jit_env
 from .jit import gen_jit_spec, setup_cubin_loader, sm100a_nvcc_flags
+from .jit.cubin_loader import get_cubin
 from .utils import _check_shape_dtype_device, register_custom_op, register_fake_op


@@ -773,6 +774,18 @@ def cutlass_fused_moe(


 def trtllm_gen_fused_moe_sm100_module() -> JitSpec:
+    hash = "6b93c394210c89dccef13833c89797f1b8f8aefb"
+    tllm_gen_commit = "ce8ce46"
+    tllm_gen_config_hash = "2dc78d9"
+    include_path = (
+        f"{hash}/batched_gemm-{tllm_gen_commit}-{tllm_gen_config_hash}/include"
+    )
+    metainfo = get_cubin(
+        f"{include_path}/flashinferMetaInfo",
+        "b24fd5e7ae6b20e903c866ecb1d4a68f238301ba9b76df6a536056f2059a0d56",
+        ".h",
+    )
+    assert metainfo, "KernelMetaInfo.h not found"
     return gen_jit_spec(
         "fused_moe_sm100",
         [
@@ -788,8 +801,12 @@ def trtllm_gen_fused_moe_sm100_module() -> JitSpec:
             "-DENABLE_BF16",
             "-DENABLE_FP8",
             "-DENABLE_FP4",
+            f'-DPIPELINE_HASH=\\"{hash}\\"',
+            f'-DTLLM_GEN_COMMIT=\\"{tllm_gen_commit}\\"',
+            f'-DTLLM_GEN_BATCHED_GEMM_CONFIG_HASH=\\"{tllm_gen_config_hash}\\"',
         ]
         + sm100a_nvcc_flags,
+        extra_include_paths=[jit_env.FLASHINFER_CACHE_DIR / "cubins" / include_path],
         extra_ldflags=["-lcuda"],
     )
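Note: the escaped -D flags added here turn the pinned hashes into C++ string-literal macros that BatchedGemmInterface.h concatenates into the runtime cubin path. A small sketch of how such a flag renders (the shell-escaping interpretation is an assumption; the value is the one pinned in this hunk):

    # Sketch: the doubled backslash keeps the quotes intact on the build
    # command line, so the macro expands to a quoted string on the C++ side.
    hash = "6b93c394210c89dccef13833c89797f1b8f8aefb"
    flag = f'-DPIPELINE_HASH=\\"{hash}\\"'
    print(flag)
    # -DPIPELINE_HASH=\"6b93c394210c89dccef13833c89797f1b8f8aefb\"
    # which lets BatchedGemmInterface.h build std::string(PIPELINE_HASH) + "/..."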

flashinfer/jit/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -68,7 +68,7 @@
 from .core import gen_jit_spec as gen_jit_spec
 from .core import sm90a_nvcc_flags as sm90a_nvcc_flags
 from .core import sm100a_nvcc_flags as sm100a_nvcc_flags
-from .cubin_loader import setup_cubin_loader, setup_metainfo_loader
+from .cubin_loader import setup_cubin_loader


 @functools.cache
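Note: after this one-line change the package-level re-export only carries the cubin loader, so any caller that previously imported setup_metainfo_loader from flashinfer.jit will now fail at import time. Quick check (sketch):

    from flashinfer.jit import setup_cubin_loader       # still re-exported
    # from flashinfer.jit import setup_metainfo_loader  # ImportError after this commit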

flashinfer/jit/attention/pytorch.py

Lines changed: 11 additions & 0 deletions
@@ -22,6 +22,7 @@

 from .. import env as jit_env
 from ..core import JitSpec, gen_jit_spec, logger, sm90a_nvcc_flags, sm100a_nvcc_flags
+from ..cubin_loader import get_cubin
 from ..utils import (
     dtype_map,
     filename_safe_dtype_map,
@@ -1487,13 +1488,23 @@ def gen_fmha_cutlass_sm100a_module(


 def trtllm_gen_fmha_module():
+    hash = "6b93c394210c89dccef13833c89797f1b8f8aefb"
+    include_path = f"{hash}/fmha/trtllm-gen/include"
+    metainfo = get_cubin(
+        f"{include_path}/flashInferMetaInfo",
+        "ba35dc13249cd09bf39eed43e785b088d329acaf81a3f940a615904b81bfa02f",
+        ".h",
+    )
+    assert metainfo, "flashInferMetaInfo.h not found"
     return gen_jit_spec(
         "fmha_gen",
         [
             jit_env.FLASHINFER_CSRC_DIR / "trtllm_fmha_runner.cu",
             jit_env.FLASHINFER_CSRC_DIR / "trtllm_fmha_kernel_launcher.cu",
         ],
+        extra_include_paths=[jit_env.FLASHINFER_CACHE_DIR / "cubins" / include_path],
         extra_ldflags=["-lcuda"],
+        extra_cuda_cflags=[f'-DPIPELINE_HASH=\\"{hash}\\"'],
     )
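Note: the FMHA module follows the same pattern as fused_moe.py above: the metainfo header is fetched into the cubin cache before the spec is built and its directory is added to the include path, so the removed metainfo callback is not needed at run time. A minimal usage sketch (importing from the file's own module path; whether it is re-exported elsewhere is not shown in this diff):

    from flashinfer.jit.attention.pytorch import trtllm_gen_fmha_module

    spec = trtllm_gen_fmha_module()  # downloads flashInferMetaInfo.h into the cubin cache first
    mod = spec.build_and_load()      # nvcc then finds the header via extra_include_paths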

flashinfer/jit/cubin_loader.py

Lines changed: 0 additions & 29 deletions
@@ -188,32 +188,3 @@ def get_cubin_callback(name, sha256):
     dll_cubin_handlers[dll_path] = cb

     _LIB.FlashInferSetCubinCallback(cb)
-
-
-dll_metainfo_handlers = {}
-
-
-def setup_metainfo_loader(dll_path: str):
-    if dll_path in dll_metainfo_handlers:
-        return
-
-    _LIB = ctypes.CDLL(dll_path)
-
-    # Define the correct callback type
-    CALLBACK_TYPE = ctypes.CFUNCTYPE(
-        None, ctypes.c_char_p, ctypes.c_char_p, ctypes.c_char_p
-    )
-
-    def get_metainfo_callback(name, sha256, extension):
-        metainfo = get_cubin(
-            name.decode("utf-8"), sha256.decode("utf-8"), extension.decode("utf-8")
-        )
-        _LIB.FlashInferSetCurrentMetaInfo(
-            convert_to_ctypes_char_p(metainfo), ctypes.c_int(len(metainfo))
-        )
-
-    # Create the callback and keep a reference to prevent GC
-    cb = CALLBACK_TYPE(get_metainfo_callback)
-    dll_metainfo_handlers[dll_path] = cb
-
-    _LIB.FlashInferSetMetaInfoCallback(cb)
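Note: the surviving setup_cubin_loader uses the same ctypes pattern as the code removed here: open the extension library, wrap a Python function in a CFUNCTYPE, keep a reference so it is not garbage-collected, and pass the function pointer to the library. A generic sketch of that pattern (register_callback is illustrative, not the real setup_cubin_loader):

    import ctypes

    _handlers = {}  # keep callback objects alive so they are not garbage-collected

    def register_callback(dll_path, symbol, py_func, argtypes):
        # Illustrative helper mirroring the structure of the removed setup_metainfo_loader.
        if dll_path in _handlers:
            return
        lib = ctypes.CDLL(dll_path)
        cb_type = ctypes.CFUNCTYPE(None, *argtypes)  # void callback with the given argument types
        cb = cb_type(py_func)
        _handlers[dll_path] = cb                     # prevent GC of the callback trampoline
        getattr(lib, symbol)(cb)                     # e.g. lib.FlashInferSetCubinCallback(cb)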

flashinfer/prefill.py

Lines changed: 0 additions & 3 deletions
@@ -31,7 +31,6 @@
     get_batch_prefill_uri,
     get_single_prefill_uri,
     setup_cubin_loader,
-    setup_metainfo_loader,
     trtllm_gen_fmha_module,
 )
 from .page import block_sparse_indices_to_vector_sparse_offsets, get_seq_lens
@@ -92,7 +91,6 @@ def get_trtllm_gen_prefill_module():
     mod = trtllm_gen_fmha_module()
     op = mod.build_and_load()
     setup_cubin_loader(mod.get_library_path())
-    setup_metainfo_loader(mod.get_library_path())

     def _paged_run(
         query: torch.Tensor,
@@ -2946,7 +2944,6 @@ def get_trtllm_gen_fmha_module():
     mod = trtllm_gen_fmha_module()
     op = mod.build_and_load()
     setup_cubin_loader(mod.get_library_path())
-    setup_metainfo_loader(mod.get_library_path())
     return op

include/flashinfer/cubin_loader.h

Lines changed: 0 additions & 28 deletions
@@ -56,31 +56,3 @@ std::string getCubin(const std::string& name, const std::string& sha256) {
   callbackGetCubin(name.c_str(), sha256.c_str());
   return current_cubin;
 }
-
-void (*callbackGetMetaInfo)(const char* path, const char* sha256, const char* extension) = nullptr;
-
-// Set the python callback, called by the python code using ctypes.
-extern "C" void FlashInferSetMetaInfoCallback(void (*callback)(const char* path, const char* sha256,
-                                                               const char* extension)) {
-  callbackGetMetaInfo = callback;
-}
-
-// Thread-local variable that stores the current metainfo.
-// It is reset on every call to `getMetaInfo()`.
-thread_local std::string raw_metainfo;
-
-// Called by the callback to set the current metainfo.
-extern "C" void FlashInferSetCurrentMetaInfo(const char* binary, int size) {
-  raw_metainfo = std::string(binary, size);
-}
-
-// Get the metainfo from the python callback.
-// This is the API for the native library to use.
-std::string getMetaInfo(const std::string& name, const std::string& sha256,
-                        const std::string& extension) {
-  if (!callbackGetMetaInfo) {
-    throw std::runtime_error("FlashInferSetMetaInfoCallback not set");
-  }
-  callbackGetMetaInfo(name.c_str(), sha256.c_str(), extension.c_str());
-  return raw_metainfo;
-}

include/flashinfer/trtllm/batched_gemm/trtllmGen_bmm_export/BatchedGemmInterface.h

Lines changed: 4 additions & 4 deletions
@@ -24,7 +24,7 @@
 #include "trtllm/gen/CudaKernelLauncher.h"

 #ifdef TLLM_GEN_EXPORT_INTERFACE
-#include "KernelMetaInfo.h"
+#include "flashinferMetaInfo.h"
 #endif  // TLLM_GEN_EXPORT_INTERFACE

 namespace flashinfer::trtllm_cubin_loader {
@@ -466,7 +466,8 @@ BatchedGemmConfig const* BatchedGemmInterface::getBatchedGemmConfigs() const {

 size_t BatchedGemmInterface::getNumBatchedGemmConfigs() const {
 #ifdef TLLM_GEN_EXPORT_INTERFACE
-  return tensorrt_llm::kernels::tllmGenBatchedGemmListLen;
+  return sizeof(tensorrt_llm::kernels::tllmGenBatchedGemmList) /
+         sizeof(tensorrt_llm::kernels::tllmGenBatchedGemmList[0]);
 #else
   return 0;
 #endif
@@ -645,8 +646,7 @@ int32_t BatchedGemmInterface::run(BatchedGemmConfig const& config, void* workspa

   auto fiModuleLoadData = [&](CUmodule* module) {
     const std::string sha256 = config.mHash ? config.mHash : "";
-    const std::string pipeline_hash = "991e7438224199de85ef08a2730ce18c12b4e0aa";
-    const std::string cubin_path = pipeline_hash + "/" + std::string("batched_gemm-") +
+    const std::string cubin_path = std::string(PIPELINE_HASH) + "/" + std::string("batched_gemm-") +
                                    TLLM_GEN_COMMIT + "-" + TLLM_GEN_BATCHED_GEMM_CONFIG_HASH + "/";
     std::string fname_cubin = config.mFunctionName;
     if (!fname_cubin.empty()) {

include/flashinfer/trtllm/batched_gemm/trtllmGen_bmm_export/BatchedGemmOptions.h

Lines changed: 0 additions & 2 deletions
@@ -302,8 +302,6 @@ struct BatchedGemmConfig {
   // defined. In this case, the cubins will be loaded from the provided data and function name.
   // Otherwise, the kernel will be loaded from the CudaRunner.
 #ifdef TLLM_GEN_EXPORT_INTERFACE
-  uint8_t const* mData{nullptr};
-  uint32_t const mSize{0};
   uint32_t const mSharedMemSize{0};
   char const* mFunctionName{nullptr};
   uint32_t const mNumThreadsPerCTA{0};
