Skip to content

Commit d517373

Browse files
cyx-6 and yzh119 authored
misc: Artifact downloading and single sourced artifact path (#1369)
<!-- .github/pull_request_template.md --> ## 📌 Description This PR adds the features of downloading complete artifacts and makes artifacts path single sourced in python. cc: @yyihuang @zhyncs <!-- What does this PR do? Briefly describe the changes and why they’re needed. --> ## 🔍 Related Issues <!-- Link any related issues here --> ## 🚀 Pull Request Checklist Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete. ### ✅ Pre-commit Checks - [ ] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method). - [ ] I have installed the hooks with `pre-commit install`. - [ ] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues. > If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/). ## 🧪 Tests - [ ] Tests have been added or updated as needed. - [ ] All tests are passing (`unittest`, etc.). ## Reviewer Notes <!-- Optional: anything you'd like reviewers to focus on, concerns, etc. --> --------- Co-authored-by: Yaxing Cai <[email protected]> Co-authored-by: Zihao Ye <[email protected]>
1 parent caf7d10 commit d517373

File tree

10 files changed

+203
-47
lines changed

10 files changed

+203
-47
lines changed

csrc/cudnn_sdpa_kernel_launcher.cu

Lines changed: 15 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,12 @@
2929
#include "cudnn_sdpa_utils.h"
3030
#include "pytorch_extension_utils.h"
3131

32+
#ifdef CUDNN_SDPA_CUBIN_PATH
33+
static const std::string cudnn_sdpa_cubin_path = std::string(CUDNN_SDPA_CUBIN_PATH);
34+
#else
35+
static_assert(false, "CUDNN_SDPA_CUBIN_PATH macro is not defined when compiling");
36+
#endif
37+
3238
namespace flashinfer {
3339

3440
namespace cudnn_sdpa_kernel_launcher {
@@ -77,19 +83,15 @@ enum PrefillType {
7783
};
7884

7985
// Populate `cubin_map` with the three cuDNN SDPA kernels (prefill, decode,
// and the d192 DeepSeek prefill variant). Each entry is fetched through
// getCubin() using the build-time cubin path plus the kernel's pinned
// SHA-256 checksum.
void init_cudnn_cubin(std::map<KernelType, std::string>& cubin_map) {
  // NOTE(review): checksums pin exact artifact contents — update them in
  // lockstep with CUDNN_SDPA_CUBIN_PATH when the artifacts are regenerated.
  const auto fetch = [](const char* kernel_name, const char* sha256) {
    return getCubin(cudnn_sdpa_cubin_path + kernel_name, sha256);
  };

  cubin_map[PREFILL] = fetch("cudnn_sm100_fprop_sdpa_prefill_d128_bf16",
                             "ff14e8dcfc04d9b3a912dd44056be37d9aa8a85976e0070494ca0cce0524f2a1");
  cubin_map[DECODE] = fetch("cudnn_sm100_fprop_sdpa_decode_d128_bf16",
                            "e7ce0408b4c3a36c42616498228534ee64cab785ef570af5741deaf9dd1b475c");
  cubin_map[PREFILL_DEEPSEEK] =
      fetch("cudnn_sm100_fprop_sdpa_prefill_d192_bf16",
            "2190967b8733e193cdcecc054eeb7c2907080a158a33fe7ba2004523a4aff6f9");
}
9496

9597
auto get_cudnn_cubin(KernelType kernel_type) -> std::string {

flashinfer/__main__.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
"""
2+
Copyright (c) 2025 by FlashInfer team.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
"""
16+
17+
# flashinfer-cli
import argparse

from .artifacts import download_artifacts

if __name__ == "__main__":
    # Minimal CLI entry point: `python -m flashinfer --download-cubin`
    # pre-fetches every known cubin artifact into the local cache.
    cli = argparse.ArgumentParser("FlashInfer CLI")
    cli.add_argument(
        "--download-cubin", action="store_true", help="Download artifacts"
    )

    options = cli.parse_args()

    if options.download_cubin:
        succeeded = download_artifacts()
        message = (
            "✅ All cubin download tasks completed successfully."
            if succeeded
            else "❌ Some cubin download tasks failed."
        )
        print(message)

flashinfer/artifacts.py

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
"""
2+
Copyright (c) 2025 by FlashInfer team.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
"""
16+
17+
import os
18+
import re
19+
import time
20+
from concurrent.futures import ThreadPoolExecutor, as_completed
21+
22+
import requests
23+
24+
from .jit.core import logger
25+
from .jit.cubin_loader import FLASHINFER_CUBINS_REPOSITORY, get_cubin
26+
27+
28+
def get_available_cubin_files(source, retries=3, delay=5, timeout=10):
    """Scrape an artifact directory listing for downloadable ``.cubin`` files.

    Parameters
    ----------
    source : str
        URL of the HTML directory listing to scrape.
    retries : int
        Number of fetch attempts before giving up.
    delay : int
        Seconds to sleep between failed attempts.
    timeout : int
        Per-request timeout in seconds.

    Returns
    -------
    list[tuple[str, str]]
        ``(name, ".cubin")`` pairs for each cubin link found; an empty list
        if every attempt fails.
    """
    for attempt in range(1, retries + 1):
        try:
            response = requests.get(source, timeout=timeout)
            response.raise_for_status()
            # Capture the href body with a non-greedy character class.
            # The original greedy `.*` could swallow everything between the
            # first `<a href="` and the last `.cubin">` on a line, yielding
            # corrupt names when a line holds more than one link; the fixed
            # `h[9:-8]` slicing is replaced by the capture group.
            names = re.findall(r'<a href="([^"]*)\.cubin">', response.text)
            return [(name, ".cubin") for name in names]

        except requests.exceptions.RequestException as e:
            logger.warning(
                f"Fetching available files {source}: attempt {attempt} failed: {e}"
            )

            if attempt < retries:
                logger.info(f"Retrying in {delay} seconds...")
                time.sleep(delay)
            else:
                logger.error("Max retries reached. Fetch failed.")
                return []
48+
49+
50+
class ArtifactPath:
    """Single source of truth for the artifact-repository subdirectory of
    each kernel family's cubins (pinned by pipeline commit hash)."""

    TRTLLM_GEN_FMHA: str = "52e676342c67a3772e06f10b84600044c0c22b76/fmha/trtllm-gen/"
    TRTLLM_GEN_BMM: str = "991e7438224199de85ef08a2730ce18c12b4e0aa/batched_gemm-c603ed2-2dc78d9/"
    TRTLLM_GEN_GEMM: str = "fffd607babb0844f24225997409747ca38229333/gemm-c603ed2-f2b0c24/"
    CUDNN_SDPA: str = "4c623163877c8fef5751c9c7a59940cd2baae02e/fmha/cudnn/"
    DEEPGEMM: str = "d25901733420c7cddc1adf799b0d4639ed1e162f/deep-gemm/"
60+
61+
62+
class MetaInfoHash:
    """SHA-256 checksums for the metadata files fetched alongside cubins."""

    TRTLLM_GEN_FMHA: str = "8c5630020c0452fb1cd1ea7e3b8fdbb7bf94f71bd899ed5b704a490bdb4f7368"
    DEEPGEMM: str = "69aa277b7f3663ed929e73f9c57301792b8c594dac15a465b44a5d151b6a1d50"
67+
68+
69+
def download_artifacts() -> bool:
    """Download every known cubin artifact into the local cubin cache.

    Checksum verification is temporarily disabled (the directory listing
    carries no per-file hashes); the caller's original env setting is
    restored afterwards, even on error.

    Returns
    -------
    bool
        True iff every download task succeeded.
    """
    env_backup = os.environ.get("FLASHINFER_CUBIN_CHECKSUM_DISABLED", None)
    os.environ["FLASHINFER_CUBIN_CHECKSUM_DISABLED"] = "1"
    try:
        # Metadata header plus every cubin advertised by each kernel family's
        # listing page.
        cubin_files = [(ArtifactPath.TRTLLM_GEN_FMHA + "flashInferMetaInfo", ".h")]
        for kernel in [
            ArtifactPath.TRTLLM_GEN_FMHA,
            ArtifactPath.TRTLLM_GEN_BMM,
            ArtifactPath.TRTLLM_GEN_GEMM,
            ArtifactPath.DEEPGEMM,
        ]:
            cubin_files += [
                (kernel + name, extension)
                for name, extension in get_available_cubin_files(
                    FLASHINFER_CUBINS_REPOSITORY + "/" + kernel
                )
            ]
        # Context manager joins the worker threads; the original leaked the
        # executor (never shut down).
        with ThreadPoolExecutor(4) as pool:
            futures = [
                pool.submit(get_cubin, name, "", extension)
                for name, extension in cubin_files
            ]
            results = [future.result() for future in as_completed(futures)]
        return all(results)
    finally:
        # `is None` distinguishes "unset" from an empty-string value; the
        # original `if not env_backup` dropped a pre-existing "" setting.
        # try/finally also guarantees restoration if a download raises,
        # so checksum verification is never left disabled.
        if env_backup is None:
            os.environ.pop("FLASHINFER_CUBIN_CHECKSUM_DISABLED", None)
        else:
            os.environ["FLASHINFER_CUBIN_CHECKSUM_DISABLED"] = env_backup

flashinfer/deep_gemm.py

Lines changed: 13 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
import cuda.bindings.driver as cbd
3636
import torch
3737

38+
from .artifacts import ArtifactPath, MetaInfoHash
3839
from .cuda_utils import checkCudaErrors
3940
from .jit.cubin_loader import get_cubin
4041
from .jit.env import FLASHINFER_CACHE_DIR
@@ -887,17 +888,17 @@ def launch(kernel: cbd.CUkernel, kwargs: Dict[str, Any]) -> cbd.CUresult:
887888
return cbd.cuLaunchKernelEx(config, kernel, (arg_values, arg_types), 0)
888889

889890

890-
_artifact_hash = "d25901733420c7cddc1adf799b0d4639ed1e162f"
891-
892-
893891
def load_all():
894892
for cubin_name in KERNEL_MAP:
895893
if cubin_name in RUNTIME_CACHE:
896894
continue
897895
symbol, sha256 = KERNEL_MAP[cubin_name]
898-
cubin_prefix = f"{_artifact_hash}/deep-gemm/"
899-
get_cubin(cubin_prefix + cubin_name, sha256)
900-
path = FLASHINFER_CACHE_DIR / "cubins" / f"{cubin_prefix + cubin_name}.cubin"
896+
get_cubin(ArtifactPath.DEEPGEMM + cubin_name, sha256)
897+
path = (
898+
FLASHINFER_CACHE_DIR
899+
/ "cubins"
900+
/ f"{ArtifactPath.DEEPGEMM + cubin_name}.cubin"
901+
)
901902
assert path.exists()
902903
RUNTIME_CACHE[cubin_name] = SM100FP8GemmRuntime(str(path), symbol)
903904

@@ -910,9 +911,10 @@ def load(name: str, code: str) -> SM100FP8GemmRuntime:
910911
if cubin_name in RUNTIME_CACHE:
911912
return RUNTIME_CACHE[cubin_name]
912913
symbol, sha256 = KERNEL_MAP[cubin_name]
913-
cubin_prefix = f"{_artifact_hash}/deep-gemm/"
914-
get_cubin(cubin_prefix + cubin_name, sha256)
915-
path = FLASHINFER_CACHE_DIR / "cubins" / f"{cubin_prefix + cubin_name}.cubin"
914+
get_cubin(ArtifactPath.DEEPGEMM + cubin_name, sha256)
915+
path = (
916+
FLASHINFER_CACHE_DIR / "cubins" / f"{ArtifactPath.DEEPGEMM + cubin_name}.cubin"
917+
)
916918
assert path.exists()
917919
RUNTIME_CACHE[cubin_name] = SM100FP8GemmRuntime(str(path), symbol)
918920
return RUNTIME_CACHE[cubin_name]
@@ -1436,8 +1438,7 @@ def __init__(self, sha256: str):
14361438
self.indice = None
14371439

14381440
def init_indices(self):
1439-
cubin_prefix = f"{_artifact_hash}/deep-gemm/"
1440-
indice_path = cubin_prefix + "kernel_map"
1441+
indice_path = ArtifactPath.DEEPGEMM + "kernel_map"
14411442
assert get_cubin(
14421443
indice_path, self.sha256, file_extension=".json"
14431444
), "cubin kernel map file not found, nor downloaded with matched sha256"
@@ -1458,6 +1459,4 @@ def __getitem__(self, key):
14581459
return self.indice[key]
14591460

14601461

1461-
KERNEL_MAP = KernelMap(
1462-
"69aa277b7f3663ed929e73f9c57301792b8c594dac15a465b44a5d151b6a1d50"
1463-
)
1462+
KERNEL_MAP = KernelMap(MetaInfoHash.DEEPGEMM)

flashinfer/fused_moe/core.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121

2222
import torch
2323

24+
from ..artifacts import ArtifactPath
2425
from ..autotuner import (
2526
AutoTuner,
2627
ConstraintSpec,
@@ -712,6 +713,7 @@ def trtllm_gen_fused_moe_sm100_module() -> JitSpec:
712713
"-DENABLE_BF16",
713714
"-DENABLE_FP8",
714715
"-DENABLE_FP4",
716+
f'-DTLLM_GEN_BMM_CUBIN_PATH=\\"{ArtifactPath.TRTLLM_GEN_BMM}\\"',
715717
]
716718
+ sm100a_nvcc_flags,
717719
extra_ldflags=["-lcuda"],

flashinfer/gemm.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
import torch
2626
import torch.nn.functional as F
2727

28+
from .artifacts import ArtifactPath
2829
from .autotuner import (
2930
AutoTuner,
3031
ConstraintSpec,
@@ -309,6 +310,7 @@ def trtllm_gemm_gen_module() -> JitSpec:
309310
extra_cuda_cflags=[
310311
"-DTLLM_GEN_EXPORT_INTERFACE",
311312
"-DTLLM_ENABLE_CUDA",
313+
f'-DTLLM_GEN_GEMM_CUBIN_PATH=\\"{ArtifactPath.TRTLLM_GEN_GEMM}\\"',
312314
]
313315
+ sm100a_nvcc_flags,
314316
extra_ldflags=["-lcuda"],

flashinfer/jit/attention/pytorch.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
import jinja2
2121
import torch
2222

23+
from ...artifacts import ArtifactPath, MetaInfoHash
2324
from .. import env as jit_env
2425
from ..core import JitSpec, gen_jit_spec, logger, sm90a_nvcc_flags, sm100a_nvcc_flags
2526
from ..utils import (
@@ -1494,6 +1495,10 @@ def trtllm_gen_fmha_module():
14941495
jit_env.FLASHINFER_CSRC_DIR / "trtllm_fmha_kernel_launcher.cu",
14951496
],
14961497
extra_ldflags=["-lcuda"],
1498+
extra_cuda_cflags=[
1499+
f'-DTLLM_GEN_FMHA_CUBIN_PATH=\\"{ArtifactPath.TRTLLM_GEN_FMHA}\\"',
1500+
f'-DTLLM_GEN_FMHA_METAINFO_HASH=\\"{MetaInfoHash.TRTLLM_GEN_FMHA}\\"',
1501+
],
14971502
)
14981503

14991504

@@ -1593,4 +1598,7 @@ def cudnn_fmha_gen_module():
15931598
"fmha_cudnn_gen",
15941599
[jit_env.FLASHINFER_CSRC_DIR / "cudnn_sdpa_kernel_launcher.cu"],
15951600
extra_ldflags=["-lcuda"],
1601+
extra_cuda_cflags=[
1602+
f'-DCUDNN_SDPA_CUBIN_PATH=\\"{ArtifactPath.CUDNN_SDPA}\\"',
1603+
],
15961604
)

include/flashinfer/trtllm/batched_gemm/trtllmGen_bmm_export/BatchedGemmInterface.h

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,12 @@
2727
#include "KernelMetaInfo.h"
2828
#endif // TLLM_GEN_EXPORT_INTERFACE
2929

30+
#ifdef TLLM_GEN_BMM_CUBIN_PATH
31+
static const std::string tllm_gen_bmm_cubin_path = std::string(TLLM_GEN_BMM_CUBIN_PATH);
32+
#else
33+
static_assert(false, "TLLM_GEN_BMM_CUBIN_PATH macro is not defined when compiling");
34+
#endif
35+
3036
namespace flashinfer::trtllm_cubin_loader {
3137
std::string getCubin(const std::string& kernelName, const std::string& sha256);
3238
}
@@ -645,14 +651,11 @@ int32_t BatchedGemmInterface::run(BatchedGemmConfig const& config, void* workspa
645651

646652
auto fiModuleLoadData = [&](CUmodule* module) {
647653
const std::string sha256 = config.mHash ? config.mHash : "";
648-
const std::string pipeline_hash = "991e7438224199de85ef08a2730ce18c12b4e0aa";
649-
const std::string cubin_path = pipeline_hash + "/" + std::string("batched_gemm-") +
650-
TLLM_GEN_COMMIT + "-" + TLLM_GEN_BATCHED_GEMM_CONFIG_HASH + "/";
651654
std::string fname_cubin = config.mFunctionName;
652655
if (!fname_cubin.empty()) {
653656
fname_cubin[0] = static_cast<char>(std::toupper(static_cast<unsigned char>(fname_cubin[0])));
654657
}
655-
fname_cubin = cubin_path + fname_cubin;
658+
fname_cubin = tllm_gen_bmm_cubin_path + fname_cubin;
656659
std::string cubin = flashinfer::trtllm_cubin_loader::getCubin(fname_cubin, sha256);
657660
cuModuleLoadData(&cuModule, cubin.c_str());
658661
};

include/flashinfer/trtllm/fmha/fmhaKernels.cuh

Lines changed: 15 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,18 @@
3131
#include "fmhaRunnerParams.h"
3232
#include "kernelParams.h"
3333

34+
#ifdef TLLM_GEN_FMHA_CUBIN_PATH
35+
static const std::string tllm_gen_fmha_cubin_path = std::string(TLLM_GEN_FMHA_CUBIN_PATH);
36+
#else
37+
static_assert(false, "TLLM_GEN_FMHA_CUBIN_PATH macro is not defined when compiling");
38+
#endif
39+
40+
#ifdef TLLM_GEN_FMHA_METAINFO_HASH
41+
static const std::string tllm_gen_fmha_metainfo_hash = std::string(TLLM_GEN_FMHA_METAINFO_HASH);
42+
#else
43+
static_assert(false, "TLLM_GEN_FMHA_METAINFO_HASH macro is not defined when compiling");
44+
#endif
45+
3446
namespace flashinfer::trtllm_cubin_loader {
3547
std::string getCubin(const std::string& kernelName, const std::string& sha256);
3648
std::string getMetaInfo(const std::string& name, const std::string& sha256,
@@ -234,14 +246,6 @@ class TllmGenFmhaKernel {
234246
}
235247
}
236248

237-
static std::string getCubinPath() {
238-
const char* env_hash = std::getenv("FLASHINFER_CUBIN_ARTIFACTORY_HASH");
239-
std::string hash =
240-
env_hash ? std::string(env_hash) : "52e676342c67a3772e06f10b84600044c0c22b76";
241-
std::string cubin_path = hash + "/fmha/trtllm-gen/";
242-
return cubin_path;
243-
}
244-
245249
private:
246250
// Is it MLA generation kernel ?
247251
inline bool isMlaGenKernel(RunnerParams const& params) const {
@@ -539,7 +543,7 @@ class TllmGenFmhaKernel {
539543
};
540544
if (findModuleIter == mModules.end()) {
541545
// Load the module.
542-
std::string cubin_path = TllmGenFmhaKernel::getCubinPath() + kernelMeta.mFuncName;
546+
std::string cubin_path = tllm_gen_fmha_cubin_path + kernelMeta.mFuncName;
543547
std::string cubin = getCubin(cubin_path, kernelMeta.sha256);
544548
if (cubin.empty()) {
545549
throw std::runtime_error("Failed to load cubin for " + kernelName);
@@ -593,9 +597,8 @@ class TllmFmhaKernelFactory {
593597
std::lock_guard<std::mutex> lg(s_mutex);
594598

595599
if (!metainfo_loaded) {
596-
std::string metainfo_raw =
597-
getMetaInfo(TllmGenFmhaKernel::getCubinPath() + "flashInferMetaInfo",
598-
"8c5630020c0452fb1cd1ea7e3b8fdbb7bf94f71bd899ed5b704a490bdb4f7368", ".h");
600+
std::string metainfo_raw = getMetaInfo(tllm_gen_fmha_cubin_path + "flashInferMetaInfo",
601+
tllm_gen_fmha_metainfo_hash, ".h");
599602
metainfo = KernelType::KernelMeta::loadFromMetaInfoRaw(metainfo_raw);
600603
metainfo_loaded = true;
601604
}

0 commit comments

Comments
 (0)