[cutlass backend] Cache config generation locally and remotely (pytorch#154686)

henrylhtsang · pytorchmergebot · commit a4b0023f3b3c · 2025-05-30T05:40:46.000Z
Summary: Cache the JSON list of configs, both locally and remotely. There is probably more work to do: * preset * filelock (?) * for cases where we generate from scratch, save the result locally as well (?) Test Plan: tested offline Reviewed By: coconutruben Differential Revision: D75334439 Pull Request resolved: pytorch#154686 Approved by: https://github.com/coconutruben, https://github.com/ColinPeppler
diff --git a/torch/_inductor/codegen/cuda/cutlass_cache.py b/torch/_inductor/codegen/cuda/cutlass_cache.py
@@ -0,0 +1,93 @@
+# mypy: allow-untyped-defs
+import functools
+import hashlib
+import json
+import logging
+import os
+import time
+from typing import Any, Optional
+
+import torch._inductor.config as config
+from torch._inductor.codecache import cutlass_key
+from torch._inductor.codegen.cuda.cuda_env import get_cuda_arch, get_cuda_version
+from torch._inductor.codegen.cuda.serialization import get_cutlass_operation_serializer
+from torch._inductor.runtime.cache_dir_utils import cache_dir
+from torch._inductor.utils import clear_on_fresh_inductor_cache
+
+
+log = logging.getLogger(__name__)
+
+# Filename prefix for cached config files; see _generate_config_filename.
+CONFIG_PREFIX: str = "configs"
+
+
+def get_config_request_key(
+    arch: str,
+    cuda_version: str,
+    instantiation_level: str,
+) -> str:
+    """
+    Return a key for the full ops, based on cutlass key, arch, cuda version, and instantiation level.
+    """
+    hash_target = "-".join(
+        [
+            cutlass_key().decode(),
+            arch,
+            cuda_version,
+            instantiation_level,
+        ]
+    )
+    return hashlib.sha256(hash_target.encode("utf-8")).hexdigest()[0:8]
+
+
+def _generate_config_filename(request_key: str) -> str:
+    """
+    Generate a filename for the full ops.
+    """
+    return f"{CONFIG_PREFIX}_{request_key}.json"
+
+
+@clear_on_fresh_inductor_cache
+@functools.lru_cache(None)
+def maybe_fetch_ops() -> Optional[list[Any]]:
+    """
+    Fetch ops from databases.
+    """
+    if config.force_disable_caches:
+        return None
+
+    # setup
+    arch: str = get_cuda_arch()
+    # get_cuda_version might return "12.4.0" or "12.4"
+    # but we want to use "12.4"
+    version: str = ".".join(get_cuda_version().split(".")[:2])
+    instantiation_level: str = config.cuda.cutlass_instantiation_level
+
+    # filename and filepath
+    request_key: str = get_config_request_key(arch, version, instantiation_level)
+    filename: str = _generate_config_filename(request_key)
+    filepath: str = os.path.join(cache_dir(), filename)
+
+    # try fetch
+    serialized_ops: Optional[list[str]] = None
+    start_time = time.time()
+    if os.path.isfile(filepath):
+        # locally
+        with open(filepath) as f:
+            serialized_ops = json.load(f)
+    elif config.is_fbcode():
+        from torch._inductor.fb.cutlass_remote_cache import (
+            maybe_fetch_cutlass_configs_from_remote,
+        )
+
+        # from remote
+        serialized_ops = maybe_fetch_cutlass_configs_from_remote(filepath)
+
+    if serialized_ops is None:
+        return None
+
+    # deserialize
+    serializer = get_cutlass_operation_serializer()
+    full_ops = [serializer.deserialize(x) for x in serialized_ops]  # type: ignore[union-attr]
+    log.info("Loaded ops from %s cache in %.3fs", filename, time.time() - start_time)
+    return full_ops
diff --git a/torch/_inductor/codegen/cuda/gemm_template.py b/torch/_inductor/codegen/cuda/gemm_template.py
@@ -10,6 +10,7 @@
 
 import torch
 import torch.utils._pytree as pytree
+from torch._inductor.codegen.cuda.cutlass_cache import maybe_fetch_ops
 from torch._inductor.scheduler import BaseSchedulerNode
 from torch._inductor.select_algorithm import create_inputs_key
 from torch._inductor.utils import clear_on_fresh_inductor_cache
@@ -930,8 +931,14 @@ def gen_ops(self) -> "list[tuple[str, cutlass_gemm_op.GemmOperation]]":  # type:
             log.debug("Using cached ops for %s", self.cache_key)
             return self.filtered_ops_cache[self.cache_key]
 
-        full_ops = cutlass_utils.gen_ops()
-        ops = pytree.tree_flatten(full_ops)[0]
+        maybe_ops = maybe_fetch_ops()
+        if maybe_ops is None:
+            log.debug("Cannot fetch ops from cache, generating ops from scratch")
+            full_ops = cutlass_utils.gen_ops()
+            ops = pytree.tree_flatten(full_ops)[0]
+        else:
+            log.debug("Using cached ops from cache")
+            ops = maybe_ops
 
         res: dict[str, cutlass_gemm_op.GemmOperation] = {}
         start_time = time.time()
diff --git a/torch/_inductor/codegen/cuda/serialization.py b/torch/_inductor/codegen/cuda/serialization.py
@@ -1,5 +1,6 @@
 # mypy: allow-untyped-defs
 import enum
+import functools
 import json
 from enum import Enum
 from typing import Optional
@@ -458,6 +459,7 @@ def _json_to_enum(cls, json_dict, enum_class):
         return enum_class[json_dict["name"]]
 
 
+@functools.lru_cache(1)
 def get_cutlass_operation_serializer() -> Optional[CUTLASSOperationSerializer]:
     if not try_import_cutlass():
         return None