
Commit 35f1e76

Reland of "[ROCm] change preferred blas lib defaults (pytorch#150249)" (pytorch#150707)
Revert "Revert "[ROCm] change preferred blas lib defaults (pytorch#150249)" (pytorch#150658)" This reverts commit 06c6a81.
Parent: a6321d6

7 files changed: +101 −11 lines

aten/src/ATen/BlasBackend.h

Lines changed: 3 additions & 1 deletion
@@ -7,10 +7,12 @@
 
 namespace at {
 
-enum class BlasBackend : int8_t { Cublas, Cublaslt, Ck };
+enum class BlasBackend : int8_t { Default, Cublas, Cublaslt, Ck };
 
 inline std::string BlasBackendToString(at::BlasBackend backend) {
   switch (backend) {
+    case BlasBackend::Default:
+      return "at::BlasBackend::Default";
     case BlasBackend::Cublas:
       return "at::BlasBackend::Cublas";
     case BlasBackend::Cublaslt:

aten/src/ATen/Context.cpp

Lines changed: 30 additions & 3 deletions
@@ -326,7 +326,34 @@ void Context::setLinalgPreferredBackend(at::LinalgBackend b) {
 }
 
 at::BlasBackend Context::blasPreferredBackend() {
+  // Rather than put logic for interpreting what Default means at every
+  // call site for blasPreferredBackend(), we set it to an actual value.
+  if (blas_preferred_backend == at::BlasBackend::Default) {
+    blas_preferred_backend = at::BlasBackend::Cublas;
 #ifdef USE_ROCM
+    // AMD Instinct targets prefer hipblaslt
+    static const bool hipblaslt_preferred = []() {
+      static const std::vector<std::string> archs = {
+          "gfx90a", "gfx942",
+#if ROCM_VERSION >= 60500
+          "gfx950"
+#endif
+      };
+      for (auto index: c10::irange(detail::getCUDAHooks().deviceCount())) {
+        if (!detail::getCUDAHooks().isGPUArch(index, archs)) {
+          return false;
+        }
+      }
+      return true;
+    }();
+    if (hipblaslt_preferred) {
+      blas_preferred_backend = at::BlasBackend::Cublaslt;
+    }
+#endif
+  }
+
+#ifdef USE_ROCM
+  // hipblaslt support for all archs is not as complete as hipblas
   if (blas_preferred_backend == at::BlasBackend::Cublaslt) {
     static const bool hipblaslt_unsupported = []() {
       static const std::vector<std::string> archs = {
@@ -338,7 +365,7 @@ at::BlasBackend Context::blasPreferredBackend() {
           "gfx950"
 #endif
       };
-      for (auto index: c10::irange(getNumGPUs())) {
+      for (auto index: c10::irange(detail::getCUDAHooks().deviceCount())) {
         if (!detail::getCUDAHooks().isGPUArch(index, archs)) {
           TORCH_WARN_ONCE(
             "Attempting to use hipBLASLt on an unsupported architecture! "
@@ -365,7 +392,7 @@ void Context::setBlasPreferredBackend(at::BlasBackend b) {
       "Cannot set preferred backend to cuBLASLt if PyTorch has not been compiled with cuBLASLt.");
   TORCH_CHECK((b != at::BlasBackend::Ck) || hasROCM(),
       "Cannot set preferred backend to Ck if PyTorch has not been compiled for ROCm.");
-  if (b != at::BlasBackend::Cublas) {
+  if (b != at::BlasBackend::Default && b != at::BlasBackend::Cublas) {
     TORCH_WARN_ONCE(
       "torch.backends.cuda.preferred_blas_library is an experimental feature. "
       "If you see any error or unexpected behavior when this flag is set "
@@ -391,7 +418,7 @@ void Context::setROCmFAPreferredBackend(at::ROCmFABackend b) {
     static const std::vector<std::string> archs = {
         "gfx90a", "gfx942"
     };
-    for (auto index: c10::irange(getNumGPUs())) {
+    for (auto index: c10::irange(detail::getCUDAHooks().deviceCount())) {
       if (!detail::getCUDAHooks().isGPUArch(index, archs)) {
         TORCH_WARN_ONCE(
             "Attempting to use CK on an unsupported architecture! Cannot set backend to CK");

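The Context.cpp change resolves Default lazily: the first call to blasPreferredBackend() rewrites Default to Cublas, and on ROCm upgrades it to Cublaslt only when every visible device is a supported Instinct architecture. A minimal Python sketch of that heuristic, for illustration only (resolve_default_blas_backend is a hypothetical helper; the real logic is the C++ above, and this sketch ignores the ROCM_VERSION >= 60500 gate on gfx950):

    import torch

    def resolve_default_blas_backend() -> str:
        """Sketch of how Context::blasPreferredBackend() resolves Default."""
        if torch.version.cuda:
            # CUDA builds always resolve Default to cuBLAS.
            return "cublas"
        # ROCm builds prefer hipBLASLt only if *every* visible GPU is a
        # supported Instinct architecture; otherwise fall back to hipBLAS.
        preferred = {"gfx90a", "gfx942", "gfx950"}
        for index in range(torch.cuda.device_count()):
            arch = torch.cuda.get_device_properties(index).gcnArchName.split(":", 1)[0]
            if arch not in preferred:
                return "cublas"
        return "cublaslt"
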
aten/src/ATen/Context.h

Lines changed: 5 additions & 7 deletions
@@ -446,17 +446,15 @@ class TORCH_API Context {
   bool allow_tf32_onednn = false;
   bool enabled_nnpack = true;
   at::LinalgBackend linalg_preferred_backend =
-      c10::utils::check_env("TORCH_LINALG_PREFER_CUSOLVER") == true
+      (c10::utils::check_env("TORCH_LINALG_PREFER_CUSOLVER") == true ||
+       c10::utils::check_env("TORCH_LINALG_PREFER_HIPSOLVER") == true) // alias
       ? at::LinalgBackend::Cusolver
       : at::LinalgBackend::Default;
   at::BlasBackend blas_preferred_backend =
-#ifdef USE_ROCM
-      (c10::utils::check_env("TORCH_BLAS_PREFER_HIPBLASLT") != false)
-#else
-      (c10::utils::check_env("TORCH_BLAS_PREFER_CUBLASLT") == true)
-#endif
+      (c10::utils::check_env("TORCH_BLAS_PREFER_CUBLASLT") == true ||
+       c10::utils::check_env("TORCH_BLAS_PREFER_HIPBLASLT") == true) // alias
       ? at::BlasBackend::Cublaslt
-      : at::BlasBackend::Cublas;
+      : at::BlasBackend::Default;
   at::ROCmFABackend rocm_fa_preferred_backend =
       c10::utils::check_env("TORCH_ROCM_FA_PREFER_CK") == true
       ? at::ROCmFABackend::Ck

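With these initializers, both environment variables act as aliases on either build, and the member starts out as Default unless one is set. A small usage sketch (the variable must be set before torch is imported, matching the subprocess check in the test below):

    import os

    # Must happen before importing torch, since Context reads the env var
    # when it is constructed. The HIPBLASLT spelling works the same way.
    os.environ["TORCH_BLAS_PREFER_CUBLASLT"] = "1"

    import torch
    print(torch.backends.cuda.preferred_blas_library())  # _BlasBackend.Cublaslt
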
test/test_cuda.py

Lines changed: 58 additions & 0 deletions
@@ -586,6 +586,64 @@ def test_serialization_array_with_storage(self):
         q_copy[1].fill_(10)
         self.assertEqual(q_copy[3], torch.cuda.IntStorage(10).fill_(10))
 
+    @setBlasBackendsToDefaultFinally
+    def test_preferred_blas_library_settings(self):
+        def _check_default():
+            default = torch.backends.cuda.preferred_blas_library()
+            if torch.version.cuda:
+                # CUDA logic is easy, it's always cublas
+                self.assertTrue(default == torch._C._BlasBackend.Cublas)
+            else:
+                # ROCm logic is less so, it's cublaslt for some Instinct, cublas for all else
+                gcn_arch = str(
+                    torch.cuda.get_device_properties(0).gcnArchName.split(":", 1)[0]
+                )
+                if gcn_arch in ["gfx90a", "gfx942", "gfx950"]:
+                    self.assertTrue(default == torch._C._BlasBackend.Cublaslt)
+                else:
+                    self.assertTrue(default == torch._C._BlasBackend.Cublas)
+
+        _check_default()
+        # "Default" can be set but is immediately reset internally to the actual default value.
+        self.assertTrue(
+            torch.backends.cuda.preferred_blas_library("default")
+            != torch._C._BlasBackend.Default
+        )
+        _check_default()
+        self.assertTrue(
+            torch.backends.cuda.preferred_blas_library("cublas")
+            == torch._C._BlasBackend.Cublas
+        )
+        self.assertTrue(
+            torch.backends.cuda.preferred_blas_library("hipblas")
+            == torch._C._BlasBackend.Cublas
+        )
+        # check bad strings
+        with self.assertRaisesRegex(
+            RuntimeError,
+            "Unknown input value. Choose from: default, cublas, hipblas, cublaslt, hipblaslt, ck.",
+        ):
+            torch.backends.cuda.preferred_blas_library("unknown")
+        # check bad input type
+        with self.assertRaisesRegex(RuntimeError, "Unknown input value type."):
+            torch.backends.cuda.preferred_blas_library(1.0)
+        # check env var override
+        custom_envs = [
+            {"TORCH_BLAS_PREFER_CUBLASLT": "1"},
+            {"TORCH_BLAS_PREFER_HIPBLASLT": "1"},
+        ]
+        test_script = "import torch;print(torch.backends.cuda.preferred_blas_library())"
+        for env_config in custom_envs:
+            env = os.environ.copy()
+            for key, value in env_config.items():
+                env[key] = value
+            r = (
+                subprocess.check_output([sys.executable, "-c", test_script], env=env)
+                .decode("ascii")
+                .strip()
+            )
+            self.assertEqual("_BlasBackend.Cublaslt", r)
+
     @unittest.skipIf(TEST_CUDAMALLOCASYNC, "temporarily disabled for async")
     @setBlasBackendsToDefaultFinally
     def test_cublas_workspace_explicit_allocation(self):

torch/_C/__init__.pyi.in

Lines changed: 1 addition & 0 deletions
@@ -1309,6 +1309,7 @@ def _get_blas_preferred_backend() -> torch._C._BlasBackend: ...
 def _set_blas_preferred_backend(arg: torch._C._BlasBackend): ...
 
 class _BlasBackend:
+    Default: _BlasBackend
     Cublas: _BlasBackend
     Cublaslt: _BlasBackend
     Ck: _BlasBackend

torch/backends/cuda/__init__.py

Lines changed: 3 additions & 0 deletions
@@ -218,7 +218,9 @@ def preferred_linalg_library(
 
 
 _BlasBackends = {
+    "default": torch._C._BlasBackend.Default,
     "cublas": torch._C._BlasBackend.Cublas,
+    "hipblas": torch._C._BlasBackend.Cublas,  # alias
     "cublaslt": torch._C._BlasBackend.Cublaslt,
     "hipblaslt": torch._C._BlasBackend.Cublaslt,  # alias
     "ck": torch._C._BlasBackend.Ck,
@@ -241,6 +243,7 @@ def preferred_blas_library(
     * If `"cublas"` is set then cuBLAS will be used wherever possible.
     * If `"cublaslt"` is set then cuBLASLt will be used wherever possible.
     * If `"ck"` is set then CK will be used wherever possible.
+    * If `"default"` (the default) is set then heuristics will be used to pick between the other options.
     * When no input is given, this function returns the currently preferred library.
     * User may use the environment variable TORCH_BLAS_PREFER_CUBLASLT=1 to set the preferred library to cuBLASLt
       globally.

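For reference, a short example of the documented API surface; "default" can be passed in, but reads always come back as a concrete backend because the getter resolves Default internally:

    import torch

    # Read the current preference without changing it.
    current = torch.backends.cuda.preferred_blas_library()

    # "hipblas"/"hipblaslt" map to the same values as "cublas"/"cublaslt".
    torch.backends.cuda.preferred_blas_library("cublaslt")

    # Re-enable the heuristic; the next read returns Cublas or Cublaslt,
    # never Default.
    torch.backends.cuda.preferred_blas_library("default")
    print(torch.backends.cuda.preferred_blas_library())
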
torch/csrc/Module.cpp

Lines changed: 1 addition & 0 deletions
@@ -2243,6 +2243,7 @@ Call this whenever a new thread is created in order to propagate values from
   });
 
   py::enum_<at::BlasBackend>(py_module, "_BlasBackend")
+      .value("Default", at::BlasBackend::Default)
      .value("Cublas", at::BlasBackend::Cublas)
      .value("Cublaslt", at::BlasBackend::Cublaslt)
      .value("Ck", at::BlasBackend::Ck);

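Once the binding is registered, the new member is visible from Python like any other pybind11 enum value:

    import torch

    print(torch._C._BlasBackend.Default)            # _BlasBackend.Default
    print(list(torch._C._BlasBackend.__members__))  # ['Default', 'Cublas', 'Cublaslt', 'Ck']
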