Skip to content

Commit 8040918

Browse files
committed
setup.py: add compile flags for bf16 and fp8.
1 parent 9c8c42a commit 8040918

File tree

2 files changed: +26 −13 lines

csrc/permute.cu

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -523,7 +523,7 @@ std::tuple<torch::Tensor, torch::Tensor, std::vector<Tensor>> moe_permute_topK_o
523523

524524
break;
525525
}
526-
// #ifdef ENABLE_BF16
526+
#ifdef ENABLE_BF16
527527
case at::ScalarType::BFloat16:
528528
{
529529
using dType = cutlass::bfloat16_t;
@@ -545,8 +545,8 @@ std::tuple<torch::Tensor, torch::Tensor, std::vector<Tensor>> moe_permute_topK_o
545545

546546
break;
547547
}
548-
// #endif
549-
// #ifdef ENABLE_FP8
548+
#endif
549+
#ifdef ENABLE_FP8
550550
case at::ScalarType::Float8_e5m2:
551551
{
552552
using dType = cutlass::float_e5m2_t;
@@ -589,7 +589,7 @@ std::tuple<torch::Tensor, torch::Tensor, std::vector<Tensor>> moe_permute_topK_o
589589

590590
break;
591591
}
592-
// #endif
592+
#endif
593593
default:
594594
throw std::runtime_error("Wrong activation tensor type.");
595595
}
@@ -670,7 +670,7 @@ torch::Tensor moe_recover_topK_op(
670670

671671
break;
672672
}
673-
// #ifdef ENABLE_BF16
673+
#ifdef ENABLE_BF16
674674
case at::ScalarType::BFloat16:
675675
{
676676
using dType = cutlass::bfloat16_t;
@@ -692,8 +692,8 @@ torch::Tensor moe_recover_topK_op(
692692

693693
break;
694694
}
695-
// #endif
696-
// #ifdef ENABLE_FP8
695+
#endif
696+
#ifdef ENABLE_FP8
697697
case at::ScalarType::Float8_e5m2:
698698
{
699699
using dType = cutlass::float_e5m2_t;
@@ -736,7 +736,7 @@ torch::Tensor moe_recover_topK_op(
736736

737737
break;
738738
}
739-
// #endif
739+
#endif
740740
default:
741741
throw std::runtime_error("Wrong activation tensor type.");
742742
}
@@ -819,7 +819,7 @@ std::tuple<torch::Tensor, torch::Tensor> moe_recover_topK_bwd_op(
819819

820820
break;
821821
}
822-
// #ifdef ENABLE_BF16
822+
#ifdef ENABLE_BF16
823823
case at::ScalarType::BFloat16:
824824
{
825825
using dType = cutlass::bfloat16_t;
@@ -844,8 +844,8 @@ std::tuple<torch::Tensor, torch::Tensor> moe_recover_topK_bwd_op(
844844

845845
break;
846846
}
847-
// #endif
848-
// #ifdef ENABLE_FP8
847+
#endif
848+
#ifdef ENABLE_FP8
849849
case at::ScalarType::Float8_e5m2:
850850
{
851851
using dType = cutlass::float_e5m2_t;
@@ -894,7 +894,7 @@ std::tuple<torch::Tensor, torch::Tensor> moe_recover_topK_bwd_op(
894894

895895
break;
896896
}
897-
// #endif
897+
#endif
898898
default:
899899
throw std::runtime_error("Wrong activation tensor type.");
900900
}

setup.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,18 @@
55
from torch.utils.cpp_extension import BuildExtension, CUDAExtension
66

77

8-
if os.environ.get("TORCH_CUDA_ARCH_LIST"):
8+
# Supported NVIDIA GPU architectures.
9+
NVIDIA_SUPPORTED_ARCHS = {"7.0", "7.5", "8.0", "8.6", "8.9", "9.0"}
10+
11+
# TORCH_CUDA_ARCH_LIST can have one or more architectures,
12+
# e.g. "9.0" or "7.0 7.2 7.5 8.0 8.6 8.7 9.0+PTX". Here,
13+
# the "9.0+PTX" option asks the
14+
# compiler to additionally include PTX code that can be runtime-compiled
15+
# and executed on the 9.0 or newer architectures. While the PTX code will
16+
# not give the best performance on the newer architectures, it provides
17+
# forward compatibility.
18+
env_arch_list = os.environ.get("TORCH_CUDA_ARCH_LIST", None)
19+
if env_arch_list:
920
# Let PyTorch builder to choose device to target for.
1021
device_capability = ""
1122
else:
@@ -16,6 +27,8 @@
1627

1728
nvcc_flags = [
1829
"-std=c++17", # NOTE: CUTLASS requires c++17
30+
"-DENABLE_BF16", # Enable BF16 for cuda_version >= 11
31+
# "-DENABLE_FP8", # Enable FP8 for cuda_version >= 11.8
1932
]
2033

2134
if device_capability:

0 commit comments

Comments
 (0)