Commit f1e9755

Use the FFI API to access device ids; this allows us to reduce the CUDA requirement to 12.3.
1 parent e72fac9 commit f1e9755
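
The change in a nutshell: `cudaStreamGetDevice`, previously used to recover the device id from the stream, was only introduced in CUDA 12.8, whereas XLA's FFI can hand the device ordinal to the handler directly via `ffi::DeviceOrdinal`, which binds as an extra `int32_t` parameter. A minimal sketch of the pattern, with hypothetical handler and argument names (`ExampleImpl`, `x`, `y`) standing in for the real flash-attention signatures; `FFI_CUDA_CHECK` is the repo's own macro from `csrc/flash_attn/check.h`:

#include <cuda_runtime.h>
#include "xla/ffi/api/ffi.h"

namespace ffi = xla::ffi;

// The device ordinal now arrives through the FFI execution context as a
// plain int32_t, so only cudaDeviceGetAttribute (available long before
// CUDA 12.3) is needed to query device properties.
static ffi::Error ExampleImpl(cudaStream_t stream,
                              ffi::ScratchAllocator scratch,
                              int32_t device,  // bound by Ctx<ffi::DeviceOrdinal>
                              ffi::AnyBuffer x,
                              ffi::Result<ffi::AnyBuffer> y) {
  int major, minor;
  FFI_CUDA_CHECK(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device));
  FFI_CUDA_CHECK(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device));
  // ... launch kernels on `stream`, allocating workspace via `scratch` ...
  return ffi::Error::Success();
}

XLA_FFI_DEFINE_HANDLER(
    kExample, ExampleImpl,
    ffi::Ffi::Bind()
        .Ctx<ffi::PlatformStream<cudaStream_t>>()
        .Ctx<ffi::ScratchAllocator>()
        .Ctx<ffi::DeviceOrdinal>()  // replaces the cudaStreamGetDevice call
        .Arg<ffi::AnyBuffer>()
        .Ret<ffi::AnyBuffer>());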

File tree

7 files changed: +42 −20 lines

- README.md
- csrc/flash_attn/check.h
- csrc/flash_attn/flash_api.cpp
- csrc/flash_attn/mha_bwd.cpp
- csrc/flash_attn/mha_bwd.h
- csrc/flash_attn/mha_fwd.cpp
- csrc/flash_attn/mha_fwd.h

README.md

Lines changed: 1 addition & 1 deletion
@@ -9,7 +9,7 @@ Please cite (see below) and credit FlashAttention if you use it.
 ## Installation
 
 Requirements:
-- CUDA 12.8 and above.
+- CUDA 12.3 and above.
 - Linux. Same story as with the pytorch repo. I haven't tested compilation of the jax bindings on windows.
 - JAX >= `0.5.*`. The custom call api changed in this version.
 
csrc/flash_attn/check.h

Lines changed: 1 addition & 1 deletion
@@ -66,4 +66,4 @@ class CheckHelper {
   if (auto _opt = (expr); _opt.has_value()) \
     dest = _opt.value(); \
   else \
-    return CheckHelper(std::string(#expr))
+    return CheckHelper(std::string(#expr))

(The rendered text of the changed line is identical on both sides, so this is presumably a whitespace-only fix.)
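
This hunk only shows the tail of one helper in check.h, an assign-or-return macro built on CheckHelper; the `FFI_CUDA_CHECK` macro used throughout the handlers below is not visible here. As a rough idea of the pattern, a plausible shape for such a macro follows; this is an assumption, and the real definition in check.h may differ:

// Hypothetical sketch, not the actual check.h definition: evaluate a CUDA
// runtime call and, on failure, return an ffi::Error naming the expression.
#define FFI_CUDA_CHECK(expr)                                      \
  do {                                                            \
    if (cudaError_t _err = (expr); _err != cudaSuccess)           \
      return ffi::Error::Internal(std::string(#expr) + ": " +     \
                                  cudaGetErrorString(_err));      \
  } while (0)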

csrc/flash_attn/flash_api.cpp

Lines changed: 4 additions & 0 deletions
@@ -307,6 +307,7 @@ XLA_FFI_DEFINE_HANDLER(
     ffi::Ffi::Bind()
         .Ctx<ffi::PlatformStream<cudaStream_t>>()
         .Ctx<ffi::ScratchAllocator>()
+        .Ctx<ffi::DeviceOrdinal>()
         .Arg<ffi::AnyBuffer>()
         .Arg<ffi::AnyBuffer>()
         .Arg<ffi::AnyBuffer>()
@@ -323,6 +324,7 @@ XLA_FFI_DEFINE_HANDLER(
     ffi::Ffi::Bind()
         .Ctx<ffi::PlatformStream<cudaStream_t>>()
         .Ctx<ffi::ScratchAllocator>()
+        .Ctx<ffi::DeviceOrdinal>()
         .Arg<ffi::AnyBuffer>() // dout
         .Arg<ffi::AnyBuffer>() // q
         .Arg<ffi::AnyBuffer>() // k
@@ -343,6 +345,7 @@ XLA_FFI_DEFINE_HANDLER(
     ffi::Ffi::Bind()
         .Ctx<ffi::PlatformStream<cudaStream_t>>()
         .Ctx<ffi::ScratchAllocator>()
+        .Ctx<ffi::DeviceOrdinal>()
         .Arg<ffi::AnyBuffer>() // q
         .Arg<ffi::AnyBuffer>() // k
         .Arg<ffi::AnyBuffer>() // v
@@ -366,6 +369,7 @@ XLA_FFI_DEFINE_HANDLER(
     ffi::Ffi::Bind()
         .Ctx<ffi::PlatformStream<cudaStream_t>>()
         .Ctx<ffi::ScratchAllocator>()
+        .Ctx<ffi::DeviceOrdinal>()
         .Arg<ffi::AnyBuffer>() // dout
         .Arg<ffi::AnyBuffer>() // q
         .Arg<ffi::AnyBuffer>() // k
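
Note that each `.Ctx<ffi::DeviceOrdinal>()` is inserted between the `ScratchAllocator` context and the first buffer argument, mirroring the position of the new `int32_t device` parameter in the `*_impl` functions: XLA's FFI matches Ctx/Arg/Ret entries to implementation parameters in declaration order. A tiny illustration (names hypothetical):

// Binding order and parameter order must agree; ffi::DeviceOrdinal is
// decoded into the int32_t parameter at the position where it is bound.
static ffi::Error TinyImpl(cudaStream_t stream,  // .Ctx<ffi::PlatformStream<cudaStream_t>>()
                           int32_t device,       // .Ctx<ffi::DeviceOrdinal>()
                           ffi::AnyBuffer x) {   // .Arg<ffi::AnyBuffer>()
  (void)stream; (void)device; (void)x;
  return ffi::Error::Success();
}

XLA_FFI_DEFINE_HANDLER(kTiny, TinyImpl,
                       ffi::Ffi::Bind()
                           .Ctx<ffi::PlatformStream<cudaStream_t>>()
                           .Ctx<ffi::DeviceOrdinal>()
                           .Arg<ffi::AnyBuffer>());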

csrc/flash_attn/mha_bwd.cpp

Lines changed: 5 additions & 5 deletions
@@ -110,6 +110,7 @@ void run_mha_bwd(Flash_bwd_params &params, cudaStream_t stream) {
 }
 
 ffi::Error mha_bwd_impl(cudaStream_t stream, ffi::ScratchAllocator scratch,
+                        int32_t device,
                         ffi::AnyBuffer dout, // batch_size x seqlen_q x num_heads x head_size_og
                         ffi::AnyBuffer q,    // batch_size x seqlen_q x num_heads x head_size
                         ffi::AnyBuffer k,    // batch_size x seqlen_k x num_heads_k x head_size
@@ -121,9 +122,8 @@ ffi::Error mha_bwd_impl(cudaStream_t stream, ffi::ScratchAllocator scratch,
                         ffi::Result<ffi::AnyBuffer> dv, // batch_size x seqlen_k x num_heads_k x head_size
                         double softmax_scale, bool is_causal,
                         int64_t window_size_left, int64_t window_size_right) {
-    int device, major, minor, sm_count;
-    FFI_CUDA_CHECK(cudaStreamGetDevice(stream, &device));
-    FFI_CUDA_CHECK(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device));
+    int major, minor, sm_count;
+    FFI_CUDA_CHECK(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device));
     FFI_CUDA_CHECK(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device));
     FFI_CUDA_CHECK(cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device));
 
@@ -249,6 +249,7 @@ ffi::Error
 mha_varlen_bwd_impl(
     cudaStream_t stream,
     ffi::ScratchAllocator scratch,
+    int32_t device,
     ffi::AnyBuffer dout, // total_q x num_heads, x head_size
     ffi::AnyBuffer q,    // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i
     ffi::AnyBuffer k,    // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i
@@ -270,8 +271,7 @@
     bool deterministic) {
 
     if (is_causal) { window_size_right = 0; }
-    int device, major, minor, sm_count;
-    FFI_CUDA_CHECK(cudaStreamGetDevice(stream, &device));
+    int major, minor, sm_count;
     FFI_CUDA_CHECK(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device));
     FFI_CUDA_CHECK(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device));
     FFI_CUDA_CHECK(cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device));

csrc/flash_attn/mha_bwd.h

Lines changed: 2 additions & 0 deletions
@@ -10,6 +10,7 @@
 namespace ffi = xla::ffi;
 
 ffi::Error mha_bwd_impl(cudaStream_t stream, ffi::ScratchAllocator scratch,
+                        int32_t device,
                         ffi::AnyBuffer dout, ffi::AnyBuffer q, ffi::AnyBuffer k,
                         ffi::AnyBuffer v, ffi::AnyBuffer o,
                         ffi::Buffer<ffi::F32> lse, ffi::Result<ffi::AnyBuffer> dq,
@@ -21,6 +22,7 @@ ffi::Error
 mha_varlen_bwd_impl(
     cudaStream_t stream,
     ffi::ScratchAllocator scratch,
+    int32_t device,
     ffi::AnyBuffer dout, // total_q x num_heads, x head_size
     ffi::AnyBuffer q,    // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i
     ffi::AnyBuffer k,    // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i

csrc/flash_attn/mha_fwd.cpp

Lines changed: 15 additions & 8 deletions
@@ -30,12 +30,19 @@ void run_mha_fwd(Flash_fwd_params &params, cudaStream_t stream, bool force_split
 }
 
 
-ffi::Error mha_fwd_impl(cudaStream_t stream, ffi::ScratchAllocator scratch, ffi::AnyBuffer q, ffi::AnyBuffer k,
-                        ffi::AnyBuffer v, ffi::Result<ffi::AnyBuffer> o,
-                        ffi::ResultBuffer<ffi::F32> lse, double softmax_scale,
-                        bool is_causal, int64_t window_size_left, int64_t window_size_right) {
-    int device, major, minor;
-    FFI_CUDA_CHECK(cudaStreamGetDevice(stream, &device));
+ffi::Error mha_fwd_impl(cudaStream_t stream,
+                        ffi::ScratchAllocator scratch,
+                        int32_t device,
+                        ffi::AnyBuffer q,
+                        ffi::AnyBuffer k,
+                        ffi::AnyBuffer v,
+                        ffi::Result<ffi::AnyBuffer> o,
+                        ffi::ResultBuffer<ffi::F32> lse,
+                        double softmax_scale,
+                        bool is_causal,
+                        int64_t window_size_left,
+                        int64_t window_size_right) {
+    int major, minor;
     FFI_CUDA_CHECK(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device));
     FFI_CUDA_CHECK(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device));
 
@@ -151,6 +158,7 @@ ffi::Error
 mha_varlen_fwd_impl(
     cudaStream_t stream,
     ffi::ScratchAllocator scratch,
+    int32_t device,
     ffi::AnyBuffer q, // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i
     ffi::AnyBuffer k, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i
     ffi::AnyBuffer v, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i
@@ -187,8 +195,7 @@ mha_varlen_fwd_impl(
     // const bool return_softmax,
     // c10::optional<at::Generator> gen_) {
 
-    int device, major, minor, sm_count;
-    FFI_CUDA_CHECK(cudaStreamGetDevice(stream, &device));
+    int major, minor, sm_count;
     FFI_CUDA_CHECK(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device));
     FFI_CUDA_CHECK(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device));
     FFI_CUDA_CHECK(cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device));

csrc/flash_attn/mha_fwd.h

Lines changed: 14 additions & 5 deletions
@@ -10,16 +10,25 @@
 
 namespace ffi = xla::ffi;
 
-ffi::Error mha_fwd_impl(cudaStream_t stream, ffi::ScratchAllocator scratch,
-                        ffi::AnyBuffer q, ffi::AnyBuffer k, ffi::AnyBuffer v,
-                        ffi::Result<ffi::AnyBuffer> o,
-                        ffi::ResultBuffer<ffi::F32> lse, double softmax_scale,
-                        bool is_causal, int64_t window_size_left, int64_t window_size_right);
+ffi::Error mha_fwd_impl(
+    cudaStream_t stream,
+    ffi::ScratchAllocator scratch,
+    int32_t device,
+    ffi::AnyBuffer q,
+    ffi::AnyBuffer k,
+    ffi::AnyBuffer v,
+    ffi::Result<ffi::AnyBuffer> o,
+    ffi::ResultBuffer<ffi::F32> lse,
+    double softmax_scale,
+    bool is_causal,
+    int64_t window_size_left,
+    int64_t window_size_right);
 
 ffi::Error
 mha_varlen_fwd_impl(
     cudaStream_t stream,
     ffi::ScratchAllocator scratch,
+    int32_t device,
     ffi::AnyBuffer q, // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i
     ffi::AnyBuffer k, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i
     ffi::AnyBuffer v, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i
