Commit e72fac9

Synchronize stream in mha to prevent use-after-free of scratchpad memory.
1 parent: 43dd029
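
Why the synchronize is needed, read schematically from the commit message: the flash-attention kernels are launched asynchronously on the handler's CUDA stream, while workspace memory obtained from the FFI ScratchAllocator can be reclaimed as soon as the handler returns. Below is a minimal, hypothetical sketch of the failure mode and the fix; the kernel, sizes, and handler name are invented for illustration, and only the FFI_* macros and the launch-then-synchronize pattern mirror this commit.

// Illustrative sketch only -- not the actual flash-attention handler.
// Assumption: scratch memory from ffi::ScratchAllocator may be reused or
// freed once the handler returns, so any kernel still running on `stream`
// would then touch freed memory (the use-after-free this commit prevents).

__global__ void fill_kernel(float* buf, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) buf[i] = 1.0f;
}

ffi::Error handler_sketch(cudaStream_t stream, ffi::ScratchAllocator scratch) {
  constexpr int kElems = 1 << 18;
  FFI_CHECK_ALLOC(workspace, scratch.Allocate(kElems * sizeof(float)));

  // Kernel launches are asynchronous: this call only enqueues work.
  fill_kernel<<<(kElems + 255) / 256, 256, 0, stream>>>(
      static_cast<float*>(workspace), kElems);

  // Without this, the handler could return (and the scratchpad be reclaimed)
  // while the kernel is still writing `workspace`.  Synchronizing here is
  // what the mha_fwd/mha_bwd changes below add after launch()/run_mha_fwd().
  FFI_CUDA_CHECK(cudaStreamSynchronize(stream));
  return ffi::Error::Success();
}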

File tree

6 files changed (+27, -25 lines)

.clang-format

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
 ---
 Language: Cpp
 ColumnLimit: 100
-BasedOnStyle: Google
+# BasedOnStyle: Google

csrc/flash_attn/check.h

Lines changed: 20 additions & 21 deletions
@@ -39,32 +39,31 @@ class CheckHelper {
   std::ostringstream stream_;
 };
 
-#define FFI_CHECK(expr)                                                       \
-  static_assert(!std::is_same_v<decltype(expr), cudaError_t>,                 \
-                "Use FFI_CUDA_CHECK for CUDA error codes, not FFI_CHECK.");   \
-  if (!(expr))                                                                \
+#define FFI_CHECK(expr)                                                     \
+  static_assert(!std::is_same_v<decltype(expr), cudaError_t>,               \
+                "Use FFI_CUDA_CHECK for CUDA error codes, not FFI_CHECK."); \
+  if (!(expr))                                                              \
     return CheckHelper(#expr)
 
-#define FFI_CUDA_CHECK(expr)                                                  \
-  static_assert(std::is_same_v<decltype(expr), cudaError_t>,                  \
-                "Expect cudaError_t for FFI_CUDA_CHECK.");                    \
-  if (cudaError_t _cuda_check = (expr); _cuda_check != cudaSuccess)           \
-    return CheckHelper(std::string(#expr))                                    \
-        << " CUDA Error: " << cudaGetErrorString(_cuda_check)
+#define FFI_CUDA_CHECK(expr)                                        \
+  static_assert(std::is_same_v<decltype(expr), cudaError_t>,        \
+                "Expect cudaError_t for FFI_CUDA_CHECK.");          \
+  if (cudaError_t _cuda_check = (expr); _cuda_check != cudaSuccess) \
+    return CheckHelper(std::string(#expr)) << " CUDA Error: " << cudaGetErrorString(_cuda_check)
 
-#define FFI_CHECK_OPTIONAL(dest, expr)                                        \
-  if (auto _opt = (expr); _opt.has_value())                                   \
-    dest = _opt.value();                                                      \
-  else                                                                        \
+#define FFI_CHECK_OPTIONAL(dest, expr)      \
+  if (auto _opt = (expr); _opt.has_value()) \
+    dest = _opt.value();                    \
+  else                                      \
     return CheckHelper(std::string(#expr))
 
-#define FFI_RET_CHECK(expr)                                                   \
-  if (auto _error = (expr); !_error.success())                                \
+#define FFI_RET_CHECK(expr)                    \
+  if (auto _error = (expr); !_error.success()) \
     return _error
 
-#define FFI_CHECK_ALLOC(dest, expr)                                           \
-  void* dest = nullptr;                                                       \
-  if (auto _opt = (expr); _opt.has_value())                                   \
-    dest = _opt.value();                                                      \
-  else                                                                        \
+#define FFI_CHECK_ALLOC(dest, expr)         \
+  void *dest = nullptr;                     \
+  if (auto _opt = (expr); _opt.has_value()) \
+    dest = _opt.value();                    \
+  else                                      \
     return CheckHelper(std::string(#expr))
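
For reference, a hypothetical sketch of how these macros compose inside an FFI handler. The handler name, helper function, and buffer size are invented; it assumes, as the surrounding code suggests, that ffi::ScratchAllocator::Allocate returns a std::optional and that the error type exposes success()/Success().

// Hypothetical usage sketch, not code from this repository; only the macro
// semantics come from check.h above.

static ffi::Error zero_workspace(void *workspace, size_t bytes, cudaStream_t stream) {
  FFI_CUDA_CHECK(cudaMemsetAsync(workspace, 0, bytes, stream));
  return ffi::Error::Success();
}

ffi::Error example_handler(cudaStream_t stream, ffi::ScratchAllocator scratch) {
  constexpr size_t kBytes = 1 << 20;

  // FFI_CHECK: boolean condition, with a streamed message on failure.
  FFI_CHECK(kBytes % 256 == 0) << "workspace size must be a multiple of 256";

  // FFI_CHECK_ALLOC: declares `workspace` (a void *) and fails the handler
  // if the scratch allocation comes back empty.
  FFI_CHECK_ALLOC(workspace, scratch.Allocate(kBytes));

  // FFI_RET_CHECK: propagates an error returned by a helper.
  FFI_RET_CHECK(zero_workspace(workspace, kBytes, stream));

  return ffi::Error::Success();
}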

csrc/flash_attn/mha_bwd.cpp

Lines changed: 2 additions & 0 deletions
@@ -234,6 +234,7 @@ ffi::Error mha_bwd_impl(cudaStream_t stream, ffi::ScratchAllocator scratch,
 
   if (seqlen_q > 0) {
     launch(params, stream);
+    FFI_CUDA_CHECK(cudaStreamSynchronize(stream));
   } else {
     // If seqlen_q == 0, then we have an empty tensor. We need to set the output to 0.
     FFI_CUDA_CHECK(cudaMemset(dq->untyped_data(), 0, dq->size_bytes()));
 
@@ -414,6 +415,7 @@ mha_varlen_bwd_impl(
 
   if (max_seqlen_q > 0) {
     launch(params, stream);
+    FFI_CUDA_CHECK(cudaStreamSynchronize(stream));
   } else {
     // If seqlen_q == 0, then we have an empty tensor. We need to set the output to 0.
     FFI_CUDA_CHECK(cudaMemsetAsync(dq->untyped_data(), 0, dq->size_bytes(), stream));

csrc/flash_attn/mha_fwd.cpp

Lines changed: 2 additions & 2 deletions
@@ -136,8 +136,7 @@ ffi::Error mha_fwd_impl(cudaStream_t stream, ffi::ScratchAllocator scratch, ffi:
 
   if (seqlen_k > 0) {
     run_mha_fwd(params, stream);
-    // C10_CUDA_CHECK(cudaStreamSynchronize(stream));
-    // C10_CUDA_CHECK(cudaDeviceSynchronize());
+    FFI_CUDA_CHECK(cudaStreamSynchronize(stream));
   } else {
     FFI_CHECK(false) << "seqlen_k is zero";
     // If seqlen_k == 0, then we have an empty tensor. We need to set the output to 0.
 
@@ -330,6 +329,7 @@ mha_varlen_fwd_impl(
 
   if (max_seqlen_k > 0) {
     run_mha_fwd(params, stream);
+    FFI_CUDA_CHECK(cudaStreamSynchronize(stream));
   } else {
     // If seqlen_k == 0, then we have an empty tensor. We need to set the output to 0.
     FFI_CUDA_CHECK(cudaMemsetAsync(out->untyped_data(), 0, out->size_bytes(), stream));

src/flash_attn_jax/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-__version__ = 'v0.4.0'
+__version__ = 'v0.4.1'
 from .flash import flash_mha
 from .varlen import flash_mha_varlen
 __all__ = ['flash_mha', 'flash_mha_varlen']

tests/test_sharding.py

Lines changed: 1 addition & 0 deletions
@@ -171,6 +171,7 @@ def check_sharding(sharding,q,k,v):
     out = flash((q,k,v))
     check(ref_out,ref16_out,out)
 
+    check_sharding(NamedSharding(mesh, P(None,None,None,None)), q, k, v)
     check_sharding(NamedSharding(mesh, P('x',None,None,None)), q, k, v)
     check_sharding(NamedSharding(mesh, P(None,None,'x',None)), q, k, v)