
Commit d7d0ff2

Refactor the frontend to use the new JAX FFI API, which greatly improves error handling and allows using XLA's allocator for scratchpad arrays.
1 parent 6d55af7 commit d7d0ff2

File tree: 13 files changed (+4675 -562 lines)

CMakeLists.txt

Lines changed: 4 additions & 0 deletions

@@ -11,6 +11,8 @@ set(CMAKE_INSTALL_RPATH_USE_LINK_PATH FALSE)
 # == Find dependencies ==
 find_package(Python REQUIRED COMPONENTS Interpreter Development.Module)
 
+message(STATUS "Python executable: ${Python_EXECUTABLE}")
+
 execute_process(
   COMMAND ${Python_EXECUTABLE} -m pybind11 --cmakedir
   OUTPUT_VARIABLE pybind11_DIR
@@ -88,6 +90,8 @@ target_include_directories(flash_api PRIVATE
   ${CMAKE_CURRENT_SOURCE_DIR}/csrc/flash_attn
   ${CMAKE_CURRENT_SOURCE_DIR}/csrc/flash_attn/src
   ${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass/include
+  ${CMAKE_CURRENT_SOURCE_DIR}/csrc
+
 )
 
 target_link_libraries(flash_api PRIVATE

csrc/flash_attn/check.h

Lines changed: 63 additions & 16 deletions

@@ -1,18 +1,65 @@
 #pragma once
 
-#include <stdio.h>
-
-inline void check_implementation(bool expr, std::string check_message) {
-  if (!expr) {
-    fprintf(stderr, "%s\n", check_message.c_str());
-    abort();
-  }
-}
-
-#define CHECK(EXPR, MESSAGE) \
-  do { \
-    const bool __err = EXPR; \
-    check_implementation( \
-      __err, \
-      MESSAGE); \
-  } while (0)
+#include <cstdio>
+#include <cuda_runtime_api.h>
+#include <driver_types.h>
+#include <string>
+
+#include "xla/ffi/api/ffi.h"
+
+namespace ffi = xla::ffi;
+
+class CheckHelper {
+public:
+  explicit CheckHelper(std::string expr) : expr_(expr) {}
+
+  template <typename T> inline CheckHelper &operator<<(const T &value) {
+    fprintf(stderr, "debug: adding value %s\n", value);
+    stream_ << value;
+    return *this;
+  }
+
+  inline CheckHelper &operator<<(ffi::ErrorCode errc) {
+    errc_ = errc;
+    return *this;
+  }
+
+  inline operator ffi::Error() {
+    std::ostringstream full_message;
+    full_message << "Check failed: " << expr_;
+    std::string additional = stream_.str();
+    if (!additional.empty()) {
+      fprintf(stderr, "debug: %s\n", additional.c_str());
+      full_message << "; " << additional;
+    }
+    return ffi::Error(errc_, full_message.str());
+  }
+
+private:
+  ffi::ErrorCode errc_ = ffi::ErrorCode::kUnknown;
+  std::string expr_;
+  std::ostringstream stream_;
+};
+
+#define FFI_CHECK(expr) \
+  static_assert(!std::is_same_v<decltype(expr), cudaError_t>, \
+                "Use FFI_CUDA_CHECK for CUDA error codes, not FFI_CHECK."); \
+  if (!(expr)) \
+  return CheckHelper(#expr)
+
+#define FFI_CUDA_CHECK(expr) \
+  static_assert(std::is_same_v<decltype(expr), cudaError_t>, \
+                "Expect cudaError_t for FFI_CUDA_CHECK."); \
+  if (cudaError_t _cuda_check = (expr); _cuda_check != cudaSuccess) \
+  return CheckHelper(std::string(#expr)) \
+      << " CUDA Error: " << cudaGetErrorString(_cuda_check)
+
+#define FFI_CHECK_OPTIONAL(dest, expr) \
+  if (auto _opt = (expr); _opt.has_value()) \
+    dest = _opt.value(); \
+  else \
+    return CheckHelper(std::string(#expr))
+
+#define FFI_RET_CHECK(expr) \
+  if (auto _error = (expr); !_error.success()) \
+  return _error
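These macros replace the old CHECK/abort() path: a failed check now returns an ffi::Error to XLA instead of killing the process, which is the error-handling improvement mentioned in the commit message. Below is a minimal usage sketch, not code from this commit: the handler name and the fp16 requirement are invented for illustration, and ffi::ErrorCode::kInvalidArgument / AnyBuffer::element_type() are assumed from the XLA FFI API headers.

#include <cuda_runtime_api.h>

#include "check.h"

// Hypothetical handler body showing how the macros compose. On failure each
// macro returns an ffi::Error built by CheckHelper; on success control falls
// through to the next statement.
ffi::Error example_handler(cudaStream_t stream, ffi::AnyBuffer q) {
  // Predicate check with an explicit error code and extra message text.
  FFI_CHECK(q.element_type() == ffi::DataType::F16)
      << ffi::ErrorCode::kInvalidArgument << "q must be fp16";

  // CUDA-call check; cudaGetErrorString() output is appended to the message.
  FFI_CUDA_CHECK(cudaStreamSynchronize(stream));

  return ffi::Error();  // success
}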

csrc/flash_attn/flash_api.cpp

Lines changed: 50 additions & 51 deletions

@@ -7,14 +7,14 @@
 #include <cuda_runtime_api.h>
 #include <pybind11/pybind11.h>
 
-#include "flash.h"
-#include "exception.h"
-#include "static_switch.h"
 #include "check.h"
 
-#include "flash_common.h"
 #include "mha_fwd.h"
 #include "mha_bwd.h"
+#include "xla/ffi/api/c_api.h"
+#include "xla/ffi/api/ffi.h"
+
+namespace ffi = xla::ffi;
 
 // std::vector<at::Tensor>
 // mha_fwd_kvcache(at::Tensor &q, // batch_size x seqlen_q x num_heads x head_size
@@ -295,67 +295,66 @@
 
 namespace {
 
-template <typename T> pybind11::capsule EncapsulateFunction(T *fn) {
-  return pybind11::capsule(reinterpret_cast<void *>(fn), "xla._CUSTOM_CALL_TARGET");
-}
-
 template <typename T>
-inline std::string PackDescriptorAsString(const T& descriptor) {
-  return std::string(reinterpret_cast<const char*>(&descriptor), sizeof(T));
-}
-
-template <typename T> pybind11::bytes PackDescriptor(const T &descriptor) {
-  return pybind11::bytes(PackDescriptorAsString(descriptor));
+pybind11::capsule EncapsulateFfiCall(T *fn) {
+  static_assert(std::is_invocable_r_v<XLA_FFI_Error *, T, XLA_FFI_CallFrame *>,
+                "Encapsulated function must be an XLA FFI handler");
+  return pybind11::capsule(reinterpret_cast<void *>(fn));
 }
 
-pybind11::bytes make_mha_fwd_args( float p_dropout,
-                                   float softmax_scale,
-                                   bool is_causal,
-                                   int window_size_left,
-                                   int window_size_right,
-                                   bool return_softmax,
-                                   int n, int l, int h, int d,
-                                   int l_k, int h_k,
-                                   ElementType dtype,
-                                   uint64_t seed) {
-  return PackDescriptor(mha_fwd_args{p_dropout, softmax_scale, is_causal, window_size_left, window_size_right, return_softmax, n, l, h, d, l_k, h_k, dtype, seed});
-}
-
-pybind11::bytes make_mha_bwd_args( float p_dropout,
-                                   float softmax_scale,
-                                   bool is_causal,
-                                   int window_size_left,
-                                   int window_size_right,
-                                   bool deterministic,
-                                   int n, int l, int h, int d,
-                                   int l_k, int h_k,
-                                   ElementType dtype,
-                                   uint64_t seed) {
-  return PackDescriptor(mha_bwd_args{p_dropout, softmax_scale, is_causal, window_size_left, window_size_right, deterministic, n, l, h, d, l_k, h_k, dtype, seed});
-}
-
-pybind11::dict Registrations() {
+XLA_FFI_DEFINE_HANDLER(
+    mha_fwd, mha_fwd_impl,
+    ffi::Ffi::Bind()
+        .Ctx<ffi::PlatformStream<cudaStream_t>>()
+        .Ctx<ffi::ScratchAllocator>()
+        .Arg<ffi::AnyBuffer>()
+        .Arg<ffi::AnyBuffer>()
+        .Arg<ffi::AnyBuffer>()
+        .Ret<ffi::AnyBuffer>()
+        .Ret<ffi::Buffer<ffi::F32>>()
+        .Attr<double>("softmax_scale")
+        .Attr<bool>("is_causal")
+        .Attr<int64_t>("window_size_left")
+        .Attr<int64_t>("window_size_right")
+);
+
+XLA_FFI_DEFINE_HANDLER(
+    mha_bwd, mha_bwd_impl,
+    ffi::Ffi::Bind()
+        .Ctx<ffi::PlatformStream<cudaStream_t>>()
+        .Ctx<ffi::ScratchAllocator>()
+        .Arg<ffi::AnyBuffer>()          // dout
+        .Arg<ffi::AnyBuffer>()          // q
+        .Arg<ffi::AnyBuffer>()          // k
+        .Arg<ffi::AnyBuffer>()          // v
+        .Arg<ffi::AnyBuffer>()          // o
+        .Arg<ffi::Buffer<ffi::F32>>()   // lse
+        .Ret<ffi::AnyBuffer>()          // dq
+        .Ret<ffi::AnyBuffer>()          // dk
+        .Ret<ffi::AnyBuffer>()          // dv
+        .Attr<double>("softmax_scale")
+        .Attr<bool>("is_causal")
+        .Attr<int64_t>("window_size_left")
+        .Attr<int64_t>("window_size_right")
+);
+
+
+pybind11::dict FFIRegistrations() {
   pybind11::dict dict;
-  dict["flash_mha_fwd"] = EncapsulateFunction(mha_fwd);
-  dict["flash_mha_bwd"] = EncapsulateFunction(mha_bwd);
+  dict["flash_mha_fwd"] = EncapsulateFfiCall(mha_fwd);
+  dict["flash_mha_bwd"] = EncapsulateFfiCall(mha_bwd);
   return dict;
 }
 
 
 PYBIND11_MODULE(flash_api, m) {
   m.doc() = "FlashAttention";
-  m.def("get_registrations", &Registrations);
-  m.def("make_flash_mha_fwd_args", &make_mha_fwd_args);
-  m.def("make_flash_mha_bwd_args", &make_mha_bwd_args);
-  pybind11::enum_<ElementType>(m, "ElementType")
-      .value("BF16", BF16)
-      .value("FP16", FP16)
-      .export_values();
+  m.def("get_ffi_registrations", &FFIRegistrations);
 
   // m.def("varlen_fwd", &mha_varlen_fwd, "Forward pass (variable length)");
   // m.def("bwd", &mha_bwd, "Backward pass");
   // m.def("varlen_bwd", &mha_varlen_bwd, "Backward pass (variable length)");
   // m.def("fwd_kvcache", &mha_fwd_kvcache, "Forward pass, with KV-cache");
 }
 
-}
+} // namespace
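The handler implementations referenced above (mha_fwd_impl, mha_bwd_impl) live in mha_fwd.h and mha_bwd.h and are not part of this diff. As a rough orientation only, the forward Bind() chain implies a handler signature along the following lines; the parameter names are illustrative, and the exact spelling of the FFI parameter types (ffi::Result, by-value ffi::ScratchAllocator) is an assumption based on the XLA FFI API, not taken from this commit.

#include <cuda_runtime_api.h>

#include "xla/ffi/api/ffi.h"

namespace ffi = xla::ffi;

// Sketch only: each Ctx<>/Arg<>/Ret<>/Attr<> entry in Bind() maps, in order,
// to one parameter of the handler implementation.
ffi::Error mha_fwd_impl(cudaStream_t stream,                     // Ctx<PlatformStream<cudaStream_t>>
                        ffi::ScratchAllocator scratch,           // Ctx<ScratchAllocator>
                        ffi::AnyBuffer q,                        // Arg
                        ffi::AnyBuffer k,                        // Arg
                        ffi::AnyBuffer v,                        // Arg
                        ffi::Result<ffi::AnyBuffer> o,           // Ret: attention output
                        ffi::Result<ffi::Buffer<ffi::F32>> lse,  // Ret: softmax log-sum-exp
                        double softmax_scale,                    // Attr "softmax_scale"
                        bool is_causal,                          // Attr "is_causal"
                        int64_t window_size_left,                // Attr "window_size_left"
                        int64_t window_size_right);              // Attr "window_size_right"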

csrc/flash_attn/flash_common.cpp

Lines changed: 19 additions & 15 deletions

@@ -5,13 +5,14 @@
 #include <pybind11/pybind11.h>
 
 #include "flash.h"
-#include "exception.h"
-#include "static_switch.h"
 #include "check.h"
 #include "flash_common.h"
+#include "xla/ffi/api/ffi.h"
 
-void set_params_fprop(Flash_fwd_params &params,
-                      ElementType element_type,
+namespace ffi = xla::ffi;
+
+ffi::Error set_params_fprop(Flash_fwd_params &params,
+                            ffi::DataType element_type,
                       // sizes
                       const size_t b,
                       const size_t seqlen_q,
@@ -41,7 +42,7 @@ void set_params_fprop(Flash_fwd_params &params,
     // Reset the parameters
     memset(&params, 0, sizeof(params));
 
-    params.is_bf16 = element_type == BF16;
+    params.is_bf16 = element_type == ffi::DataType::BF16;
 
     // Set the pointers and strides.
     params.q_ptr = q_ptr;
@@ -110,7 +111,7 @@ void set_params_fprop(Flash_fwd_params &params,
     params.p_dropout_in_uint8_t = uint8_t(std::floor(params.p_dropout * 255.0));
     params.rp_dropout = 1.f / params.p_dropout;
     params.scale_softmax_rp_dropout = params.rp_dropout * params.scale_softmax;
-    CHECK(p_dropout < 1.f, "dropout must be <1");
+    FFI_CHECK(p_dropout < 1.f) << "dropout must be <1";
 
     // Causal is the special case where window_size_right == 0 and window_size_left < 0.
     // Local is the more general case where window_size_right >= 0 or window_size_left >= 0.
@@ -122,6 +123,8 @@ void set_params_fprop(Flash_fwd_params &params,
     params.window_size_right = window_size_right;
 
     params.is_seqlens_k_cumulative = true;
+
+    return ffi::Error(); // Success
 }
 
 // Find the number of splits that maximizes the occupancy. For example, if we have
@@ -166,10 +169,10 @@ int num_splits_heuristic(int batch_nheads_mblocks, int num_SMs, int num_n_blocks
     return 1;
 }
 
-void set_params_splitkv(Flash_fwd_params &params, const int batch_size,
+ffi::Error set_params_splitkv(ffi::ScratchAllocator* scratch, Flash_fwd_params& params, const int batch_size,
                         const int num_heads, const int head_size, const int max_seqlen_k, const int max_seqlen_q,
                         const int head_size_rounded, const float p_dropout,
-                        const int num_splits, int multiProcessorCount, ElementType dtype) {
+                        const int num_splits, int multiProcessorCount, ffi::DataType dtype) {
     // This needs to match with run_mha_fwd_splitkv_dispatch
     const int block_n = head_size <= 64 ? 256 : (head_size <= 128 ? 128 : 64);
     const int num_n_blocks = (max_seqlen_k + block_n - 1) / block_n;
@@ -184,13 +187,14 @@ void set_params_splitkv(Flash_fwd_params &params, const int batch_size,
            params.num_splits = num_splits_heuristic(batch_size * num_heads * num_m_blocks, multiProcessorCount, num_n_blocks, 128);
        }
        if (params.num_splits > 1) {
-            // at::Tensor softmax_lse_accum = torch::empty({params.num_splits, batch_size, num_heads, max_seqlen_q}, opts.dtype(at::kFloat));
-            // at::Tensor out_accum = torch::empty({params.num_splits, batch_size, num_heads, max_seqlen_q, head_size_rounded}, opts.dtype(at::kFloat));
-            C10_CUDA_CHECK(cudaMalloc((void**)&params.softmax_lseaccum_ptr, params.num_splits * batch_size * num_heads * max_seqlen_q * 4)); // float32
-            C10_CUDA_CHECK(cudaMalloc((void**)&params.oaccum_ptr, params.num_splits * batch_size * num_heads * max_seqlen_q * head_size_rounded * 4));
-            // params.softmax_lseaccum_ptr = softmax_lse_accum.data_ptr();
-            // params.oaccum_ptr = out_accum.data_ptr();
+            FFI_CHECK_OPTIONAL(*(void**)&params.softmax_lseaccum_ptr, scratch->Allocate(
+                params.num_splits * batch_size * num_heads * max_seqlen_q * 4, 4))
+                << "Failed to allocate memory for softmax_lseaccum";
+            FFI_CHECK_OPTIONAL(*(void**)&params.oaccum_ptr, scratch->Allocate(
+                params.num_splits * batch_size * num_heads * max_seqlen_q * head_size_rounded * 4, 4))
+                << "Failed to allocate memory for oaccum";
        }
-        CHECK(params.num_splits <= 128, "num_splits > 128 not supported");
+        FFI_CHECK(params.num_splits <= 128) << "num_splits > 128 not supported - " << params.num_splits;
    }
+    return ffi::Error();
 }
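With set_params_fprop and set_params_splitkv now returning ffi::Error, callers can bubble failures up with FFI_RET_CHECK instead of aborting, and the split-KV accumulators come from XLA's scratch allocator rather than raw cudaMalloc. Below is a hypothetical call-site sketch: the prepare_splitkv wrapper and its argument names are invented; only set_params_splitkv and the macros come from this repo.

#include "check.h"
#include "flash_common.h"

// Hypothetical wrapper illustrating error propagation from inside an FFI
// handler: if set_params_splitkv fails (e.g. a scratch allocation returns
// std::nullopt), FFI_RET_CHECK returns that ffi::Error to the caller.
ffi::Error prepare_splitkv(ffi::ScratchAllocator &scratch, Flash_fwd_params &params,
                           int batch_size, int num_heads, int head_size,
                           int max_seqlen_k, int max_seqlen_q, int head_size_rounded,
                           float p_dropout, int num_splits,
                           int multi_processor_count, ffi::DataType dtype) {
  FFI_RET_CHECK(set_params_splitkv(&scratch, params, batch_size, num_heads, head_size,
                                   max_seqlen_k, max_seqlen_q, head_size_rounded,
                                   p_dropout, num_splits, multi_processor_count, dtype));
  return ffi::Error();  // success
}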

csrc/flash_attn/flash_common.h

Lines changed: 6 additions & 20 deletions

@@ -6,14 +6,13 @@
 #include <pybind11/pybind11.h>
 
 #include "flash.h"
-#include "exception.h"
-#include "static_switch.h"
 #include "check.h"
+#include "xla/ffi/api/ffi.h"
 
-enum ElementType { BF16, FP16, FP32 };
+namespace ffi = xla::ffi;
 
-void set_params_fprop(Flash_fwd_params &params,
-                      ElementType element_type,
+ffi::Error set_params_fprop(Flash_fwd_params &params,
+                            ffi::DataType element_type,
                       // sizes
                       const size_t b,
                       const size_t seqlen_q,
@@ -40,20 +39,7 @@ void set_params_fprop(Flash_fwd_params &params,
                       int window_size_right,
                       bool seqlenq_ngroups_swapped=false);
 
-void set_params_splitkv(Flash_fwd_params &params, const int batch_size,
+ffi::Error set_params_splitkv(ffi::ScratchAllocator* scratch, Flash_fwd_params& params, const int batch_size,
                         const int num_heads, const int head_size, const int max_seqlen_k, const int max_seqlen_q,
                         const int head_size_rounded, const float p_dropout,
-                        const int num_splits, int multiProcessorCount, ElementType dtype);
-
-template <typename T>
-inline std::string Pack(const T& args) {
-  return std::string(reinterpret_cast<const char*>(&args), sizeof(T));
-}
-
-template <typename T>
-inline T Unpack(const void* opaque, size_t opaque_len) {
-  T out;
-  CHECK(sizeof(out)==opaque_len, "opaque len");
-  memcpy(&out, opaque, opaque_len);
-  return out;
-}
+                        const int num_splits, int multiProcessorCount, ffi::DataType dtype);
