Skip to content

Commit a271a6d

Browse files
bottlermeta-codesync[bot]
authored and committed
Move hip_fmha op schemas next to their implementations (#245)
Summary: Pull Request resolved: #245 When `MSLK_BUILD_HIP_FMHA=0`, `attention.cpp` was still compiled into the main library and registered schemas for `efficient_attention_forward_ck`, `efficient_attention_backward_ck`, and `_ck_rand_uniform` via `TORCH_LIBRARY_FRAGMENT`, even though the `TORCH_LIBRARY_IMPL` bindings and kernel implementations (in `mslk_hip_fmha`) were absent. This left unimplemented operators registered in the library — the op appears in the dispatcher but calling it fails. Fix by moving each `m.def` into the same file as its `TORCH_LIBRARY_IMPL`, inside `hip_fmha/`. Since those files are only compiled as part of the `mslk_hip_fmha` static library, schema and implementation now come and go together. The decoder ops remain in `attention.cpp` since their situation differs. Reviewed By: cthi Differential Revision: D97933992 fbshipit-source-id: 121f9ce6c707288a6923fd1d62a5611a90659cf6
1 parent 714d498 commit a271a6d

File tree

4 files changed

+21
-11
lines changed

4 files changed

+21
-11
lines changed

csrc/attention/ck/fmha/attention.cpp

Lines changed: 3 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -22,22 +22,14 @@ PyMODINIT_FUNC PyInit__C(void) {
2222

2323
TORCH_LIBRARY_FRAGMENT(xformers, m) {
2424
#if defined(USE_ROCM)
25-
m.def(TORCH_SELECTIVE_SCHEMA(
26-
"xformers::efficient_attention_forward_ck(Tensor query, "
27-
"Tensor key, Tensor value, Tensor? attn_bias, Tensor? seqstart_q, "
28-
"Tensor? seqstart_k, int? max_seqlen_q, float dropout_p, "
29-
"bool compute_logsumexp, int custom_mask_type, float? scale, Tensor? seqlen_k, int? window_size, Tensor? block_tables, int? page_size) -> (Tensor, Tensor?, int, int)"));
25+
// Schemas for ops whose implementations live in hip_fmha/ are registered
26+
// there, alongside their TORCH_LIBRARY_IMPL, so that they are absent from
27+
// builds where hip_fmha is not compiled (e.g. MSLK_BUILD_HIP_FMHA=0).
3028
m.def(TORCH_SELECTIVE_SCHEMA(
3129
"xformers::efficient_attention_forward_decoder_ck(Tensor query, "
3230
"Tensor key, Tensor value, Tensor? seq_positions, float scale) -> Tensor"));
3331
m.def(TORCH_SELECTIVE_SCHEMA(
3432
"xformers::efficient_attention_forward_decoder_splitk_ck(Tensor query, Tensor key, "
3533
" Tensor value, Tensor? seq_positions, float scale, int split_k) -> Tensor"));
36-
#ifndef FMHA_OMIT_BACKWARD
37-
m.def(TORCH_SELECTIVE_SCHEMA(
38-
"xformers::efficient_attention_backward_ck(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor? attn_bias, Tensor? seqstart_q, Tensor? seqstart_k, int? max_seqlen_q, int? max_seqlen_k, Tensor? seqlen_k, Tensor logsumexp, Tensor output, float dropout_p, int rng_seed, int rng_offset, int custom_mask_type, float? scale, int? window_size) -> (Tensor, Tensor, Tensor, Tensor)"));
39-
#endif
40-
m.def(TORCH_SELECTIVE_SCHEMA(
41-
"xformers::_ck_rand_uniform(float p, Tensor out) -> Tensor"));
4234
#endif
4335
}

csrc/attention/ck/fmha/hip_fmha/attention_backward_generic_ck_tiled.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -631,6 +631,11 @@ efficient_attention_backward_ck_meta(
631631

632632
} // namespace
633633

634+
TORCH_LIBRARY_FRAGMENT(xformers, m) {
635+
m.def(TORCH_SELECTIVE_SCHEMA(
636+
"xformers::efficient_attention_backward_ck(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor? attn_bias, Tensor? seqstart_q, Tensor? seqstart_k, int? max_seqlen_q, int? max_seqlen_k, Tensor? seqlen_k, Tensor logsumexp, Tensor output, float dropout_p, int rng_seed, int rng_offset, int custom_mask_type, float? scale, int? window_size) -> (Tensor, Tensor, Tensor, Tensor)"));
637+
}
638+
634639
TORCH_LIBRARY_IMPL(xformers, CUDA, m) {
635640
m.impl(
636641
TORCH_SELECTIVE_NAME("xformers::efficient_attention_backward_ck"),

csrc/attention/ck/fmha/hip_fmha/attention_ck_rand_uniform.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,11 @@ at::Tensor rand_uniform_int(
9494

9595
} // namespace
9696

97+
TORCH_LIBRARY_FRAGMENT(xformers, m) {
98+
m.def(TORCH_SELECTIVE_SCHEMA(
99+
"xformers::_ck_rand_uniform(float p, Tensor out) -> Tensor"));
100+
}
101+
97102
TORCH_LIBRARY_IMPL(xformers, CUDA, m) {
98103
m.impl(
99104
TORCH_SELECTIVE_NAME("xformers::_ck_rand_uniform"),

csrc/attention/ck/fmha/hip_fmha/attention_forward_generic_ck_tiled.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -524,6 +524,14 @@ efficient_attention_forward_ck_meta(
524524

525525
} // namespace
526526

527+
TORCH_LIBRARY_FRAGMENT(xformers, m) {
528+
m.def(TORCH_SELECTIVE_SCHEMA(
529+
"xformers::efficient_attention_forward_ck(Tensor query, "
530+
"Tensor key, Tensor value, Tensor? attn_bias, Tensor? seqstart_q, "
531+
"Tensor? seqstart_k, int? max_seqlen_q, float dropout_p, "
532+
"bool compute_logsumexp, int custom_mask_type, float? scale, Tensor? seqlen_k, int? window_size, Tensor? block_tables, int? page_size) -> (Tensor, Tensor?, int, int)"));
533+
}
534+
527535
TORCH_LIBRARY_IMPL(xformers, CUDA, m) {
528536
m.impl(
529537
TORCH_SELECTIVE_NAME("xformers::efficient_attention_forward_ck"),

0 commit comments

Comments (0)