Merged
1 change: 0 additions & 1 deletion backends/xnnpack/operators/__init__.py
@@ -39,7 +39,6 @@
     op_quant_dequant,
     op_relu,
     op_rsqrt,
-    op_sdpa,
     op_sigmoid,
     op_skip_ops,
     op_slice_copy,
111 changes: 0 additions & 111 deletions backends/xnnpack/operators/op_sdpa.py

This file was deleted.

2 changes: 0 additions & 2 deletions backends/xnnpack/partition/config/__init__.py
@@ -43,7 +43,6 @@
     QuantizedPerTensorConfig,
     ReciprocalSquareRootConfig,
     ReLUConfig,
-    # SDPAConfig, TODO: D60553559: preserving SDPA for fairseq fails
     SigmoidConfig,
     SliceCopyConfig,
     SoftmaxConfig,
@@ -99,7 +98,6 @@
     PreluConfig,
     ReciprocalSquareRootConfig,
     ReLUConfig,
-    # SDPAConfig, TODO: D60553559: preserving SDPA for fairseq fails
     SigmoidConfig,
     SliceCopyConfig,
     SoftmaxConfig,
30 changes: 0 additions & 30 deletions backends/xnnpack/partition/config/generic_node_configs.py
@@ -527,33 +527,3 @@ class BMMConfig(GenericNodePartitionerConfig):

     def supported_precision_types(self) -> List[ConfigPrecisionType]:
         return [ConfigPrecisionType.FP32]
-
-
-class SDPAConfig(GenericNodePartitionerConfig):
-    target_name = "scaled_dot_product_attention.default"
-
-    def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool:
-        """
-        Requires Mask to have Rank 2
-        """
-        if not self.check_common_constraints(node, ep):
-            return False
-
-        if len(node.all_input_nodes) < 4:
-            return False
-        mask_node = node.all_input_nodes[3]
-        mask_rank = mask_node.meta["val"].dim()
-        if mask_rank != 2:
-            why(
-                node,
-                reason=f"mask must have rank 2, got mask of rank {mask_rank}",
-            )
-            return False
-
-        return True
-
-    def get_original_aten(self) -> Optional[torch._ops.OpOverload]:
-        return torch.ops.aten.scaled_dot_product_attention.default
-
-    def supported_precision_types(self) -> List[ConfigPrecisionType]:
-        return [ConfigPrecisionType.FP32]
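
Note for reviewers: the deleted check_constraints above only partitioned SDPA calls whose attention mask had rank 2. A minimal sketch of what that accepted (shapes and values are illustrative assumptions, not taken from this PR):

import torch
import torch.nn.functional as F

# Hypothetical shapes: (batch, heads, seq_len, head_dim)
q = torch.randn(1, 4, 16, 64)
k = torch.randn(1, 4, 16, 64)
v = torch.randn(1, 4, 16, 64)

# Rank-2 (seq_len, seq_len) additive mask: mask.dim() == 2, so the deleted
# config would have partitioned this call to XNNPACK.
mask = torch.zeros(16, 16)
out = F.scaled_dot_product_attention(q, k, v, attn_mask=mask)

# A broadcast rank-4 mask, e.g. torch.zeros(1, 1, 16, 16), has dim() == 4;
# check_constraints would log "mask must have rank 2, got mask of rank 4"
# and decline the node.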
37 changes: 0 additions & 37 deletions backends/xnnpack/runtime/XNNCompiler.cpp
@@ -1961,42 +1961,6 @@ Error defineStaticSliceNode(
   return Error::Ok;
 }
 
-/*
-Defines Scaled Dot Product Attention (SDPA) node into the subgraph,
-using the remapped ids to map the serialized ids,
-to the new ids generated when defining the tensor value
-*/
-Error defineScaledDotProductAttentionNode(
-    xnn_subgraph_t subgraph_ptr,
-    const std::unordered_map<uint32_t, uint32_t>& remapped_ids,
-    const NodePtr node,
-    const fb_xnnpack::XNNGraph* graph) noexcept {
-  MAYBE_UNUSED(graph);
-
-  auto graph_node = node->xnode_union_as_XNNScaledDotProductAttention();
-
-  xnn_status status = xnn_define_scaled_dot_product_attention(
-      subgraph_ptr,
-      xnn_attention_logits_cap_type_none, // cap_type
-      nullptr, // cap_value - not used
-      remapped_ids.at(graph_node->query_id()),
-      remapped_ids.at(graph_node->key_id()),
-      remapped_ids.at(graph_node->value_id()),
-      remapped_ids.at(graph_node->scale_id()),
-      remapped_ids.at(graph_node->mask_id()),
-      remapped_ids.at(graph_node->output_id()),
-      graph_node->flags());
-
-  ET_CHECK_OR_RETURN_ERROR(
-      status == xnn_status_success,
-      Internal,
-      "Failed to create SDPA node %i with code: %s",
-      node->debug_handle(),
-      xnn_status_to_string(status));
-
-  return Error::Ok;
-}
-
 /*
 Defines batch matrix multiply node into the subgraph,
 using the remapped ids to map the serialized ids,
@@ -2097,7 +2061,6 @@ DefineNodeFunc getDefineNodeFunc(fb_xnnpack::XNodeUnion nodeType) {
     _DEFINE(Concatenate4)
     _DEFINE(Concatenate5)
     _DEFINE(StaticSlice)
-    _DEFINE(ScaledDotProductAttention)
     _DEFINE(BatchMatrixMultiply)
     case fb_xnnpack::XNodeUnion::NONE:
     default: // Adding here as a catch all, just in case
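
Note: like the other define*Node functions in XNNCompiler.cpp, the deleted SDPA definition resolves each flatbuffer-serialized tensor id through remapped_ids before handing it to XNNPACK. A rough Python sketch of that lookup (dict contents and field values are hypothetical stand-ins for the serialized graph):

# serialized id (flatbuffer graph) -> runtime id (assigned when the tensor
# value was defined in the live xnn_subgraph)
remapped_ids = {7: 0, 8: 1, 9: 2, 10: 3, 11: 4, 12: 5}

# Stand-in for the XNNScaledDotProductAttention node's fields.
sdpa_node = {"query_id": 7, "key_id": 8, "value_id": 9,
             "scale_id": 10, "mask_id": 11, "output_id": 12}

# Equivalent of the remapped_ids.at(...) calls in the deleted C++ above;
# these ids would be passed to xnn_define_scaled_dot_product_attention.
args = [remapped_ids[sdpa_node[f]]
        for f in ("query_id", "key_id", "value_id",
                  "scale_id", "mask_id", "output_id")]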
130 changes: 0 additions & 130 deletions backends/xnnpack/test/ops/test_sdpa.py

This file was deleted.
