Skip to content

Commit 3426f64

Browse files
Support activation broadcasting in XNNPACK Matmul (microsoft#24908)
### Description 1. Support activation broadcasting in XNNPACK Matmul. 2. Fix a subtle bug when the activation input is 1-D. Per the existing gating logic, 1-D activations were allowed, but the batch size being passed through did not account for this. The batch size passed in was always `a->Shape()[0]`, which for a 1-D input is actually the reduction dimension (K). This is incorrect because, for a 1-D activation input, a `1` is prepended to the shape, meaning we should actually have passed in `1` for the batch. This passed the relevant test, but it likely would have written outside the bounds of the output buffer because of the non-unit batch being passed through. ### Motivation and Context Resolve microsoft#24107 --------- Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
1 parent 36fc8c8 commit 3426f64

File tree

2 files changed

+29
-5
lines changed

2 files changed

+29
-5
lines changed

onnxruntime/core/providers/xnnpack/math/matmul.cc

Lines changed: 22 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -41,9 +41,8 @@ bool MatMul::IsOnnxNodeSupported(const NodeUnit& node_unit, const GraphViewer& g
4141
break;
4242
}
4343

44-
if (A_shape == nullptr || A_shape->dim_size() > 2 ||
45-
(A_shape->dim_size() == 2 && A_shape->dim(1).dim_value() == 0) ||
46-
A_shape->dim(0).dim_value() == 0) {
44+
// A must at-least be 1-D
45+
if (A_shape == nullptr || A_shape->dim_size() < 1) {
4746
break;
4847
}
4948

@@ -162,10 +161,28 @@ Status MatMul::Compute(OpKernelContext* ctx) const {
162161
xnn_status status = xnn_status_success;
163162

164163
pthreadpool_t threadpool = GetThreadPool();
164+
165+
// If the input 'A' is 1-D, then it is prepended with 1 and hence batch will be 1
166+
size_t batch = 1;
167+
168+
const auto& a_dims = a->Shape();
169+
int64_t rank = a_dims.NumDimensions();
170+
171+
if (rank == 2) {
172+
batch = a_dims[0];
173+
} else if (rank > 2) {
174+
// Input 'A' is N-dimensional, the batch is made up of the product of the outermost dims
175+
// (excluding the actual inner reduction dim)
176+
177+
for (int64_t i = 0; i < rank - 1; ++i) {
178+
batch *= a_dims[i];
179+
}
180+
}
181+
165182
if (op_type_ == OpComputeType::op_compute_type_fp32) {
166-
status = xnn_reshape_fully_connected_nc_f32(op0_.get(), a->Shape()[0], threadpool);
183+
status = xnn_reshape_fully_connected_nc_f32(op0_.get(), batch, threadpool);
167184
} else if (op_type_ == OpComputeType::op_compute_type_fp16) {
168-
status = xnn_reshape_fully_connected_nc_f16(op0_.get(), a->Shape()[0], threadpool);
185+
status = xnn_reshape_fully_connected_nc_f16(op0_.get(), batch, threadpool);
169186
}
170187

171188
if (status != xnn_status_success) {

onnxruntime/test/providers/cpu/math/matmul_test.cc

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,13 @@ std::vector<MatMulTestData<T>> GenerateTestCases() {
6161
{3, 2, 1, 2},
6262
real_expected_vals({2, 3, 6, 7, 6, 11, 26, 31, 10, 19, 46, 55})});
6363

64+
test_cases.push_back(
65+
{"test padding and broadcast A > B - no broadcast in B",
66+
{2, 2, 3, 2},
67+
{2, 1},
68+
{2, 2, 3, 1},
69+
real_expected_vals({1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23})});
70+
6471
test_cases.push_back(
6572
{"test padding and broadcast B > A",
6673
{2, 3, 2},

0 commit comments

Comments
 (0)