@@ -1853,32 +1853,54 @@ kernel void kernel_cumsum(
18531853 ushort sgitg[[simdgroup_index_in_threadgroup]],
18541854 ushort tiisg[[thread_index_in_simdgroup]],
18551855 ushort3 ntg[[threads_per_threadgroup]]) {
1856- const int64_t i3 = tgpig.z ;
1857- const int64_t i2 = tgpig.y ;
1858- const int64_t i1 = tgpig.x ;
18591856
1860- if (i3 >= args.ne03 || i2 >= args.ne02 || i1 >= args.ne01 ) {
1857+ // Figure out the size and stride of the cumsum dim
1858+ const int64_t ne_dim = (args.dim == 0 ) ? args.ne00 : (args.dim == 1 ) ? args.ne01 : (args.dim == 2 ) ? args.ne02 : args.ne03 ;
1859+ const int64_t nb_dim_src = (args.dim == 0 ) ? args.nb00 : (args.dim == 1 ) ? args.nb01 : (args.dim == 2 ) ? args.nb02 : args.nb03 ;
1860+ const int64_t nb_dim_dst = (args.dim == 0 ) ? args.nb0 : (args.dim == 1 ) ? args.nb1 : (args.dim == 2 ) ? args.nb2 : args.nb3 ;
1861+
1862+ // Map threadgroup indices to actual tensor dimensions
1863+ // tgpig.x, tgpig.y, tgpig.z represent the 3 non-cumsum dimensions
1864+ // tpitg.x represents position in the cumsum dimension
1865+ int64_t grid_indices[3 ] = {int64_t (tgpig.x ), int64_t (tgpig.y ), int64_t (tgpig.z )};
1866+ int64_t i_vals[4 ];
1867+
1868+ int grid_idx = 0 ;
1869+ for (int d = 0 ; d < 4 ; ++d) {
1870+ if (d == args.dim ) {
1871+ i_vals[d] = 0 ; // Will be set in the loop below
1872+ } else {
1873+ i_vals[d] = grid_indices[grid_idx++];
1874+ }
1875+ }
1876+
1877+ // Base index offsets. The cumsum dim will be further offset by the position
1878+ // in the threadgroup
1879+ const int64_t i0 = i_vals[0 ];
1880+ const int64_t i1 = i_vals[1 ];
1881+ const int64_t i2 = i_vals[2 ];
1882+ const int64_t i3 = i_vals[3 ];
1883+
1884+ if (i3 >= args.ne03 || i2 >= args.ne02 || i1 >= args.ne01 || i0 >= args.ne00 ) {
18611885 return ;
18621886 }
18631887
1864- device const T * src_row = (device const T *) ((device const char *) src0 + i1*args.nb01 + i2*args.nb02 + i3*args.nb03 );
1865- device T * dst_row = (device T *) ((device char *) dst + i1*args.nb1 + i2*args.nb2 + i3*args.nb3 );
1888+ // Each thread processes elements at stride ntg.x along the cumsum dimension
1889+ for (int64_t i_dim = tpitg.x ; i_dim < ne_dim; i_dim += ntg.x ) {
1890+ const int64_t offset_src = i0*args.nb00 + i1*args.nb01 + i2*args.nb02 + i3*args.nb03 + i_dim*nb_dim_src;
1891+ const int64_t offset_dst = i0*args.nb0 + i1*args.nb1 + i2*args.nb2 + i3*args.nb3 + i_dim*nb_dim_dst;
18661892
1867- // Each thread is a single element of the row if ne00 < max threads per
1868- // threadgroup, so this will loop once for each index that this thread is
1869- // responsible for
1870- for (int64_t i0 = tpitg.x ; i0 < args.ne00 ; i0 += ntg.x ) {
1893+ device const T * src_ptr = (device const T *) ((device const char *) src0 + offset_src);
1894+ device T * dst_ptr = (device T *) ((device char *) dst + offset_dst);
18711895
1872- // Each thread does simd_prefix_inclusive_sum => every element of row
1873- // now holds cumsum of the simd group
1874- float sumf = static_cast <float >(src_row[i0]);
1896+ // Each thread does simd_prefix_inclusive_sum
1897+ float sumf = static_cast <float >(src_ptr[0 ]);
18751898 sumf = simd_prefix_inclusive_sum (sumf);
1876- dst_row[i0 ] = static_cast <T>(sumf);
1899+ dst_ptr[ 0 ] = static_cast <T>(sumf);
18771900
1878- // If this is the last element of the simd group, store its value in
1879- // shared memory
1880- if (tiisg == N_SIMDWIDTH - 1 || i0 == args.ne00 - 1 ) {
1881- const ushort shmem_idx = i0 / N_SIMDWIDTH;
1901+ // If this is the last element of the simd group, store its value in shared memory
1902+ if (tiisg == N_SIMDWIDTH - 1 || i_dim == ne_dim - 1 ) {
1903+ const ushort shmem_idx = i_dim / N_SIMDWIDTH;
18821904 shmem_f32[shmem_idx] = sumf;
18831905 }
18841906 }
@@ -1887,10 +1909,13 @@ kernel void kernel_cumsum(
18871909 threadgroup_barrier (mem_flags::mem_threadgroup);
18881910
18891911 // Each element then adds the final value of all preceding simd groups
1890- for (int64_t i0 = tpitg.x ; i0 < args.ne00 ; i0 += ntg.x ) {
1891- const ushort shmem_idx = i0 / N_SIMDWIDTH;
1912+ for (int64_t i_dim = tpitg.x ; i_dim < ne_dim; i_dim += ntg.x ) {
1913+ const int64_t offset_dst = i0*args.nb0 + i1*args.nb1 + i2*args.nb2 + i3*args.nb3 + i_dim*nb_dim_dst;
1914+ device T * dst_ptr = (device T *) ((device char *) dst + offset_dst);
1915+
1916+ const ushort shmem_idx = i_dim / N_SIMDWIDTH;
18921917 for (ushort j = 0 ; j < shmem_idx; ++j) {
1893- dst_row[i0 ] += static_cast <T>(shmem_f32[j]);
1918+ dst_ptr[ 0 ] += static_cast <T>(shmem_f32[j]);
18941919 }
18951920 }
18961921}
0 commit comments