Commit 4e6310c

cpu : add batching and F16/I32 support to win_part/win_unpart ops/get_rel_pos
1 parent b6b9f02 commit 4e6310c

3 files changed: +193, -55 lines

ggml/include/ggml.h

Lines changed: 4 additions & 8 deletions
@@ -2279,18 +2279,16 @@ extern "C" {
             struct ggml_tensor  * ids);
 
     // partition into non-overlapping windows with padding if needed
-    // example:
-    // a:   768   64 64 1
-    // w:   14
-    // res: 768   14 14 25
-    // used in sam
+    // a:      [B, H, W, C]
+    // result: [B*NPY*NPX, w, w, C]
+    // NPY = ceil(H/w)
+    // NPX = ceil(W/w)
     GGML_API struct ggml_tensor * ggml_win_part(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             int                   w);
 
     // reverse of ggml_win_part
-    // used in sam
     GGML_API struct ggml_tensor * ggml_win_unpart(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
@@ -2308,14 +2306,12 @@ extern "C" {
             struct ggml_tensor  * a,
             enum ggml_unary_op    op);
 
-    // used in sam
     GGML_API struct ggml_tensor * ggml_get_rel_pos(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             int                   qh,
             int                   kh);
 
-    // used in sam
     GGML_API struct ggml_tensor * ggml_add_rel_pos(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
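The new header comment is effectively a shape contract. Below is a minimal sketch of the batched round trip with illustrative sizes, assuming a live ggml context with enough memory and that the new ggml_win_unpart_ext from ggml.c is declared for the caller; the helper name win_part_roundtrip_sketch is ours, not the commit's.

#include "ggml.h"

// Sketch: ggml stores dims innermost-first, so a [B=2, H=64, W=64, C=768]
// activation is created as ne = (768, 64, 64, 2).
static void win_part_roundtrip_sketch(struct ggml_context * ctx) {
    struct ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 768, 64, 64, 2);

    // w = 14 pads 64 up to 70, so NPX = NPY = 5 windows per image and the
    // result shape is (768, 14, 14, 2*5*5 = 50).
    struct ggml_tensor * p = ggml_win_part(ctx, a, 14);

    // Batched inverse added by this commit: pass the original W, H and the
    // batch size to get back a (768, 64, 64, 2) tensor.
    struct ggml_tensor * u = ggml_win_unpart_ext(ctx, p, 64, 64, 2, 14);

    GGML_ASSERT(p->ne[3] == 50);
    GGML_ASSERT(u->ne[3] == 2);
}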

ggml/src/ggml-cpu/ops.cpp

Lines changed: 171 additions & 37 deletions
@@ -8946,35 +8946,80 @@ static void ggml_compute_forward_win_part_f32(
 
     const ggml_tensor * src0 = dst->src[0];
 
-    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne)
-    GGML_TENSOR_LOCALS(int64_t, ne,  dst,  ne)
+    GGML_TENSOR_UNARY_OP_LOCALS
 
     const int32_t nep0 = ((const int32_t *)(dst->op_params))[0];
     const int32_t nep1 = ((const int32_t *)(dst->op_params))[1];
-    const int32_t w    = ((const int32_t *)(dst->op_params))[2];
+    const int32_t bs   = ((const int32_t *)(dst->op_params))[2];
+    const int32_t w    = ((const int32_t *)(dst->op_params))[3];
 
     assert(ne00 == ne0);
-    assert(ne3  == nep0*nep1);
+    assert(ne3  == nep0*nep1*bs);
 
     // TODO: optimize / multi-thread
-    for (int py = 0; py < nep1; ++py) {
-        for (int px = 0; px < nep0; ++px) {
-            const int64_t i3 = py*nep0 + px;
-            for (int64_t i2 = 0; i2 < ne2; ++i2) {
-                for (int64_t i1 = 0; i1 < ne1; ++i1) {
-                    for (int64_t i0 = 0; i0 < ne0; ++i0) {
-                        const int64_t i02 = py*w + i2;
-                        const int64_t i01 = px*w + i1;
-                        const int64_t i00 = i0;
-
-                        const int64_t i = i3*ne2*ne1*ne0 + i2*ne1*ne0    + i1*ne0   + i0;
-                        const int64_t j =                  i02*ne01*ne00 + i01*ne00 + i00;
-
-                        if (py*w + i2 >= ne02 || px*w + i1 >= ne01) {
-                            ((float *) dst->data)[i] = 0.0f;
-                        } else {
-                            ((float *) dst->data)[i] = ((float *) src0->data)[j];
-                        }
+    for (int64_t i3 = 0; i3 < ne3; i3++) {
+        int px = i3 % nep0;
+        int py = (i3 / nep0) % nep1;
+        int b  = i3 / (nep0 * nep1);
+        for (int64_t i2 = 0; i2 < ne2; ++i2) {
+            for (int64_t i1 = 0; i1 < ne1; ++i1) {
+                for (int64_t i0 = 0; i0 < ne0; ++i0) {
+                    const int64_t i03 = b;
+                    const int64_t i02 = py*w + i2;
+                    const int64_t i01 = px*w + i1;
+                    const int64_t i00 = i0;
+
+                    void * sp = ((void *) src0->data) + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00;
+                    void * dp = ((void *)  dst->data) +  i3*nb3  +  i2*nb2  +  i1*nb1  +  i0*nb0;
+
+                    if (py*w + i2 >= ne02 || px*w + i1 >= ne01) {
+                        *((float *) dp) = 0;
+                    } else {
+                        *((float *) dp) = *((float *) sp);
+                    }
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_win_part_f16(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+    GGML_UNUSED(params);
+
+    const ggml_tensor * src0 = dst->src[0];
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    const int32_t nep0 = ((const int32_t *)(dst->op_params))[0];
+    const int32_t nep1 = ((const int32_t *)(dst->op_params))[1];
+    const int32_t bs   = ((const int32_t *)(dst->op_params))[2];
+    const int32_t w    = ((const int32_t *)(dst->op_params))[3];
+
+    assert(ne00 == ne0);
+    assert(ne3  == nep0*nep1*bs);
+
+    // TODO: optimize / multi-thread
+    for (int64_t i3 = 0; i3 < ne3; i3++) {
+        int px = i3 % nep0;
+        int py = (i3 / nep0) % nep1;
+        int b  = i3 / (nep0 * nep1);
+        for (int64_t i2 = 0; i2 < ne2; ++i2) {
+            for (int64_t i1 = 0; i1 < ne1; ++i1) {
+                for (int64_t i0 = 0; i0 < ne0; ++i0) {
+                    const int64_t i03 = b;
+                    const int64_t i02 = py*w + i2;
+                    const int64_t i01 = px*w + i1;
+                    const int64_t i00 = i0;
+
+                    void * sp = ((void *) src0->data) + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00;
+                    void * dp = ((void *)  dst->data) +  i3*nb3  +  i2*nb2  +  i1*nb1  +  i0*nb0;
+
+                    if (py*w + i2 >= ne02 || px*w + i1 >= ne01) {
+                        *((ggml_fp16_t *) dp) = 0;
+                    } else {
+                        *((ggml_fp16_t *) dp) = *((ggml_fp16_t *) sp);
                     }
                 }
             }
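The rewritten loop recovers the window coordinates and the batch index from the destination's outer dimension instead of iterating the window grid directly; a worked decomposition with illustrative numbers, not taken from the commit:

// For nep0 = 5, nep1 = 5, bs = 2 and destination index i3 = 27:
//   px = 27 % 5       = 2   // window column within the image
//   py = (27 / 5) % 5 = 0   // window row within the image
//   b  = 27 / (5 * 5) = 1   // which image of the batch
// i.e. window (2, 0) of the second image; px varies fastest and the batch
// index slowest, matching np = npx*npy*bs on the allocation side.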
@@ -8989,10 +9034,16 @@ void ggml_compute_forward_win_part(
     const ggml_tensor * src0 = dst->src[0];
 
     switch (src0->type) {
+        case GGML_TYPE_I32:
         case GGML_TYPE_F32:
             {
                 ggml_compute_forward_win_part_f32(params, dst);
             } break;
+        case GGML_TYPE_BF16:
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_win_part_f16(params, dst);
+            } break;
         default:
             {
                 GGML_ABORT("fatal error");
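Reusing the F32 kernel for I32 and the F16 kernel for BF16 is sound here because the loops never interpret values: they only move same-width elements or zero-fill padding, and an all-zeros bit pattern means zero in each of these formats, so the copies are effectively raw 4-byte and 2-byte moves.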
@@ -9009,35 +9060,82 @@ static void ggml_compute_forward_win_unpart_f32(
 
     const ggml_tensor * src0 = dst->src[0];
 
-    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne)
-    GGML_TENSOR_LOCALS(int64_t, ne,  dst,  ne)
+    GGML_TENSOR_UNARY_OP_LOCALS
 
     const int32_t w = ((const int32_t *)(dst->op_params))[0];
 
     // padding
     const int px = (w - ne1%w)%w;
-    //const int py = (w - ne2%w)%w;
+    const int py = (w - ne2%w)%w;
 
     const int npx = (px + ne1)/w;
-    //const int npy = (py + ne2)/w;
+    const int npy = (py + ne2)/w;
 
     assert(ne0 == ne00);
+    assert(ne03 == npx*npy*ne3);
 
     // TODO: optimize / multi-thread
-    for (int64_t i2 = 0; i2 < ne2; ++i2) {
-        for (int64_t i1 = 0; i1 < ne1; ++i1) {
-            for (int64_t i0 = 0; i0 < ne0; ++i0) {
-                const int ip2 = i2/w;
-                const int ip1 = i1/w;
-
-                const int64_t i02 = i2%w;
-                const int64_t i01 = i1%w;
-                const int64_t i00 = i0;
-
-                const int64_t i = (ip2*npx + ip1)*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00 + i00;
-                const int64_t j =                                   i2*ne1*ne0   + i1*ne0   + i0;
-
-                ((float *) dst->data)[j] = ((float *) src0->data)[i];
+    for (int64_t i3 = 0; i3 < ne3; ++i3) {
+        for (int64_t i2 = 0; i2 < ne2; ++i2) {
+            for (int64_t i1 = 0; i1 < ne1; ++i1) {
+                for (int64_t i0 = 0; i0 < ne0; ++i0) {
+                    const int ip2 = i2/w;
+                    const int ip1 = i1/w;
+
+                    const int64_t i03 = i3*npx*npy + ip2*npx + ip1;
+                    const int64_t i02 = i2%w;
+                    const int64_t i01 = i1%w;
+                    const int64_t i00 = i0;
+
+                    void * sp = ((void *) src0->data) + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00;
+                    void * dp = ((void *)  dst->data) +  i3*nb3  +  i2*nb2  +  i1*nb1  +  i0*nb0;
+
+                    *((float *) dp) = *((float *) sp);
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_win_unpart_f16(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+    GGML_UNUSED(params);
+
+    const ggml_tensor * src0 = dst->src[0];
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    const int32_t w = ((const int32_t *)(dst->op_params))[0];
+
+    // padding
+    const int px = (w - ne1%w)%w;
+    const int py = (w - ne2%w)%w;
+
+    const int npx = (px + ne1)/w;
+    const int npy = (py + ne2)/w;
+
+    assert(ne0 == ne00);
+    assert(ne03 == npx*npy*ne3);
+
+    // TODO: optimize / multi-thread
+    for (int64_t i3 = 0; i3 < ne3; ++i3) {
+        for (int64_t i2 = 0; i2 < ne2; ++i2) {
+            for (int64_t i1 = 0; i1 < ne1; ++i1) {
+                for (int64_t i0 = 0; i0 < ne0; ++i0) {
+                    const int ip2 = i2/w;
+                    const int ip1 = i1/w;
+
+                    const int64_t i03 = i3*npx*npy + ip2*npx + ip1;
+                    const int64_t i02 = i2%w;
+                    const int64_t i01 = i1%w;
+                    const int64_t i00 = i0;
+
+                    void * sp = ((void *) src0->data) + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00;
+                    void * dp = ((void *)  dst->data) +  i3*nb3  +  i2*nb2  +  i1*nb1  +  i0*nb0;
+
+                    *((ggml_fp16_t *) dp) = *((ggml_fp16_t *) sp);
+                }
             }
         }
     }
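A worked check, again with illustrative numbers, that this indexing inverts the partition op:

// w = 14, unpartitioned width ne1 = 64  =>  pad px = (14 - 64%14)%14 = 6
// and npx = (6 + 64)/14 = 5, as in win_part. For destination column i1 = 30:
//   ip1 = 30 / 14 = 2   // which window column it came from
//   i01 = 30 % 14 = 2   // offset inside that window
// The forward op stored source column px*w + i1 = 2*14 + 2 = 30 there, so the
// value lands back where it started, and i03 = i3*npx*npy + ip2*npx + ip1
// undoes the i3 = b*nep0*nep1 + py*nep0 + px linearization.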
@@ -9050,10 +9148,16 @@ void ggml_compute_forward_win_unpart(
     const ggml_tensor * src0 = dst->src[0];
 
     switch (src0->type) {
+        case GGML_TYPE_I32:
         case GGML_TYPE_F32:
             {
                 ggml_compute_forward_win_unpart_f32(params, dst);
             } break;
+        case GGML_TYPE_BF16:
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_win_unpart_f16(params, dst);
+            } break;
         default:
             {
                 GGML_ABORT("fatal error");
@@ -9199,6 +9303,32 @@ void ggml_compute_forward_glu(
 
 // ggml_compute_forward_get_rel_pos
 
+static void ggml_compute_forward_get_rel_pos_f32(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+    GGML_UNUSED(params);
+
+    const ggml_tensor * src0 = dst->src[0];
+
+    // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L292-L322
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    const int64_t w = ne1;
+
+    float * src0_data = (float *) src0->data;
+    float * dst_data  = (float *)  dst->data;
+
+    for (int64_t i2 = 0; i2 < ne2; ++i2) {
+        for (int64_t i1 = 0; i1 < ne1; ++i1) {
+            const int64_t pos = (w - i1 - 1) + i2;
+            for (int64_t i0 = 0; i0 < ne0; ++i0) {
+                dst_data[i2*ne1*ne0 + i1*ne0 + i0] = src0_data[pos*ne00 + i0];
+            }
+        }
+    }
+}
+
 static void ggml_compute_forward_get_rel_pos_f16(
         const ggml_compute_params * params,
         ggml_tensor * dst) {
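The lookup pos = (w - i1 - 1) + i2, with w = ne1 = kh, is the usual relative-position indexing: entry (i2, i1) selects the embedding for offset i2 - i1, shifted by kh - 1 so it stays non-negative. A small worked case with kh = qh = 3:

// src0 holds 2*3 - 1 = 5 embeddings, one per offset in -2..+2:
//   i2 = 0, i1 = 2  ->  pos = (3 - 2 - 1) + 0 = 0   // offset -2
//   i2 = 1, i1 = 1  ->  pos = (3 - 1 - 1) + 1 = 2   // offset  0
//   i2 = 2, i1 = 0  ->  pos = (3 - 0 - 1) + 2 = 4   // offset +2
// The largest pos reached is qh + kh - 2, which is exactly why ggml_get_rel_pos
// (see ggml.c below) now only asserts qh + kh - 1 <= a->ne[1].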
@@ -9232,6 +9362,10 @@ void ggml_compute_forward_get_rel_pos(
     const ggml_tensor * src0 = dst->src[0];
 
     switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_get_rel_pos_f32(params, dst);
+            } break;
         case GGML_TYPE_F16:
         case GGML_TYPE_BF16:
             {

ggml/src/ggml.c

Lines changed: 18 additions & 10 deletions
@@ -5315,21 +5315,19 @@ struct ggml_tensor * ggml_win_part(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
         int                   w) {
-    GGML_ASSERT(a->ne[3] == 1);
-    GGML_ASSERT(a->type  == GGML_TYPE_F32);
-
     // padding
     const int px = (w - a->ne[1]%w)%w;
     const int py = (w - a->ne[2]%w)%w;
 
+    const int bs  = a->ne[3];
     const int npx = (px + a->ne[1])/w;
     const int npy = (py + a->ne[2])/w;
-    const int np  = npx*npy;
+    const int np  = npx*npy*bs;
 
     const int64_t ne[4] = { a->ne[0], w, w, np, };
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
 
-    int32_t params[] = { npx, npy, w };
+    int32_t params[] = { npx, npy, bs, w };
     ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_WIN_PART;
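Dropping the two asserts is what opens the op to batches and to non-F32 inputs; the batch size now rides to the CPU kernel as op_params[2], with w shifted to op_params[3]. For example, a->ne = (768, 64, 64, 2) with w = 14 still gives npx = npy = 5, but np = 5*5*2 = 50 output windows.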
@@ -5346,10 +5344,20 @@ struct ggml_tensor * ggml_win_unpart(
         int                   w0,
         int                   h0,
         int                   w) {
-    GGML_ASSERT(a->type == GGML_TYPE_F32);
+    return ggml_win_unpart_ext(ctx, a, w0, h0, 1, w);
+}
+
+struct ggml_tensor * ggml_win_unpart_ext(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int                   w0,
+        int                   h0,
+        int                   b0,
+        int                   w) {
+    const int64_t ne[4] = { a->ne[0], w0, h0, b0 };
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
 
-    const int64_t ne[4] = { a->ne[0], w0, h0, 1, };
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
+    GGML_ASSERT(ggml_is_contiguous(a));
 
     int32_t params[] = { w };
     ggml_set_op_params(result, params, sizeof(params));
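The original entry point survives as a thin wrapper that forwards with a batch of one, so existing callers keep their behavior. Note the result is also now allocated as a true 4-D tensor; the old code passed 3 dims to ggml_new_tensor even though it built a 4-element ne array.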
@@ -5367,8 +5375,7 @@ struct ggml_tensor * ggml_get_rel_pos(
         struct ggml_tensor  * a,
         int                   qh,
         int                   kh) {
-    GGML_ASSERT(qh == kh);
-    GGML_ASSERT(2*MAX(qh, kh) - 1 == a->ne[1]);
+    GGML_ASSERT(qh + kh - 1 <= a->ne[1]);
 
     const int64_t ne[4] = { a->ne[0], kh, qh, 1, };
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 3, ne);
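The relaxed assertion drops the qh == kh requirement and only demands that the table hold at least qh + kh - 1 rows, the largest index the kernel's pos = (w - i1 - 1) + i2 lookup can reach; presumably this is to allow query and key grids of different sizes, which the referenced SAM code supports.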
@@ -6421,6 +6428,7 @@ static void ggml_compute_backward(
             } break;
         case GGML_OP_WIN_PART:
         case GGML_OP_WIN_UNPART:
+        case GGML_OP_GET_REL_POS:
         case GGML_OP_UNARY: {
             switch (ggml_get_unary_op(tensor)) {
                 case GGML_UNARY_OP_ABS: {
