@@ -93,31 +93,26 @@ static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * s
 
 template <typename T>
 static __global__ void k_repeat_back(
-    const T * __restrict__ src, T * __restrict__ dst, const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
-    const size_t s00, const size_t s01, const size_t s02, const size_t s03,
-    const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3) {
+    const T * __restrict__ src, T * __restrict__ dst, const int64_t ne00, const int64_t ne01, const int64_t ne02,
+    const int64_t ne0, const int64_t ne1, const int64_t ne2) {
 
-    const int64_t tid0  = int64_t(blockIdx.x)*blockDim.x + threadIdx.x;
-    const int64_t tid1  = int64_t(blockIdx.y)*blockDim.y + threadIdx.y;
-    const int64_t tid23 = int64_t(blockIdx.z)*blockDim.z + threadIdx.z;
-    const int64_t tid2  = tid23 % ne2;
-    const int64_t tid3  = tid23 / ne2;
+    const int64_t tid0 = (int64_t) blockIdx.x*blockDim.x + threadIdx.x;
+    const int64_t tid1 = (int64_t) blockIdx.y*blockDim.y + threadIdx.y;
+    const int64_t tid2 = (int64_t) blockIdx.z*blockDim.z + threadIdx.z;
 
     if (tid0 >= ne0) {
         return;
     }
 
     T sum = 0;
-    for (int64_t i3 = tid3; i3 < ne03; i3 += ne3) {
-        for (int64_t i2 = tid2; i2 < ne02; i2 += ne2) {
-            for (int64_t i1 = tid1; i1 < ne01; i1 += ne1) {
-                for (int64_t i0 = tid0; i0 < ne00; i0 += ne0) {
-                    sum += src[i3*s03 + i2*s02 + i1*s01 + i0*s00];
-                }
+    for (int64_t i2 = tid2; i2 < ne02; i2 += ne2) {
+        for (int64_t i1 = tid1; i1 < ne01; i1 += ne1) {
+            for (int64_t i0 = tid0; i0 < ne00; i0 += ne0) {
+                sum += src[i2*ne01*ne00 + i1*ne00 + i0];
             }
         }
     }
-    dst[tid3*ne2*ne1*ne0 + tid2*ne1*ne0 + tid1*ne0 + tid0] = sum;
+    dst[tid2*ne1*ne0 + tid1*ne0 + tid0] = sum;
 }
 
 template <float (*bin_op)(const float, const float)>
@@ -279,14 +274,12 @@ struct bin_bcast_cuda {
 
 template <typename T>
 static void repeat_back_cuda(
-    const T * src, T * dst, const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
-    const size_t s00, const size_t s01, const size_t s02, const size_t s03,
-    const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3, cudaStream_t stream) {
+    const T * src, T * dst, const int64_t ne00, const int64_t ne01, const int64_t ne02,
+    const int64_t ne0, const int64_t ne1, const int64_t ne2, cudaStream_t stream) {
 
     const dim3 block_dims(WARP_SIZE, 1, 1);
-    const dim3 block_nums((ne0 + WARP_SIZE - 1) / WARP_SIZE, ne1, ne2*ne3);
-    k_repeat_back<T><<<block_nums, block_dims, 0, stream>>>
-        (src, dst, ne00, ne01, ne02, ne03, s00, s01, s02, s03, ne0, ne1, ne2, ne3);
+    const dim3 block_nums((ne0 + WARP_SIZE - 1) / WARP_SIZE, ne1, ne2);
+    k_repeat_back<T><<<block_nums, block_dims, 0, stream>>>(src, dst, ne00, ne01, ne02, ne0, ne1, ne2);
 }
 
 template <class op>
@@ -333,26 +326,27 @@ void ggml_cuda_op_repeat_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst
     const ggml_tensor * src0 = dst->src[0];
 
     GGML_ASSERT(src0->type == dst->type);
+    GGML_ASSERT(ggml_is_contiguous(src0));
     GGML_ASSERT(ggml_is_contiguous(dst));
     GGML_ASSERT(ggml_can_repeat(dst, src0));
 
     cudaStream_t stream = ctx.stream();
 
-    GGML_TENSOR_UNARY_OP_LOCALS;
-
-    GGML_ASSERT(ne2*ne3 <= (1 << 15));
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    GGML_ASSERT(src0->ne[3] == 1);
 
-    const size_t ts  = ggml_type_size(src0->type);
-    const size_t s00 = nb00 / ts;
-    const size_t s01 = nb01 / ts;
-    const size_t s02 = nb02 / ts;
-    const size_t s03 = nb03 / ts;
+    const int64_t ne0 = dst->ne[0];
+    const int64_t ne1 = dst->ne[1];
+    const int64_t ne2 = dst->ne[2];
+    GGML_ASSERT(dst->ne[3] == 1);
 
     switch (dst->type) {
         case GGML_TYPE_F32: {
             const float * src0_d = (const float *) src0->data;
             float * dst_d = (float *) dst->data;
-            repeat_back_cuda(src0_d, dst_d, ne00, ne01, ne02, ne03, s00, s01, s02, s03, ne0, ne1, ne2, ne3, stream);
+            repeat_back_cuda<float>(src0_d, dst_d, ne00, ne01, ne02, ne0, ne1, ne2, stream);
         } break;
         default: {
             GGML_ASSERT(false);
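
Net effect of the three hunks: k_repeat_back drops the 4D strided indexing (ne03, s00..s03, ne3) in favor of a contiguous 3D layout, with the host wrapper asserting contiguity and ne[3] == 1 instead. Each thread then owns one dst element and sums every src element that repeats onto it. Below is a minimal standalone sketch of that reduction; the kernel body mirrors the post-change code, while main(), the CPU reference, and the 2x-per-dimension repeat shapes are illustrative assumptions, not part of the commit.

// Standalone sketch of the simplified repeat_back reduction.
// The kernel mirrors the post-change k_repeat_back; the host harness
// (shapes, repeat factors, verification) is assumed for illustration.
#include <cstdint>
#include <cstdio>
#include <vector>
#include <cuda_runtime.h>

template <typename T>
static __global__ void k_repeat_back(
    const T * __restrict__ src, T * __restrict__ dst, const int64_t ne00, const int64_t ne01, const int64_t ne02,
    const int64_t ne0, const int64_t ne1, const int64_t ne2) {
    const int64_t tid0 = (int64_t) blockIdx.x*blockDim.x + threadIdx.x;
    const int64_t tid1 = (int64_t) blockIdx.y*blockDim.y + threadIdx.y;
    const int64_t tid2 = (int64_t) blockIdx.z*blockDim.z + threadIdx.z;

    if (tid0 >= ne0) {
        return;
    }

    // One thread per dst element: accumulate every src element whose index
    // is congruent to (tid0, tid1, tid2) modulo (ne0, ne1, ne2), i.e. every
    // position the dst element was broadcast to by the forward repeat.
    T sum = 0;
    for (int64_t i2 = tid2; i2 < ne02; i2 += ne2) {
        for (int64_t i1 = tid1; i1 < ne01; i1 += ne1) {
            for (int64_t i0 = tid0; i0 < ne00; i0 += ne0) {
                sum += src[i2*ne01*ne00 + i1*ne00 + i0];
            }
        }
    }
    dst[tid2*ne1*ne0 + tid1*ne0 + tid0] = sum;
}

int main() {
    const int64_t ne0 = 4, ne1 = 3, ne2 = 2;                 // dst shape (assumed)
    const int64_t ne00 = 2*ne0, ne01 = 2*ne1, ne02 = 2*ne2;  // src = dst repeated 2x per dim
    const int64_t n_src = ne00*ne01*ne02;
    const int64_t n_dst = ne0*ne1*ne2;

    float * src = nullptr;
    float * dst = nullptr;
    cudaMallocManaged(&src, n_src*sizeof(float));
    cudaMallocManaged(&dst, n_dst*sizeof(float));
    for (int64_t i = 0; i < n_src; ++i) {
        src[i] = (float) i;
    }

    // Same launch shape as repeat_back_cuda: warp-sized blocks along dim 0,
    // one grid slot per dst position in dims 1 and 2.
    const dim3 block_dims(32, 1, 1);
    const dim3 block_nums((ne0 + 31)/32, ne1, ne2);
    k_repeat_back<float><<<block_nums, block_dims>>>(src, dst, ne00, ne01, ne02, ne0, ne1, ne2);
    cudaDeviceSynchronize();

    // CPU reference: forward-map each src element onto dst and accumulate.
    std::vector<float> ref(n_dst, 0.0f);
    for (int64_t i2 = 0; i2 < ne02; ++i2) {
        for (int64_t i1 = 0; i1 < ne01; ++i1) {
            for (int64_t i0 = 0; i0 < ne00; ++i0) {
                ref[(i2 % ne2)*ne1*ne0 + (i1 % ne1)*ne0 + (i0 % ne0)] += src[i2*ne01*ne00 + i1*ne00 + i0];
            }
        }
    }

    int n_mismatch = 0;
    for (int64_t i = 0; i < n_dst; ++i) {
        n_mismatch += dst[i] != ref[i];
    }
    printf("%s\n", n_mismatch == 0 ? "repeat_back OK" : "repeat_back MISMATCH");

    cudaFree(src);
    cudaFree(dst);
    return n_mismatch != 0;
}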