@@ -3111,7 +3111,7 @@ static void ggml_compute_forward_dup_same_cont(
31113111 const int nth = params->nth; // number of threads
31123112
31133113 // parallelize by elements
3114- const int ne = ggml_nelements(dst );
3114+ const int ne = ggml_nelements(src0)/ggml_blck_size(src0->type );
31153115 const int dr = (ne + nth - 1) / nth;
31163116 const int ie0 = dr * ith;
31173117 const int ie1 = MIN(ie0 + dr, ne);
@@ -4055,7 +4055,6 @@ static void ggml_compute_forward_dup_f32(
40554055static void ggml_compute_forward_dup_bytes(
40564056 const struct ggml_compute_params * params,
40574057 struct ggml_tensor * dst) {
4058-
40594058 const struct ggml_tensor * src0 = dst->src[0];
40604059
40614060 GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
@@ -4069,10 +4068,10 @@ static void ggml_compute_forward_dup_bytes(
40694068 }
40704069
40714070 const size_t type_size = ggml_type_size(src0->type);
4071+
40724072 const int ith = params->ith; // thread index
40734073 const int nth = params->nth; // number of threads
40744074
4075-
40764075 // parallelize by rows
40774076 const int nr = ne01;
40784077 // number of rows per thread
@@ -4082,10 +4081,10 @@ static void ggml_compute_forward_dup_bytes(
40824081 const int ir1 = MIN(ir0 + dr, nr);
40834082
40844083 if (src0->type == dst->type &&
4085- ne00 == ne0 &&
4084+ ggml_are_same_shape(src0, dst) &&
40864085 nb00 == type_size && nb0 == type_size) {
40874086 // copy by rows
4088- const size_t rs = ne00 * type_size ;
4087+ const size_t rs = ggml_row_size(src0->type, ne00) ;
40894088 for (int64_t i03 = 0; i03 < ne03; i03++) {
40904089 for (int64_t i02 = 0; i02 < ne02; i02++) {
40914090 for (int64_t i01 = ir0; i01 < ir1; i01++) {
@@ -4146,9 +4145,12 @@ static void ggml_compute_forward_dup_bytes(
41464145 int64_t i12 = 0;
41474146 int64_t i13 = 0;
41484147
4148+ // number of blocks in a row
4149+ const int64_t nb = ne00/ggml_blck_size(src0->type);
4150+
41494151 for (int64_t i03 = 0; i03 < ne03; i03++) {
41504152 for (int64_t i02 = 0; i02 < ne02; i02++) {
4151- i10 += ne00 * ir0;
4153+ i10 += nb * ir0;
41524154 while (i10 >= ne0) {
41534155 i10 -= ne0;
41544156 if (++i11 == ne1) {
@@ -4162,7 +4164,7 @@ static void ggml_compute_forward_dup_bytes(
41624164 }
41634165 }
41644166 for (int64_t i01 = ir0; i01 < ir1; i01++) {
4165- for (int64_t i00 = 0; i00 < ne00 ; i00++) {
4167+ for (int64_t i00 = 0; i00 < nb ; i00++) {
41664168 const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
41674169 char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
41684170
@@ -4182,7 +4184,7 @@ static void ggml_compute_forward_dup_bytes(
41824184 }
41834185 }
41844186 }
4185- i10 += ne00 * (ne01 - ir1);
4187+ i10 += nb * (ne01 - ir1);
41864188 while (i10 >= ne0) {
41874189 i10 -= ne0;
41884190 if (++i11 == ne1) {
@@ -14067,7 +14069,9 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
1406714069 }
1406814070
1406914071 // extra_buffer op?
14070- if (ggml_cpu_extra_compute_forward(params, tensor)) return;
14072+ if (ggml_cpu_extra_compute_forward(params, tensor)) {
14073+ return;
14074+ }
1407114075
1407214076 switch (tensor->op) {
1407314077 case GGML_OP_DUP:
0 commit comments