@@ -349,27 +349,23 @@ static void ggml_compute_forward_add_f32(
349349 const ggml_tensor * src0 , const ggml_tensor * src1 , ggml_tensor * dst ) {
350350 GGML_ASSERT (ggml_can_repeat (src1 , src0 ) && ggml_are_same_shape (src0 , dst ));
351351
352- const int ith = 0 ;
353- const int nth = 1 ;
354-
355352 const int nr = ggml_nrows (src0 );
356353
357354 GGML_TENSOR_BINARY_OP_LOCALS
358355
359356 GGML_ASSERT ( nb0 == sizeof (float ));
360357 GGML_ASSERT (nb00 == sizeof (float ));
361358
362- // rows per thread
363- const int dr = (nr + nth - 1 )/nth ;
359+ const int dr = nr ;
364360
365361 // row range for this thread
366- const int ir0 = dr * ith ;
362+ const int ir0 = 0 ;
367363 const int ir1 = MIN (ir0 + dr , nr );
368364
369365 ggml_dump_tensor (src0 );
370366 ggml_dump_tensor (src1 );
371367
372- #if 1 //naive algorithm, can works with llama-cli
368+ #if 1 //naive algorithm for fp32 , can works with llama-cli
373369 float * a = (float * )src0 -> data ;
374370 float * b = (float * )src1 -> data ;
375371 float * c = (float * )dst -> data ;
@@ -473,9 +469,6 @@ static void ggml_compute_forward_mul_mat_one_chunk(
473469 const int64_t r2 = ne12 / ne02 ;
474470 const int64_t r3 = ne13 / ne03 ;
475471
476- //printf("ir0_start = %6lld, ir0_end = %6lld, ir1_start = %6lld, ir1_end = %6lld\n", ir0_start, ir0_end, ir1_start, ir1_end);
477-
478- // threads with no work simply yield (not sure if it helps)
479472 if (ir0_start >= ir0_end || ir1_start >= ir1_end ) {
480473 return ;
481474 }
@@ -514,20 +507,12 @@ static void ggml_compute_forward_mul_mat_one_chunk(
514507
515508 const char * src0_row = (const char * )src0 -> data + (0 + i02 * nb02 + i03 * nb03 );
516509
517- // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
518- // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
519- // the original src1 data pointer, so we should index using the indices directly
520- // TODO: this is a bit of a hack, we should probably have a better way to handle this
521510 const char * src1_col = (const char * )wdata +
522511 (src1_cont || src1 -> type != vec_dot_type
523512 ? (i11 + i12 * ne11 + i13 * ne12 * ne11 ) * row_size
524513 : (i11 * nb11 + i12 * nb12 + i13 * nb13 ));
525514 float * dst_col = (float * )((char * )dst -> data + (i1 * nb1 + i2 * nb2 + i3 * nb3 ));
526515
527- //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ++ir0) {
528- // vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
529- //}
530-
531516 for (int64_t ir0 = iir0 ; ir0 < iir0 + blck_0 && ir0 < ir0_end ; ir0 += num_rows_per_vec_dot ) {
532517 vec_dot (ne00 , & tmp [ir0 - iir0 ], (num_rows_per_vec_dot > 1 ? 16 : 0 ), src0_row + ir0 * nb01 , (num_rows_per_vec_dot > 1 ? nb01 : 0 ), src1_col , (num_rows_per_vec_dot > 1 ? src1_col_stride : 0 ), num_rows_per_vec_dot );
533518 }
@@ -574,7 +559,7 @@ int ggmlop_dsp_mulmat(remote_handle64 h, const ggml_tensor * src0, const ggml_te
574559 int M = src0 -> ne [1 ];
575560 int K = src0 -> ne [0 ];
576561 int N = src1 -> ne [1 ];
577- float sum = 0 ;
562+ float sum = 0 ;
578563 for (int i = 0 ; i < M ; i ++ ) {
579564 for (int j = 0 ; j < N ; j ++ ) {
580565 sum = 0 ;