@@ -1297,7 +1297,11 @@ struct ggml_threadpool {
12971297 atomic_int n_graph ; // incremented when there is work to be done (i.e. each graph)
12981298 atomic_int GGML_CACHE_ALIGN n_barrier ;
12991299 atomic_int GGML_CACHE_ALIGN n_barrier_passed ;
1300- atomic_int current_chunk ; // currently processing chunk during Mat_Mul, shared between all the threads.
1300+ atomic_int GGML_CACHE_ALIGN current_chunk ; // currently processing chunk during Mat_Mul, shared between all the threads.
1301+
1302+ // store each counter in a separate cache line
1303+ char GGML_CACHE_ALIGN current_chunks [256 ][GGML_CACHE_LINE ];
1304+ //atomic_int GGML_CACHE_ALIGN current_chunks[256];
13011305
13021306 // these are atomic as an annotation for thread-sanitizer
13031307 atomic_bool stop ; // Used for stopping the threadpool altogether
@@ -7744,7 +7748,8 @@ static void ggml_compute_forward_mul_mat_id(
77447748
77457749 if (ith == 0 ) {
77467750 // Every thread starts at ith, so the first unprocessed chunk is nth. This saves a bit of coordination right at the start.
7747- atomic_store_explicit (& params -> threadpool -> current_chunk , nth , memory_order_relaxed );
7751+ //atomic_store_explicit(&params->threadpool->current_chunk, nth, memory_order_relaxed);
7752+ //memset(params->threadpool->current_chunks, 0, n_as*sizeof(params->threadpool->current_chunks[0]));
77487753
77497754 // initialize matrix_row_counts
77507755 memset (matrix_row_counts , 0 , n_as * sizeof (int64_t ));
@@ -7760,12 +7765,18 @@ static void ggml_compute_forward_mul_mat_id(
77607765 matrix_row_counts [i02 ] += 1 ;
77617766 }
77627767 }
7768+ } else {
7769+ // reset current_chunk
7770+ for (int cur_a = ith ; cur_a < n_as ; cur_a += nth ) {
7771+ atomic_int * current_chunk_ctr = (atomic_int * )(params -> threadpool -> current_chunks + cur_a );
7772+ atomic_store_explicit (current_chunk_ctr , nth , memory_order_relaxed );
7773+ }
77637774 }
77647775
77657776 ggml_barrier (params -> threadpool );
77667777
7767- const int64_t rows_total = ggml_nelements (ids );
7768- int64_t rows_processed = 0 ;
7778+ // const int64_t rows_total = ggml_nelements(ids);
7779+ // int64_t rows_processed = 0;
77697780
77707781 for (int cur_a = 0 ; cur_a < n_as ; ++ cur_a ) {
77717782 const int64_t cne1 = matrix_row_counts [cur_a ];
@@ -7774,7 +7785,7 @@ static void ggml_compute_forward_mul_mat_id(
77747785 continue ;
77757786 }
77767787
7777- rows_processed += cne1 ;
7788+ // rows_processed += cne1;
77787789
77797790 const char * src0_cur = (const char * ) src0 -> data + cur_a * nb02 ;
77807791 const void * wdata = (src1 -> type == vec_dot_type ) ? src1 -> data : params -> wdata ;
@@ -7801,6 +7812,8 @@ static void ggml_compute_forward_mul_mat_id(
78017812
78027813 int current_chunk = ith ;
78037814
7815+ atomic_int * current_chunk_ctr = (atomic_int * )(params -> threadpool -> current_chunks + cur_a );
7816+
78047817 while (current_chunk < nchunk0 * nchunk1 ) {
78057818 const int64_t ith0 = current_chunk % nchunk0 ;
78067819 const int64_t ith1 = current_chunk / nchunk0 ;
@@ -7821,21 +7834,21 @@ static void ggml_compute_forward_mul_mat_id(
78217834 break ;
78227835 }
78237836
7824- current_chunk = atomic_fetch_add_explicit (& params -> threadpool -> current_chunk , 1 , memory_order_relaxed );
7837+ current_chunk = atomic_fetch_add_explicit (current_chunk_ctr , 1 , memory_order_relaxed );
78257838 }
78267839
7827- if (rows_processed == rows_total ) {
7828- break ;
7829- }
7840+ // if (rows_processed == rows_total) {
7841+ // break;
7842+ // }
78307843
7831- ggml_barrier (params -> threadpool );
7844+ // ggml_barrier(params->threadpool);
78327845
7833- if (ith == 0 ) {
7834- // Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
7835- atomic_store_explicit (& params -> threadpool -> current_chunk , nth , memory_order_relaxed );
7836- }
7846+ // if (ith == 0) {
7847+ // // Every thread starts at ith, so the first unprocessed chunk is nth. This saves a bit of coordination right at the start.
7848+ // atomic_store_explicit(&params->threadpool->current_chunk, nth, memory_order_relaxed);
7849+ // }
78377850
7838- ggml_barrier (params -> threadpool );
7851+ // ggml_barrier(params->threadpool);
78397852 }
78407853}
78417854
@@ -14079,6 +14092,7 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
1407914092 threadpool -> n_barrier = 0 ;
1408014093 threadpool -> n_barrier_passed = 0 ;
1408114094 threadpool -> current_chunk = 0 ;
14095+ memset (threadpool -> current_chunks , 0 , sizeof (threadpool -> current_chunks ));
1408214096 threadpool -> stop = false;
1408314097 threadpool -> pause = tpp -> paused ;
1408414098 threadpool -> abort = -1 ;
@@ -14160,6 +14174,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
1416014174 threadpool -> cgraph = cgraph ;
1416114175 threadpool -> cplan = cplan ;
1416214176 threadpool -> current_chunk = 0 ;
14177+ memset (threadpool -> current_chunks , 0 , sizeof (threadpool -> current_chunks ));
1416314178 threadpool -> abort = -1 ;
1416414179 threadpool -> ec = GGML_STATUS_SUCCESS ;
1416514180 }
0 commit comments