@@ -3177,55 +3177,36 @@ inline static void ggml_critical_section_start(void) {
     }
 }
 
-#ifdef GGML_USE_OPENMP
-static void ggml_barrier(struct ggml_threadpool * threadpool) {
-    int n_threads = atomic_load_explicit(&threadpool->n_threads_cur, memory_order_relaxed);
+static void ggml_barrier(struct ggml_threadpool * tp) {
+    int n_threads = atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed);
     if (n_threads == 1) {
         return;
     }
 
+#ifdef GGML_USE_OPENMP
     #pragma omp barrier
-}
 #else
-static void ggml_barrier(struct ggml_threadpool * threadpool) {
-    int n_threads = atomic_load_explicit(&threadpool->n_threads_cur, memory_order_relaxed);
-    if (n_threads == 1) {
-        return;
-    }
+    int n_passed = atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed);
 
-    atomic_int * n_barrier = &threadpool->n_barrier;
-    atomic_int * n_barrier_passed = &threadpool->n_barrier_passed;
+    // enter barrier (full seq-cst fence)
+    int n_barrier = atomic_fetch_add_explicit(&tp->n_barrier, 1, memory_order_seq_cst);
 
-    int passed_old = atomic_load_explicit(n_barrier_passed, memory_order_relaxed);
-
-    // All threads go through the full fence (memory barrier) operation once to ensure
-    // that all previos updates have completed.
-    // The rest of the reads and writes can be relaxed, but the thread sanitizer wants
-    // to see an explicit acquire / release sequence to declare all futher accesses
-    // as safe.
-
-    memory_order passed_acquire = memory_order_relaxed;
-    memory_order passed_release = memory_order_relaxed;
-
-    #if defined(__has_feature)
-    #if __has_feature(thread_sanitizer)
-    passed_acquire = memory_order_acquire;
-    passed_release = memory_order_release;
-    #endif
-    #endif
-
-    if (atomic_fetch_add_explicit(n_barrier, 1, memory_order_seq_cst) == n_threads - 1) {
+    int last = 0;
+    if (n_barrier == (n_threads - 1)) {
         // last thread
-        atomic_store_explicit(n_barrier, 0, memory_order_relaxed);
-        atomic_fetch_add_explicit(n_barrier_passed, 1, passed_release);
+        atomic_store_explicit(&tp->n_barrier, 0, memory_order_relaxed);
+        last = 1;
     } else {
         // wait for other threads
-        while (atomic_load_explicit(n_barrier_passed, passed_acquire) == passed_old) {
+        while (atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed) == n_passed) {
             ggml_thread_cpu_relax();
         }
     }
+
+    // exit barrier (full seq-cst fence)
+    atomic_fetch_add_explicit(&tp->n_barrier_passed, last, memory_order_seq_cst);
 #endif
+}
 
 // TODO: make this somehow automatically executed
 //       some sort of "sentry" mechanism
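
For reference, here is a minimal, self-contained sketch of the counter/epoch spin-barrier technique that the new `#else` branch implements. The names (`demo_barrier`, `worker`, `data`) and the pthread harness are illustrative, not part of ggml; the real code keeps both counters in `struct ggml_threadpool` and spins with `ggml_thread_cpu_relax()`:

```c
// Hypothetical standalone demo of the counter/epoch barrier from the patch
// above; not the ggml implementation itself.
#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define N_THREADS 4

static atomic_int n_barrier        = 0; // threads that have arrived at the barrier
static atomic_int n_barrier_passed = 0; // epoch counter, bumped once per round

static void demo_barrier(int n_threads) {
    int n_passed = atomic_load_explicit(&n_barrier_passed, memory_order_relaxed);

    // enter barrier (full seq-cst fence)
    int idx = atomic_fetch_add_explicit(&n_barrier, 1, memory_order_seq_cst);

    int last = 0;
    if (idx == n_threads - 1) {
        // last thread to arrive: reset the arrival counter for the next round
        atomic_store_explicit(&n_barrier, 0, memory_order_relaxed);
        last = 1;
    } else {
        // spin until the last thread bumps the epoch
        // (ggml calls ggml_thread_cpu_relax() inside this loop)
        while (atomic_load_explicit(&n_barrier_passed, memory_order_relaxed) == n_passed) {
        }
    }

    // exit barrier (full seq-cst fence): only the last thread adds 1,
    // but every thread performs the RMW and so pays the full fence
    atomic_fetch_add_explicit(&n_barrier_passed, last, memory_order_seq_cst);
}

static int data[N_THREADS];

static void * worker(void * arg) {
    int id = (int)(intptr_t) arg;
    data[id] = id + 1;          // phase 1: each thread writes its own slot
    demo_barrier(N_THREADS);    // all phase-1 writes are visible after this
    int sum = 0;                // phase 2: read every slot safely
    for (int i = 0; i < N_THREADS; i++) {
        sum += data[i];
    }
    printf("thread %d sees sum %d\n", id, sum);
    return NULL;
}

int main(void) {
    pthread_t th[N_THREADS];
    for (intptr_t i = 0; i < N_THREADS; i++) {
        pthread_create(&th[i], NULL, worker, (void *) i);
    }
    for (int i = 0; i < N_THREADS; i++) {
        pthread_join(th[i], NULL);
    }
    return 0;
}
```

Built with something like `cc -std=c11 -pthread demo.c`, every thread should print `sum 10`. The key design point is the exit `fetch_add`: the waiters' relaxed spin loads alone would not establish ordering, but the seq-cst read-modify-write on the same epoch counter does, which is why the rest of the loads and stores in the barrier can stay relaxed without the per-sanitizer acquire/release workaround the removed code needed.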