@@ -3177,55 +3177,36 @@ inline static void ggml_critical_section_start(void) {
     }
 }
 
-#ifdef GGML_USE_OPENMP
-static void ggml_barrier(struct ggml_threadpool * threadpool) {
-    int n_threads = atomic_load_explicit(&threadpool->n_threads_cur, memory_order_relaxed);
+static void ggml_barrier(struct ggml_threadpool * tp) {
+    int n_threads = atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed);
     if (n_threads == 1) {
         return;
     }
 
+#ifdef GGML_USE_OPENMP
     #pragma omp barrier
-}
 #else
-static void ggml_barrier(struct ggml_threadpool * threadpool) {
-    int n_threads = atomic_load_explicit(&threadpool->n_threads_cur, memory_order_relaxed);
-    if (n_threads == 1) {
-        return;
-    }
+    int n_passed = atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed);
 
-    atomic_int * n_barrier = &threadpool->n_barrier;
-    atomic_int * n_barrier_passed = &threadpool->n_barrier_passed;
+    // enter barrier (full seq-cst fence)
+    int n_barrier = atomic_fetch_add_explicit(&tp->n_barrier, 1, memory_order_seq_cst);
 
-    int passed_old = atomic_load_explicit(n_barrier_passed, memory_order_relaxed);
-
-    // All threads go through the full fence (memory barrier) operation once to ensure
-    // that all previos updates have completed.
-    // The rest of the reads and writes can be relaxed, but the thread sanitizer wants
-    // to see an explicit acquire / release sequence to declare all futher accesses
-    // as safe.
-
-    memory_order passed_acquire = memory_order_relaxed;
-    memory_order passed_release = memory_order_relaxed;
-
-    #if defined(__has_feature)
-    #if __has_feature(thread_sanitizer)
-    passed_acquire = memory_order_acquire;
-    passed_release = memory_order_release;
-    #endif
-    #endif
-
-    if (atomic_fetch_add_explicit(n_barrier, 1, memory_order_seq_cst) == n_threads - 1) {
+    int last = 0;
+    if (n_barrier == (n_threads - 1)) {
         // last thread
-        atomic_store_explicit(n_barrier, 0, memory_order_relaxed);
-        atomic_fetch_add_explicit(n_barrier_passed, 1, passed_release);
+        atomic_store_explicit(&tp->n_barrier, 0, memory_order_relaxed);
+        last = 1;
     } else {
         // wait for other threads
-        while (atomic_load_explicit(n_barrier_passed, passed_acquire) == passed_old) {
+        while (atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed) == n_passed) {
             ggml_thread_cpu_relax();
         }
     }
+
+    // exit barrier (full seq-cst fence)
+    atomic_fetch_add_explicit(&tp->n_barrier_passed, last, memory_order_seq_cst);
 #endif
+}
 
 // TODO: make this somehow automatically executed
 //       some sort of "sentry" mechanism
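
For reference, here is a minimal, self-contained sketch of the counter/epoch spin-barrier technique that the new `#else` branch implements. The names (`demo_barrier`, `worker`, `data`) and the pthread harness are illustrative, not part of ggml; the real code keeps both counters in `struct ggml_threadpool` and spins with `ggml_thread_cpu_relax()`:

```c
// Hypothetical standalone demo of the counter/epoch barrier from the patch
// above; not the ggml implementation itself.
#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define N_THREADS 4

static atomic_int n_barrier        = 0; // threads that have arrived at the barrier
static atomic_int n_barrier_passed = 0; // epoch counter, bumped once per round

static void demo_barrier(int n_threads) {
    int n_passed = atomic_load_explicit(&n_barrier_passed, memory_order_relaxed);

    // enter barrier (full seq-cst fence)
    int idx = atomic_fetch_add_explicit(&n_barrier, 1, memory_order_seq_cst);

    int last = 0;
    if (idx == n_threads - 1) {
        // last thread to arrive: reset the arrival counter for the next round
        atomic_store_explicit(&n_barrier, 0, memory_order_relaxed);
        last = 1;
    } else {
        // spin until the last thread bumps the epoch
        // (ggml calls ggml_thread_cpu_relax() inside this loop)
        while (atomic_load_explicit(&n_barrier_passed, memory_order_relaxed) == n_passed) {
        }
    }

    // exit barrier (full seq-cst fence): only the last thread adds 1,
    // but every thread performs the RMW and so pays the full fence
    atomic_fetch_add_explicit(&n_barrier_passed, last, memory_order_seq_cst);
}

static int data[N_THREADS];

static void * worker(void * arg) {
    int id = (int)(intptr_t) arg;
    data[id] = id + 1;          // phase 1: each thread writes its own slot
    demo_barrier(N_THREADS);    // all phase-1 writes are visible after this
    int sum = 0;                // phase 2: read every slot safely
    for (int i = 0; i < N_THREADS; i++) {
        sum += data[i];
    }
    printf("thread %d sees sum %d\n", id, sum);
    return NULL;
}

int main(void) {
    pthread_t th[N_THREADS];
    for (intptr_t i = 0; i < N_THREADS; i++) {
        pthread_create(&th[i], NULL, worker, (void *) i);
    }
    for (int i = 0; i < N_THREADS; i++) {
        pthread_join(th[i], NULL);
    }
    return 0;
}
```

Built with something like `cc -std=c11 -pthread demo.c`, every thread should print `sum 10`. The key design point is the exit `fetch_add`: the waiters' relaxed spin loads alone would not establish ordering, but the seq-cst read-modify-write on the same epoch counter does, which is why the rest of the loads and stores in the barrier can stay relaxed without the per-sanitizer acquire/release workaround the removed code needed.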