33
44#include  "ggml-backend-impl.h" 
55#include  "ggml-backend.h" 
6+ #include  "ggml-cpu-aarch64.h" 
67#include  "ggml-cpu-traits.h" 
78#include  "ggml-cpu-impl.h" 
89#include  "ggml-cpu.h" 
@@ -118,7 +119,6 @@ struct ggml_arm_arch_features_type {
118119} ggml_arm_arch_features  =  {-1 , -1 , -1 , -1 , 0 , -1 };
119120#endif 
120121
121- 
122122#if  defined(_WIN32 )
123123
124124#define  WIN32_LEAN_AND_MEAN 
@@ -1385,6 +1385,9 @@ struct ggml_threadpool {
13851385    struct  ggml_compute_state  *  workers ;   // per thread state 
13861386    int           n_threads_max ; // number of threads in the pool 
13871387    atomic_int    n_threads_cur ; // number of threads used in the current graph 
1388+ #if  defined(GGML_YIELD_BARRIER )
1389+     size_t  n_barrier_spin_count ;
1390+ #endif 
13881391
13891392    int32_t       prio ;        // Scheduling priority 
13901393    uint32_t      poll ;        // Polling level (0 - no polling) 
@@ -2450,6 +2453,63 @@ struct ggml_state {
24502453
24512454static  struct  ggml_state  g_state  =  {0 };
24522455
// Lightweight futex-style wait/wake primitives used by the yield barrier
// (GGML_YIELD_BARRIER). On Linux/Android we issue the raw futex syscall;
// on macOS we use the private __ulock_* syscalls (the same mechanism that
// os_unfair_lock and libdispatch are built on).
#if defined(__gnu_linux__) || defined(__ANDROID__)
#include <sys/syscall.h>
#include <unistd.h>
// glibc hides syscall() under strict ISO modes (-std=c11 without
// _DEFAULT_SOURCE); this declaration matches glibc's and is harmless
// when unistd.h already provides it.
extern long syscall(long number, ...);
#define FUTEX_WAIT 0
#define FUTEX_WAKE 1
#define FUTEX_PRIVATE_FLAG   128
#define FUTEX_WAIT_PRIVATE  (FUTEX_WAIT | FUTEX_PRIVATE_FLAG)
#define FUTEX_WAKE_PRIVATE  (FUTEX_WAKE | FUTEX_PRIVATE_FLAG)
// Sleep until *uaddr != val (or a spurious/explicit wakeup).
// Returns 0 on wakeup, -1 with errno set (EAGAIN if *uaddr != val on entry).
#define futex_wait(uaddr, val) syscall(SYS_futex, uaddr, FUTEX_WAIT_PRIVATE, val, NULL, NULL, 0)
// Wake up to n waiters blocked on uaddr.
// Returns the number of waiters woken, or -1 with errno set.
#define futex_wake(uaddr, n)   syscall(SYS_futex, uaddr, FUTEX_WAKE_PRIVATE, n, NULL, NULL, 0)
#elif defined(__APPLE__)
#include <stdatomic.h>
#include <errno.h>   // errno, ENOENT, ESRCH used by futex_wake
#include <limits.h>  // INT_MAX sentinel meaning "wake all"

// Private XNU syscalls (see bsd/sys/ulock.h in the XNU sources).
// NOTE(review): private API — could break on a future macOS; no public
// equivalent with compare-and-wait semantics exists.
extern int __ulock_wait(uint32_t operation, volatile int *addr, uint64_t value, uint32_t timeout);
extern int __ulock_wake(uint32_t operation, volatile int *addr, uint64_t wake_value);

#define UL_COMPARE_AND_WAIT        1

#define ULF_WAKE_ALL    0x00000100
#define ULF_WAKE_THREAD 0x00000200

// Sleep until woken, provided *addr still equals `expected` on entry
// (timeout 0 == wait forever). Returns 0 on wakeup, -1 on error.
static int futex_wait(volatile int *addr, int expected) {
    uint32_t op = UL_COMPARE_AND_WAIT;
    if (__ulock_wait(op, (void *)addr, (uint64_t)expected, 0) == -1) {
        return -1;
    }
    return 0;
}

// Wake up to `count` waiters blocked on addr; INT_MAX means "wake all".
// Returns the number of waiters woken (0 for count <= 0 or the wake-all
// path), or -1 on error.
static int futex_wake(volatile int *addr, int count) {
    if (count <= 0) {
        return 0;
    }
    uint32_t op = UL_COMPARE_AND_WAIT;
    if (count == INT_MAX) {
        op |= ULF_WAKE_ALL;
        if (__ulock_wake(op, (void *)addr, 0) == -1) {
            return -1;
        }
        return 0;
    }
    int woken = 0;
    for (int i = 0; i < count; ++i) {
        if (__ulock_wake(op, (void *)addr, 0) == -1) {
            // ENOENT/ESRCH: no (more) waiters on this address — not an error.
            if (errno == ENOENT || errno == ESRCH) {
                break;
            }
            return -1;
        }
        woken++;
    }
    return woken;
}

#endif
24532513void  ggml_barrier (struct  ggml_threadpool  *  tp ) {
24542514    int  n_threads  =  atomic_load_explicit (& tp -> n_threads_cur , memory_order_relaxed );
24552515    if  (n_threads  ==  1 ) {
@@ -2470,14 +2530,34 @@ void ggml_barrier(struct ggml_threadpool * tp) {
24702530
24712531        // exit barrier (fill seq-cst fence) 
24722532        atomic_fetch_add_explicit (& tp -> n_barrier_passed , 1 , memory_order_seq_cst );
2533+ #if  defined(GGML_YIELD_BARRIER )
2534+         // wake up all threads 
2535+         futex_wake (& tp -> n_barrier_passed , INT_MAX );
2536+ #endif 
24732537        return ;
24742538    }
24752539
2540+ #if  !defined(GGML_YIELD_BARRIER )
24762541    // wait for other threads 
24772542    while  (atomic_load_explicit (& tp -> n_barrier_passed , memory_order_relaxed ) ==  n_passed ) {
24782543        ggml_thread_cpu_relax ();
24792544    }
2545+ #else 
2546+     size_t  spin_count  =  tp -> n_barrier_spin_count ;
2547+     size_t  i ;
2548+     do  {
2549+         for  (i  =  0 ; i  <  spin_count ; i ++ ) {
2550+             if  (atomic_load_explicit (& tp -> n_barrier_passed , memory_order_relaxed ) !=  n_passed ) {
2551+                 goto exit_barrier ;
2552+             }
2553+             ggml_thread_cpu_relax ();
2554+         }
24802555
2556+         futex_wait (& tp -> n_barrier_passed , n_passed );
2557+     } while  (atomic_load_explicit (& tp -> n_barrier_passed , memory_order_relaxed ) ==  n_passed );
2558+     return ;
2559+ exit_barrier :
2560+ #endif 
24812561    // exit barrier (full seq-cst fence) 
24822562    // TSAN doesn't support standalone fence yet, we use a dummy read-modify-write instead 
24832563    #ifdef  GGML_TSAN_ENABLED 
@@ -13126,7 +13206,7 @@ static bool ggml_thread_apply_affinity(const bool * mask) {
1312613206
1312713207    for  (uint32_t  i  =  0 ; i  <  GGML_MAX_N_THREADS ; i ++ ) {
1312813208        if  (mask [i ]) {
13129-             GGML_PRINT_DEBUG ("Thread %lx: adding %d to cpuset\n" , pthread_self (), i );
13209+             printf ("Thread %lx: adding %d to cpuset\n" , pthread_self (), i );
1313013210            CPU_SET (i , & cpuset );
1313113211        }
1313213212    }
@@ -13680,6 +13760,9 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
1368013760        threadpool -> poll              =  tpp -> poll ;
1368113761        threadpool -> prio              =  tpp -> prio ;
1368213762        threadpool -> ec                =  GGML_STATUS_SUCCESS ;
13763+ #if  defined(GGML_YIELD_BARRIER )
13764+         threadpool -> n_barrier_spin_count  =  ggml_barrier_spin_count (tpp -> n_threads );
13765+ #endif 
1368313766    }
1368413767
1368513768    // Allocate and init workers state 