@@ -42,7 +42,12 @@ The NUMA mirroring system consists of several key components:
 - **Thread binding**: GGML threadpool threads are bound to specific NUMA nodes
 - **Model weight mirroring**: Complete copies of model weights are created on each NUMA node
 
-### 2. Explicit Model Loading Setup
+### 2. Thread-Local NUMA State Tracking
+- **`ggml_current_numa_node`**: Each OpenMP thread knows which NUMA node it's currently bound to
+- **`ggml_numa_nodes_active`**: Each OpenMP thread knows the total number of active NUMA nodes in the system
+- These variables enable efficient tensor data routing and NUMA-aware algorithms (see the sketch below)
+
+### 3. Explicit Model Loading Setup
 Clean integration point during model loading where NUMA mirrors are established for all model weight tensors.
 
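To make the role of these two thread-local variables concrete, here is a minimal sketch of how a NUMA-aware kernel could consume them. The row-sharding helper is hypothetical, invented for illustration; the patch itself uses the variables to select per-node weight mirrors:

```c
// Illustrative only: consuming the two thread-locals described above.
extern __thread int ggml_current_numa_node;   // node this thread is bound to
extern __thread int ggml_numa_nodes_active;   // total active nodes (>= 1)

// Hypothetical helper: give each NUMA node a contiguous slice of rows so
// threads mostly touch memory that is local to their node.
static void numa_row_range(int n_rows, int * first, int * last) {
    const int nodes = ggml_numa_nodes_active > 0 ? ggml_numa_nodes_active : 1;
    const int node  = ggml_current_numa_node;
    *first = (n_rows * node) / nodes;
    *last  = (n_rows * (node + 1)) / nodes;
}
```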
 ## Files Modified
@@ -55,7 +60,7 @@ Clean integration point during model loading where NUMA mirrors are established
 - NUMA mirror data structures in `ggml_tensor`
 - `tensor_set_data_with_numa_mirrors()` function declaration
 - Optimized `tensor_data()` function with fast path for non-NUMA tensors
-- Thread-local variable `ggml_current_numa_node` for routing
+- Thread-local variables `ggml_current_numa_node` and `ggml_numa_nodes_active` for routing
 
 #### `ggml/src/ggml.c`
 **Purpose**: Core tensor operations and NUMA mirror management
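To give a rough picture of the "NUMA mirror management" this file provides, the standalone sketch below allocates one copy of a weight buffer on every NUMA node using libnuma. It is an assumption-laden illustration, not the actual ggml code; the real entry point is the `tensor_set_data_with_numa_mirrors()` declared above, whose exact signature this diff does not show:

```c
// Sketch: per-node weight mirrors with libnuma (link with -lnuma).
// Not the actual ggml implementation.
#include <numa.h>
#include <stdlib.h>
#include <string.h>

// Allocate and fill one mirror of `src` on every NUMA node; returns an
// array of per-node pointers (free each with numa_free(ptr, size)).
static void ** make_numa_mirrors(const void * src, size_t size) {
    const int n_nodes = numa_max_node() + 1;
    void ** mirrors = calloc((size_t) n_nodes, sizeof(void *));
    for (int node = 0; node < n_nodes; node++) {
        mirrors[node] = numa_alloc_onnode(size, node);  // memory placed on `node`
        if (mirrors[node] != NULL) {
            memcpy(mirrors[node], src, size);
        }
    }
    return mirrors;
}
```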
@@ -144,8 +149,9 @@ Instead of directly addressing `tensor->data`, there are two new macros:
 ```c
 // Tensor data accessor functions for NUMA model mirroring compatibility:
 
-// External thread-local variable set at OMP threadpool creation time
+// External thread-local variables set at OMP threadpool creation time
 extern __thread int ggml_current_numa_node;
+extern __thread int ggml_numa_nodes_active;
 
 static inline void * tensor_data(const struct ggml_tensor * tensor) {
     // Fast path: if no NUMA mirrors exist, avoid thread-local access entirely
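The hunk cuts off before the body of `tensor_data()`. A plausible completion, consistent with the fast-path comment, is sketched below; the `numa_mirror_data` field is an assumed name for illustration, not the actual `ggml_tensor` layout:

```c
// Sketch only: `numa_mirror_data` is a hypothetical per-node pointer array.
static inline void * tensor_data(const struct ggml_tensor * tensor) {
    // Fast path: if no NUMA mirrors exist, avoid thread-local access entirely
    if (tensor->numa_mirror_data == NULL) {
        return tensor->data;
    }
    // Slow path: return the mirror that is local to this thread's NUMA node
    return tensor->numa_mirror_data[ggml_current_numa_node];
}
```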
@@ -189,8 +195,9 @@ Instead of directly addressing `tensor->data`, there are two new macros:
 
 In `ggml-cpu.c`: Thread-local variables set at OMP thread-creation time
 ```c
-// External thread-local variable for NUMA node binding
+// External thread-local variables for NUMA node binding
 extern __thread int ggml_current_numa_node;
+extern __thread int ggml_numa_nodes_active;
 
 // Thread-local NUMA node assignment for OpenMP threads
 // Using static initialization to avoid syscalls in hot paths
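"Static initialization to avoid syscalls in hot paths" suggests the thread-locals carry safe defaults until a thread is explicitly bound, so `tensor_data()` never needs a syscall to discover its node. The definitions below are an assumption consistent with the declarations above, not a quote from the source:

```c
// Assumed definitions: each thread starts with safe defaults until
// ggml_openmp_bind_thread_to_numa_node() assigns it a real node.
__thread int ggml_current_numa_node = 0;   // default: node 0
__thread int ggml_numa_nodes_active = 1;   // default: single-node system
```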
@@ -223,13 +230,10 @@ static void ggml_openmp_bind_thread_to_numa_node(int thread_id, int n_threads) {
     // Cache strategy check to avoid repeated calls
     static bool strategy_checked = false;
     static bool is_numa_mirror = false;
-    static int num_numa_nodes = 0;
+    static int num_numa_nodes = 1;
 
     if (!strategy_checked) {
         is_numa_mirror = (g_state.numa.numa_strategy == GGML_NUMA_STRATEGY_MIRROR);
-        if (is_numa_mirror) {
-            num_numa_nodes = numa_max_node() + 1;
-        }
         strategy_checked = true;
     }
 
@@ -243,6 +247,9 @@ static void ggml_openmp_bind_thread_to_numa_node(int thread_id, int n_threads) {
         return;
     }
 
+    // Set the numa_nodes_active for all threads, regardless of NUMA mode
+    ggml_numa_nodes_active = numa_max_node() + 1;
+
     // Round-robin assignment of threads to NUMA nodes
     int target_numa_node = thread_id % num_numa_nodes;
 
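The modulo expression spreads threads evenly across nodes. Here is a tiny standalone program demonstrating the mapping (the node count of 2 is an assumed value for illustration):

```c
#include <stdio.h>

int main(void) {
    const int num_numa_nodes = 2;  // assumed: a two-socket machine
    for (int thread_id = 0; thread_id < 8; thread_id++) {
        // same round-robin expression as in the hunk above
        int target_numa_node = thread_id % num_numa_nodes;
        printf("thread %d -> NUMA node %d\n", thread_id, target_numa_node);
    }
    return 0;
}
```

With these values, threads 0, 2, 4, 6 land on node 0 and threads 1, 3, 5, 7 on node 1.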
@@ -277,8 +284,9 @@ static void ggml_openmp_bind_thread_to_numa_node(int thread_id, int n_threads) {
     ggml_thread_numa_node = target_numa_node;
     ggml_thread_numa_initialized = true;
 
-    // Update the global thread-local variable for tensor data access
+    // Update the global thread-local variables for tensor data access
     ggml_current_numa_node = target_numa_node;
+    ggml_numa_nodes_active = num_numa_nodes;
 
     // Debug output using standard GGML logging
     GGML_LOG_DEBUG("NUMA: Bound OpenMP thread %d to NUMA node %d (total threads: %d)\n",
@@ -712,7 +720,7 @@ Future versions may include:
 - Integrates with all backends (CPU, CUDA, Metal, etc.)
 
 ### Thread Safety
-- Thread-local variables ensure safe concurrent access
+- Thread-local variables (`ggml_current_numa_node` and `ggml_numa_nodes_active`) ensure safe concurrent access (see the sketch below)
 - Model loading is protected by existing llama.cpp synchronization
 
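Why thread-local storage is sufficient here: each OpenMP thread writes only its own copy of the variables, so no locks are required. A minimal demonstration (compile with `-fopenmp`; the variable is renamed so as not to suggest this code is in ggml):

```c
#include <omp.h>
#include <stdio.h>

// Each thread owns a private copy, so concurrent writes cannot race.
static __thread int demo_numa_node = 0;

int main(void) {
    #pragma omp parallel num_threads(4)
    {
        demo_numa_node = omp_get_thread_num() % 2;  // assumed: 2 nodes
        printf("thread %d sees node %d\n", omp_get_thread_num(), demo_numa_node);
    }
    return 0;
}
```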
 ## Troubleshooting