@@ -42,7 +42,12 @@ The NUMA mirroring system consists of several key components:
 - **Thread binding**: GGML threadpool threads are bound to specific NUMA nodes
 - **Model weight mirroring**: Complete copies of model weights are created on each NUMA node
 
-### 2. Explicit Model Loading Setup
+### 2. Thread-Local NUMA State Tracking
+- **`ggml_current_numa_node`**: Each OpenMP thread knows which NUMA node it's currently bound to
+- **`ggml_numa_nodes_active`**: Each OpenMP thread knows the total number of active NUMA nodes in the system
+- These variables enable efficient tensor data routing and NUMA-aware algorithms (see the sketch below)
+
+### 3. Explicit Model Loading Setup
 Clean integration point during model loading where NUMA mirrors are established for all model weight tensors.
 
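To make the role of these two thread-local variables concrete, here is a minimal sketch of how a NUMA-aware kernel could consume them. The row-sharding helper is hypothetical, invented for illustration; the patch itself uses the variables to select per-node weight mirrors:

```c
// Illustrative only: consuming the two thread-locals described above.
extern __thread int ggml_current_numa_node;   // node this thread is bound to
extern __thread int ggml_numa_nodes_active;   // total active nodes (>= 1)

// Hypothetical helper: give each NUMA node a contiguous slice of rows so
// threads mostly touch memory that is local to their node.
static void numa_row_range(int n_rows, int * first, int * last) {
    const int nodes = ggml_numa_nodes_active > 0 ? ggml_numa_nodes_active : 1;
    const int node  = ggml_current_numa_node;
    *first = (n_rows * node) / nodes;
    *last  = (n_rows * (node + 1)) / nodes;
}
```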
 ## Files Modified
@@ -55,7 +60,7 @@ Clean integration point during model loading where NUMA mirrors are established
 - NUMA mirror data structures in `ggml_tensor`
 - `tensor_set_data_with_numa_mirrors()` function declaration
 - Optimized `tensor_data()` function with fast path for non-NUMA tensors
-- Thread-local variable `ggml_current_numa_node` for routing
+- Thread-local variables `ggml_current_numa_node` and `ggml_numa_nodes_active` for routing
 
 #### `ggml/src/ggml.c`
 **Purpose**: Core tensor operations and NUMA mirror management
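To give a rough picture of the "NUMA mirror management" this file provides, the standalone sketch below allocates one copy of a weight buffer on every NUMA node using libnuma. It is an assumption-laden illustration, not the actual ggml code; the real entry point is the `tensor_set_data_with_numa_mirrors()` declared above, whose exact signature this diff does not show:

```c
// Sketch: per-node weight mirrors with libnuma (link with -lnuma).
// Not the actual ggml implementation.
#include <numa.h>
#include <stdlib.h>
#include <string.h>

// Allocate and fill one mirror of `src` on every NUMA node; returns an
// array of per-node pointers (free each with numa_free(ptr, size)).
static void ** make_numa_mirrors(const void * src, size_t size) {
    const int n_nodes = numa_max_node() + 1;
    void ** mirrors = calloc((size_t) n_nodes, sizeof(void *));
    for (int node = 0; node < n_nodes; node++) {
        mirrors[node] = numa_alloc_onnode(size, node);  // memory placed on `node`
        if (mirrors[node] != NULL) {
            memcpy(mirrors[node], src, size);
        }
    }
    return mirrors;
}
```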
@@ -144,8 +149,9 @@ Instead of directly addressing `tensor->data`, there are two new macros:
 ```c
 // Tensor data accessor functions for NUMA model mirroring compatibility:
 
-// External thread-local variable set at OMP threadpool creation time
+// External thread-local variables set at OMP threadpool creation time
 extern __thread int ggml_current_numa_node;
+extern __thread int ggml_numa_nodes_active;
 
 static inline void * tensor_data(const struct ggml_tensor * tensor) {
     // Fast path: if no NUMA mirrors exist, avoid thread-local access entirely
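The hunk cuts off before the body of `tensor_data()`. A plausible completion, consistent with the fast-path comment, is sketched below; the `numa_mirror_data` field is an assumed name for illustration, not the actual `ggml_tensor` layout:

```c
// Sketch only: `numa_mirror_data` is a hypothetical per-node pointer array.
static inline void * tensor_data(const struct ggml_tensor * tensor) {
    // Fast path: if no NUMA mirrors exist, avoid thread-local access entirely
    if (tensor->numa_mirror_data == NULL) {
        return tensor->data;
    }
    // Slow path: return the mirror that is local to this thread's NUMA node
    return tensor->numa_mirror_data[ggml_current_numa_node];
}
```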
@@ -189,8 +195,9 @@ Instead of directly addressing `tensor->data`, there are two new macros:
 
 In `ggml-cpu.c`: Thread-local variables set at OMP thread-creation time
 ```c
-// External thread-local variable for NUMA node binding
+// External thread-local variables for NUMA node binding
 extern __thread int ggml_current_numa_node;
+extern __thread int ggml_numa_nodes_active;
 
 // Thread-local NUMA node assignment for OpenMP threads
 // Using static initialization to avoid syscalls in hot paths
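"Static initialization to avoid syscalls in hot paths" suggests the thread-locals carry safe defaults until a thread is explicitly bound, so `tensor_data()` never needs a syscall to discover its node. The definitions below are an assumption consistent with the declarations above, not a quote from the source:

```c
// Assumed definitions: each thread starts with safe defaults until
// ggml_openmp_bind_thread_to_numa_node() assigns it a real node.
__thread int ggml_current_numa_node = 0;   // default: node 0
__thread int ggml_numa_nodes_active = 1;   // default: single-node system
```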
@@ -223,13 +230,10 @@ static void ggml_openmp_bind_thread_to_numa_node(int thread_id, int n_threads) {
     // Cache strategy check to avoid repeated calls
     static bool strategy_checked = false;
     static bool is_numa_mirror = false;
-    static int num_numa_nodes = 0;
+    static int num_numa_nodes = 1;
 
     if (!strategy_checked) {
         is_numa_mirror = (g_state.numa.numa_strategy == GGML_NUMA_STRATEGY_MIRROR);
-        if (is_numa_mirror) {
-            num_numa_nodes = numa_max_node() + 1;
-        }
         strategy_checked = true;
     }
 
@@ -243,6 +247,9 @@ static void ggml_openmp_bind_thread_to_numa_node(int thread_id, int n_threads) {
         return;
     }
 
+    // Set the numa_nodes_active for all threads, regardless of NUMA mode
+    ggml_numa_nodes_active = numa_max_node() + 1;
+
     // Round-robin assignment of threads to NUMA nodes
     int target_numa_node = thread_id % num_numa_nodes;
 
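The modulo expression spreads threads evenly across nodes. Here is a tiny standalone program demonstrating the mapping (the node count of 2 is an assumed value for illustration):

```c
#include <stdio.h>

int main(void) {
    const int num_numa_nodes = 2;  // assumed: a two-socket machine
    for (int thread_id = 0; thread_id < 8; thread_id++) {
        // same round-robin expression as in the hunk above
        int target_numa_node = thread_id % num_numa_nodes;
        printf("thread %d -> NUMA node %d\n", thread_id, target_numa_node);
    }
    return 0;
}
```

With these values, threads 0, 2, 4, 6 land on node 0 and threads 1, 3, 5, 7 on node 1.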
@@ -277,8 +284,9 @@ static void ggml_openmp_bind_thread_to_numa_node(int thread_id, int n_threads) {
     ggml_thread_numa_node = target_numa_node;
     ggml_thread_numa_initialized = true;
 
-    // Update the global thread-local variable for tensor data access
+    // Update the global thread-local variables for tensor data access
     ggml_current_numa_node = target_numa_node;
+    ggml_numa_nodes_active = num_numa_nodes;
 
     // Debug output using standard GGML logging
     GGML_LOG_DEBUG("NUMA: Bound OpenMP thread %d to NUMA node %d (total threads: %d)\n",
@@ -712,7 +720,7 @@ Future versions may include:
 - Integrates with all backends (CPU, CUDA, Metal, etc.)
 
 ### Thread Safety
-- Thread-local variables ensure safe concurrent access
+- Thread-local variables (`ggml_current_numa_node` and `ggml_numa_nodes_active`) ensure safe concurrent access (see the sketch below)
 - Model loading is protected by existing llama.cpp synchronization
 
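Why thread-local storage is sufficient here: each OpenMP thread writes only its own copy of the variables, so no locks are required. A minimal demonstration (compile with `-fopenmp`; the variable is renamed so as not to suggest this code is in ggml):

```c
#include <omp.h>
#include <stdio.h>

// Each thread owns a private copy, so concurrent writes cannot race.
static __thread int demo_numa_node = 0;

int main(void) {
    #pragma omp parallel num_threads(4)
    {
        demo_numa_node = omp_get_thread_num() % 2;  // assumed: 2 nodes
        printf("thread %d sees node %d\n", omp_get_thread_num(), demo_numa_node);
    }
    return 0;
}
```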
 ## Troubleshooting