@@ -130,11 +130,13 @@ cmake --build build --parallel
 ## Implementation Details
 
 ### Tensor Data Access Optimization
-The `ggml_tensor` struct in `ggml.h` has been updated to no longer have a `data` field. This has been renamed to a `__data[]` array to hold pointers to multiple memory locations, with the index corresponding to the index of a local Numa node.
 
-Instead of directly addressing `tensor->data`, instead you do `tensor_data(tensor)`. And setting is done with `tensor_set_data()`. These are two new macros in `ggml.h`.
 
-The `tensor_data()` function in `ggml.h` has been optimized with a fast path:
+In `ggml.h`:
+
+The `ggml_tensor` struct no longer has a `data` field. It has been renamed to a `__data[]` array that holds pointers to multiple memory locations, with the index corresponding to the index of the local NUMA node.
+
+Instead of directly addressing `tensor->data`, there are two new macros: `tensor_data(tensor)` for getting and `tensor_set_data()` for setting. The `tensor_data()` accessor has been optimized with a fast path:
 ```c
 // Tensor data accessor functions for NUMA model mirroring compatibility:
 
@@ -181,13 +183,112 @@ The `tensor_data()` function in `ggml.h` has been optimized with a fast path:
 }
 ```
 
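+As a usage sketch (a hypothetical helper, not part of the patch; assumes a valid `ggml_context` and a node-local buffer):
+```c
+#include "ggml.h"
+
+// Hypothetical sketch: all tensor data access goes through the new macros
+// rather than the removed tensor->data field.
+static void touch_first_element(struct ggml_context * ctx, void * node_local_buf) {
+    struct ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 16);
+
+    float * p = (float *) tensor_data(t); // resolves the pointer for the calling thread's NUMA node
+    p[0] = 1.0f;
+
+    tensor_set_data(t, node_local_buf);   // point the tensor at a node-local copy
+}
+```
+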
-Thread-local variables at OMP thread-creation time in ggml-cpu.c:
+In `ggml-cpu.c`: thread-local variables at OMP thread-creation time:
 ```c
+// External thread-local variable for NUMA node binding
+extern __thread int ggml_current_numa_node;
+
+// Thread-local NUMA node assignment for OpenMP threads
+// Using static initialization to avoid syscalls in hot paths
+static __thread int ggml_thread_numa_node = -1;
+static __thread bool ggml_thread_numa_initialized = false;
+```
 
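+Because these are `__thread` variables, the hot path reads a cached thread-local value instead of issuing a syscall on every tensor access.
+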
+In `ggml-cpu.c`: bind an OMP thread to its NUMA node at creation time:
+```c
+if (n_threads > 1) {
+    #pragma omp parallel num_threads(n_threads)
+    {
+        // Bind OpenMP threads to NUMA nodes in round-robin fashion
+        // This must be done early in the parallel region before any work
+        ggml_openmp_bind_thread_to_numa_node(omp_get_thread_num(), omp_get_num_threads());
 ```
 
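+To sanity-check the binding from inside the parallel region, something like the following can be used (a debugging sketch, not part of the patch; `sched_getcpu()` is glibc-specific):
+```c
+#define _GNU_SOURCE
+#include <numa.h>
+#include <sched.h>
+#include <stdio.h>
+
+// Debugging sketch: report which NUMA node this OpenMP thread actually runs on
+static void print_thread_numa_placement(int thread_id) {
+    int cpu  = sched_getcpu();        // CPU this thread is currently running on
+    int node = numa_node_of_cpu(cpu); // NUMA node that CPU belongs to
+    printf("thread %d -> cpu %d (node %d)\n", thread_id, cpu, node);
+}
+```
+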
-First-touch allocation at model weight loading time in llama-mmap.cpp:
+In `ggml-cpu.c`: NUMA detection and binding logic:
 ```c
+bool ggml_is_numa(void) {
+    // Return true if:
+    // 1. Multiple physical NUMA nodes are present, OR
+    // 2. User explicitly requested NUMA mirror strategy (--numa mirror)
+    return g_state.numa.n_nodes > 1 ||
+           g_state.numa.numa_strategy == GGML_NUMA_STRATEGY_MIRROR;
+}
+
+// Static caching for NUMA thread binding to avoid syscalls in hot OpenMP paths
+static void ggml_openmp_bind_thread_to_numa_node(int thread_id, int n_threads) {
+    // Cache strategy check to avoid repeated calls
+    static bool strategy_checked = false;
+    static bool is_numa_mirror = false;
+    static int num_numa_nodes = 0;
+
+    if (!strategy_checked) {
+        is_numa_mirror = (g_state.numa.numa_strategy == GGML_NUMA_STRATEGY_MIRROR);
+        if (is_numa_mirror) {
+            num_numa_nodes = numa_max_node() + 1;
+        }
+        strategy_checked = true;
+    }
+
+    // Only apply binding in NUMA mirror mode with multiple nodes
+    if (!is_numa_mirror || num_numa_nodes <= 1) {
+        return;
+    }
+
+    // Check if this thread is already initialized to avoid repeated binding
+    if (ggml_thread_numa_initialized) {
+        return;
+    }
+
+    // Round-robin assignment of threads to NUMA nodes
+    int target_numa_node = thread_id % num_numa_nodes;
+
+    // Cache CPU masks statically to avoid repeated numa_allocate_cpumask() calls
+    static struct bitmask *node_cpumasks[GGML_NUMA_MAX_NODES] = {0};
+    static bool cpumasks_initialized = false;
+    static cpu_set_t node_cpusets[GGML_NUMA_MAX_NODES];
+    static bool cpusets_valid[GGML_NUMA_MAX_NODES] = {0};
+
+    if (!cpumasks_initialized) {
+        for (int node = 0; node < num_numa_nodes && node < GGML_NUMA_MAX_NODES; node++) {
+            node_cpumasks[node] = numa_allocate_cpumask();
+            if (node_cpumasks[node] && numa_node_to_cpus(node, node_cpumasks[node]) == 0) {
+                // Convert NUMA bitmask to cpu_set_t for faster thread binding
+                CPU_ZERO(&node_cpusets[node]);
+                for (int cpu = 0; cpu < numa_num_possible_cpus(); cpu++) {
+                    if (numa_bitmask_isbitset(node_cpumasks[node], cpu)) {
+                        CPU_SET(cpu, &node_cpusets[node]);
+                    }
+                }
+                cpusets_valid[node] = true;
+            }
+        }
+        cpumasks_initialized = true;
+    }
+
+    // Bind thread if we have a valid CPU set for the target node
+    if (target_numa_node < GGML_NUMA_MAX_NODES && cpusets_valid[target_numa_node]) {
+        if (sched_setaffinity(0, sizeof(cpu_set_t), &node_cpusets[target_numa_node]) == 0) {
+            // Set memory allocation preference and thread-local node assignment
+            numa_set_preferred(target_numa_node);
+            ggml_thread_numa_node = target_numa_node;
+            ggml_thread_numa_initialized = true;
+
+            // Update the global thread-local variable for tensor data access
+            ggml_current_numa_node = target_numa_node;
+
+            // Debug output using standard GGML logging
+            GGML_LOG_DEBUG("NUMA: Bound OpenMP thread %d to NUMA node %d (total threads: %d)\n",
+                           thread_id, target_numa_node, n_threads);
+        }
+    }
+}
+```
+
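+With the round-robin assignment (`thread_id % num_numa_nodes`), 8 OpenMP threads on a 2-node machine put the even-numbered threads on node 0 and the odd-numbered ones on node 1. The static caching of the strategy check and the per-node CPU sets keeps repeated libnuma calls out of the hot parallel region.
+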
+In `llama-mmap.cpp`: first-touch allocation at model weight loading time:
+```c
+struct llama_mmap::impl {
+#ifdef _POSIX_MAPPED_FILES
+    std::vector<std::pair<size_t, size_t>> mapped_fragments;
     // NUMA mirror logic: allocate and populate model weights on each NUMA node
     struct numa_mapping {
         void* addr;
@@ -207,15 +308,15 @@ First-touch allocation at model weight loading time in llama-mmap.cpp:
         // Bind current thread to the target NUMA node for first-touch
         struct bitmask* old_mask = numa_get_run_node_mask();
         if (numa_run_on_node(node) != 0) {
-            LLAMA_LOG_DEBUG("Warning: could not bind thread to NUMA node %d: %s\n", node, strerror(errno));
+            LLAMA_LOG_DEBUG("NUMA MIRRORING: Warning: could not bind thread to NUMA node %d: %s\n", node, strerror(errno));
             // Continue anyway - might still work
         }
 
         // Use posix_memalign for SIMD alignment
         void* ptr = nullptr;
         int ret = posix_memalign(&ptr, alignment, size);
         if (ret != 0) {
-            LLAMA_LOG_DEBUG("posix_memalign failed for %zu bytes with alignment %zu: %s\n",
+            LLAMA_LOG_DEBUG("NUMA MIRRORING: posix_memalign failed for %zu bytes with alignment %zu: %s\n",
                 size, alignment, strerror(ret));
             // Restore original thread binding
             if (old_mask) {
@@ -238,23 +339,23 @@ First-touch allocation at model weight loading time in llama-mmap.cpp:
             numa_free_nodemask(old_mask);
         }
 
-        LLAMA_LOG_DEBUG("✅ First-touch allocation: %zu bytes for node %d at %p (SIMD aligned to %zu bytes)\n",
+        LLAMA_LOG_DEBUG("NUMA MIRRORING: First-touch allocation: %zu bytes for node %d at %p (SIMD aligned to %zu bytes)\n",
             size, node, ptr, alignment);
         return ptr;
     }
 
     void mmap_numa_mirror(struct llama_file * file) {
         int num_nodes = numa_num_configured_nodes();
         if (num_nodes <= 1) {
-            throw std::runtime_error("NUMA mirror mode requires multiple NUMA nodes");
+            throw std::runtime_error("NUMA MIRRORING: NUMA mirror mode requires multiple NUMA nodes");
         }
 
-        LLAMA_LOG_DEBUG("NUMA mirroring enabled - allocating %.2f MB on each of %d nodes using first-touch\n",
+        LLAMA_LOG_DEBUG("NUMA MIRRORING: NUMA mirroring enabled - allocating %.2f MB on each of %d nodes using first-touch\n",
             file->size() / (1024.0 * 1024.0), num_nodes);
 
         size_t total_size = file->size();
         for (int node = 0; node < num_nodes; ++node) {
-            LLAMA_LOG_DEBUG("NUMA: Allocating on node %d using first-touch approach\n", node);
+            LLAMA_LOG_DEBUG("NUMA MIRRORING: Allocating on node %d using first-touch approach\n", node);
 
             void* node_mem = numa_alloc_first_touch(total_size, node);
             if (!node_mem) {
@@ -267,24 +368,24 @@ First-touch allocation at model weight loading time in llama-mmap.cpp:
             // VERIFICATION: Check that memory was actually allocated on the expected NUMA node
             int actual_node = -1;
             if (get_mempolicy(&actual_node, NULL, 0, node_mem, MPOL_F_NODE | MPOL_F_ADDR) == 0) {
-                LLAMA_LOG_DEBUG("NUMA: Memory at %p allocated on node %d (expected %d)\n",
+                LLAMA_LOG_DEBUG("NUMA MIRRORING: Memory at %p allocated on node %d (expected %d)\n",
                     node_mem, actual_node, node);
                 if (actual_node != node) {
-                    LLAMA_LOG_WARN("NUMA: WARNING: Memory allocated on wrong node! Expected %d, got %d\n",
+                    LLAMA_LOG_WARN("NUMA MIRRORING: WARNING: Memory allocated on wrong node! Expected %d, got %d\n",
                         node, actual_node);
                 } else {
-                    LLAMA_LOG_DEBUG("NUMA: ✅ First-touch succeeded - memory correctly placed on node %d\n", node);
+                    LLAMA_LOG_DEBUG("NUMA MIRRORING: First-touch succeeded - memory correctly placed on node %d\n", node);
                 }
             } else {
-                LLAMA_LOG_WARN("NUMA: Could not verify allocation node for %p: %s\n",
+                LLAMA_LOG_WARN("NUMA MIRRORING: Could not verify allocation node for %p: %s\n",
                     node_mem, strerror(errno));
             }
 
             file->seek(0, SEEK_SET);
             file->read_raw(node_mem, total_size);
             numa_mappings.push_back({node_mem, total_size});
-
-            LLAMA_LOG_DEBUG("NUMA: Successfully allocated and populated %.2f MB on node %d at %p\n",
+
+            LLAMA_LOG_DEBUG("NUMA MIRRORING: Successfully allocated and populated %.2f MB on node %d at %p\n",
                 total_size / (1024.0 * 1024.0), node, node_mem);
         }
         addr = numa_mappings.empty() ? nullptr : numa_mappings[0].addr;