@@ -19,14 +19,14 @@ developer@81ec6c6e6af6:/workspaces/llama-cpp-dbsanfte-dev/llama-cpp-numa-mirror$

With numa mirroring:
```
-build: dccea3c5 (6465)
-developer@81ec6c6e6af6:/workspaces/llama-cpp-dbsanfte-dev/llama-cpp-numa-mirror$ cd /workspaces/llama-cpp-dbsanfte-dev/llama-cpp-numa-mirror && ./build-release/bin/llama-bench -m ../.devcontainer/Qwen3-32B-Q6_K.gguf --numa mirror
+developer@81ec6c6e6af6:/workspaces/llama-cpp-dbsanfte-dev$ ./build/bin/llama-bench -m .
+/.devcontainer/Qwen3-32B-Q6_K.gguf --numa mirror
| model                          |       size |     params | backend    | threads |            test |                  t/s |
| ------------------------------ | ---------: | ---------: | ---------- | ------: | --------------: | -------------------: |
-| qwen3 32B Q6_K                 |  25.03 GiB |    32.76 B | CPU        |      56 |           pp512 |         16.22 ± 0.30 |
-| qwen3 32B Q6_K                 |  25.03 GiB |    32.76 B | CPU        |      56 |           tg128 |          2.80 ± 0.00 |
+| qwen3 32B Q6_K                 |  25.03 GiB |    32.76 B | CPU        |      56 |           pp512 |         21.36 ± 0.11 |
+| qwen3 32B Q6_K                 |  25.03 GiB |    32.76 B | CPU        |      56 |           tg128 |          2.70 ± 0.00 |

-build: dccea3c5 (6465)
+build: c665d3c9 (6468)
```

## Architecture
@@ -73,7 +73,7 @@ Clean integration point during model loading where NUMA mirrors are established
**Purpose**: Model loading with explicit NUMA mirror setup
**Key additions**:
- Detection of model weight tensors during loading
-- Call to `tensor_set_data_with_numa_mirrors()` for weight tensors
+- Call to `tensor_set_data_with_numa_mirrors()` for weight tensors at model loading time
- Clean integration with existing model loading pipeline

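To make this integration point concrete, here is a minimal sketch of how a weight tensor could be pointed at its per-node copies via `tensor_set_data_with_numa_mirrors()`. This is not the actual loader code; the helper name, the `numa_base` array, and the `off` offset are assumptions for illustration.

```c
// Hypothetical illustration - assumes each NUMA node already holds a full
// copy of the mapped weights at numa_base[node], and that `off` is the
// tensor's byte offset within that mapping.
static void assign_weight_tensor(struct ggml_tensor * t,
                                 void * numa_base[], int n_nodes, size_t off) {
    void * per_node[GGML_NUMA_MAX_NODES] = { NULL };
    for (int node = 0; node < n_nodes && node < GGML_NUMA_MAX_NODES; node++) {
        per_node[node] = (char *) numa_base[node] + off;  // same offset on every mirror
    }
    // Node 0 pointer is the primary; the rest become NUMA mirrors
    tensor_set_data_with_numa_mirrors(t, per_node[0], per_node, n_nodes);
}
```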
#### `src/llama-mmap.h` and `src/llama-mmap.cpp`
@@ -136,12 +136,159 @@ Instead of directly addressing `tensor->data`, instead you do `tensor_data(tenso

The `tensor_data()` function in `ggml.h` has been optimized with a fast path:
```c
-static inline void * tensor_data(const struct ggml_tensor * tensor) {
-    if (tensor->numa_mirror_data == NULL) {
-        return tensor->data; // Fast path: no NUMA mirrors
+// Tensor data accessor functions for NUMA model mirroring compatibility:
+
+// External thread-local variable set at OMP threadpool creation time
+extern __thread int ggml_current_numa_node;
+
+static inline void * tensor_data(const struct ggml_tensor * tensor) {
+    // Fast path: if no NUMA mirrors exist, avoid thread-local access entirely
+    if (tensor->__data[1] == NULL) {
+        return tensor->__data[0];
+    }
+
+    // NUMA path: only read thread-local variable when NUMA mirrors exist
+    int numa_node = ggml_current_numa_node;
+    if (numa_node > 0 && numa_node < GGML_NUMA_MAX_NODES
+        && tensor->__data[numa_node] != NULL) {
+        return tensor->__data[numa_node];
+    }
+
+    return tensor->__data[0];
+}
+
+static inline void tensor_set_data(struct ggml_tensor * tensor, void * data) {
+    tensor->__data[0] = data;
+}
+
+// Model loading specific function - bypasses normal tensor_set_data logic
+static inline void tensor_set_data_with_numa_mirrors(struct ggml_tensor * tensor,
+                                                     void * primary_data,
+                                                     void ** numa_node_data,
+                                                     int numa_node_count) {
+    // Set primary data (node 0)
+    tensor->__data[0] = primary_data;
+
+    // Set NUMA mirrors for other nodes
+    for (int node = 1; node < numa_node_count && node < GGML_NUMA_MAX_NODES; node++) {
+        tensor->__data[node] = numa_node_data[node];
+    }
+
+    // Clear remaining slots
+    for (int node = numa_node_count; node < GGML_NUMA_MAX_NODES; node++) {
+        tensor->__data[node] = NULL;
+    }
+}
+```
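Call sites then read weights through the accessor rather than the raw pointer. A hedged example of the pattern (hypothetical compute fragment; the tensors and `scale` are assumed):

```c
// Hypothetical kernel fragment: going through tensor_data() means a thread
// whose ggml_current_numa_node points at node 1 reads node 1's weight mirror.
const float * w = (const float *) tensor_data(weight_tensor);
float       * y = (float *)       tensor_data(dst_tensor);
for (int64_t i = 0; i < ggml_nelements(weight_tensor); i++) {
    y[i] = w[i] * scale;
}
```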
+
+The thread-local variable is set at OMP threadpool creation time in `ggml-cpu.c`:
+```c
+
+```
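As a rough illustration of the idea described above, here is a sketch of what that per-thread initialization could look like, assuming libnuma and an OpenMP parallel region (this is not the actual `ggml-cpu.c` source):

```c
// Sketch only: each OpenMP worker records the NUMA node of the CPU it runs on,
// so tensor_data() can later select the matching weight mirror.
__thread int ggml_current_numa_node = 0;

static void ggml_numa_tag_omp_threads(void) {
    #pragma omp parallel
    {
        int cpu  = sched_getcpu();                          // needs <sched.h>
        int node = (cpu >= 0) ? numa_node_of_cpu(cpu) : 0;  // needs <numa.h>
        ggml_current_numa_node = (node >= 0) ? node : 0;
    }
}
```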
+
+First-touch allocation at model weight loading time in `llama-mmap.cpp`:
+```cpp
+// NUMA mirror logic: allocate and populate model weights on each NUMA node
+struct numa_mapping {
+    void* addr;
+    size_t size;
+};
+std::vector<numa_mapping> numa_mappings;
+
+// NUMA allocation using first-touch approach with thread affinity binding
+void* numa_alloc_first_touch(size_t size, int node) {
+    // Define SIMD alignment (same as ggml_aligned_malloc)
+#if defined(__s390x__)
+    const size_t alignment = 256;
+#else
+    const size_t alignment = 64;
+#endif
+
+    // Bind current thread to the target NUMA node for first-touch
+    struct bitmask* old_mask = numa_get_run_node_mask();
+    if (numa_run_on_node(node) != 0) {
+        LLAMA_LOG_DEBUG("Warning: could not bind thread to NUMA node %d: %s\n", node, strerror(errno));
+        // Continue anyway - might still work
+    }
+
+    // Use posix_memalign for SIMD alignment
+    void* ptr = nullptr;
+    int ret = posix_memalign(&ptr, alignment, size);
+    if (ret != 0) {
+        LLAMA_LOG_DEBUG("posix_memalign failed for %zu bytes with alignment %zu: %s\n",
+                        size, alignment, strerror(ret));
+        // Restore original thread binding
+        if (old_mask) {
+            numa_run_on_node_mask(old_mask);
+            numa_free_nodemask(old_mask);
+        }
+        return nullptr;
+    }
+
+    // First-touch: touch every page to ensure physical allocation on current node
+    volatile char* mem = (volatile char*)ptr;
+    const size_t page_size = sysconf(_SC_PAGESIZE);
+    for (size_t i = 0; i < size; i += page_size) {
+        mem[i] = 0; // First touch allocates the page on current NUMA node
+    }
+
+    // Restore original thread binding
+    if (old_mask) {
+        numa_run_on_node_mask(old_mask);
+        numa_free_nodemask(old_mask);
+    }
+
+    LLAMA_LOG_DEBUG("✅ First-touch allocation: %zu bytes for node %d at %p (SIMD aligned to %zu bytes)\n",
+                    size, node, ptr, alignment);
+    return ptr;
+}
+
+void mmap_numa_mirror(struct llama_file * file) {
+    int num_nodes = numa_num_configured_nodes();
+    if (num_nodes <= 1) {
+        throw std::runtime_error("NUMA mirror mode requires multiple NUMA nodes");
+    }
+
+    LLAMA_LOG_DEBUG("NUMA mirroring enabled - allocating %.2f MB on each of %d nodes using first-touch\n",
+                    file->size() / (1024.0 * 1024.0), num_nodes);
+
+    size_t total_size = file->size();
+    for (int node = 0; node < num_nodes; ++node) {
+        LLAMA_LOG_DEBUG("NUMA: Allocating on node %d using first-touch approach\n", node);
+
+        void* node_mem = numa_alloc_first_touch(total_size, node);
+        if (!node_mem) {
+            for (const auto& mapping : numa_mappings) {
+                free(mapping.addr); // Use free() for posix_memalign allocated memory
+            }
+            throw std::runtime_error("NUMA mirror allocation failed");
+        }
+
+        // VERIFICATION: Check that memory was actually allocated on the expected NUMA node
+        int actual_node = -1;
+        if (get_mempolicy(&actual_node, NULL, 0, node_mem, MPOL_F_NODE | MPOL_F_ADDR) == 0) {
+            LLAMA_LOG_DEBUG("NUMA: Memory at %p allocated on node %d (expected %d)\n",
+                            node_mem, actual_node, node);
+            if (actual_node != node) {
+                LLAMA_LOG_WARN("NUMA: WARNING: Memory allocated on wrong node! Expected %d, got %d\n",
+                               node, actual_node);
+            } else {
+                LLAMA_LOG_DEBUG("NUMA: ✅ First-touch succeeded - memory correctly placed on node %d\n", node);
+            }
+        } else {
+            LLAMA_LOG_WARN("NUMA: Could not verify allocation node for %p: %s\n",
+                           node_mem, strerror(errno));
+        }
+
+        file->seek(0, SEEK_SET);
+        file->read_raw(node_mem, total_size);
+        numa_mappings.push_back({node_mem, total_size});
+
+        LLAMA_LOG_DEBUG("NUMA: Successfully allocated and populated %.2f MB on node %d at %p\n",
+                        total_size / (1024.0 * 1024.0), node, node_mem);
+    }
+    addr = numa_mappings.empty() ? nullptr : numa_mappings[0].addr;
}
-    return ggml_numa_get_tensor_data(tensor); // NUMA-aware routing
-}
```

This optimization ensures minimal overhead for intermediate computation tensors while enabling NUMA routing for model weights.