
Commit 6d309d5

cleanup refs and logging
1 parent d357ef5 commit 6d309d5

8 files changed: +184 −313 lines changed

.github/instructions/numa-mirroring-implementation.md

Lines changed: 158 additions & 11 deletions
@@ -19,14 +19,14 @@ developer@81ec6c6e6af6:/workspaces/llama-cpp-dbsanfte-dev/llama-cpp-numa-mirror$
 
 With numa mirroring:
 ```
-build: dccea3c5 (6465)
-developer@81ec6c6e6af6:/workspaces/llama-cpp-dbsanfte-dev/llama-cpp-numa-mirror$ cd /workspaces/llama-cpp-dbsanfte-dev/llama-cpp-numa-mirror && ./build-release/bin/llama-bench -m ../.devcontainer/Qwen3-32B-Q6_K.gguf --numa mirror
+developer@81ec6c6e6af6:/workspaces/llama-cpp-dbsanfte-dev$ ./build/bin/llama-bench -m ./.devcontainer/Qwen3-32B-Q6_K.gguf --numa mirror
 | model                          |       size |     params | backend    | threads |            test |                  t/s |
 | ------------------------------ | ---------: | ---------: | ---------- | ------: | --------------: | -------------------: |
-| qwen3 32B Q6_K                 |  25.03 GiB |    32.76 B | CPU        |      56 |           pp512 |         16.22 ± 0.30 |
-| qwen3 32B Q6_K                 |  25.03 GiB |    32.76 B | CPU        |      56 |           tg128 |          2.80 ± 0.00 |
+| qwen3 32B Q6_K                 |  25.03 GiB |    32.76 B | CPU        |      56 |           pp512 |         21.36 ± 0.11 |
+| qwen3 32B Q6_K                 |  25.03 GiB |    32.76 B | CPU        |      56 |           tg128 |          2.70 ± 0.00 |
 
-build: dccea3c5 (6465)
+build: c665d3c9 (6468)
 ```
 
 ## Architecture
@@ -73,7 +73,7 @@ Clean integration point during model loading where NUMA mirrors are established
 **Purpose**: Model loading with explicit NUMA mirror setup
 **Key addition**:
 - Detection of model weight tensors during loading
-- Call to `tensor_set_data_with_numa_mirrors()` for weight tensors
+- Call to `tensor_set_data_with_numa_mirrors()` for weight tensors at model loading time
 - Clean integration with existing model loading pipeline
 
 #### `src/llama-mmap.h` and `src/llama-mmap.cpp`
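For orientation, here is a minimal loader-side sketch of the call mentioned in the hunk above. It is not code from this commit; the `attach_numa_mirrors()` helper and the `node_ptrs` array are hypothetical, and the accessor signatures are those shown later in this file's diff:

```c
// Hypothetical sketch: wire one model weight tensor to its per-NUMA-node
// copies. Assumes the per-node buffers were already filled by the
// first-touch allocation path shown further down in this file.
#include <numa.h>     // numa_num_configured_nodes(); link with -lnuma
#include "ggml.h"

static void attach_numa_mirrors(struct ggml_tensor * weight,
                                void * node_ptrs[GGML_NUMA_MAX_NODES]) {
    int node_count = numa_num_configured_nodes();
    if (node_count <= 1) {
        // Single-node machine: just set the primary pointer.
        tensor_set_data(weight, node_ptrs[0]);
        return;
    }
    // node_ptrs[0] is the primary copy, node_ptrs[1..] are the mirrors.
    tensor_set_data_with_numa_mirrors(weight, node_ptrs[0], node_ptrs, node_count);
}
```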
@@ -136,12 +136,159 @@ Instead of directly addressing `tensor->data`, you do `tensor_data(tensor)`
 
 The `tensor_data()` function in `ggml.h` has been optimized with a fast path:
 ```c
-static inline void * tensor_data(const struct ggml_tensor * tensor) {
-    if (tensor->numa_mirror_data == NULL) {
-        return tensor->data; // Fast path: no NUMA mirrors
+// Tensor data accessor functions for NUMA model mirroring compatibility:
+
+// External thread-local variable set at OMP threadpool creation time
+extern __thread int ggml_current_numa_node;
+
+static inline void * tensor_data(const struct ggml_tensor * tensor) {
+    // Fast path: if no NUMA mirrors exist, avoid thread-local access entirely
+    if (tensor->__data[1] == NULL) {
+        return tensor->__data[0];
+    }
+
+    // NUMA path: only read thread-local variable when NUMA mirrors exist
+    int numa_node = ggml_current_numa_node;
+    if (numa_node > 0 && numa_node < GGML_NUMA_MAX_NODES
+        && tensor->__data[numa_node] != NULL) {
+        return tensor->__data[numa_node];
+    }
+
+    return tensor->__data[0];
+}
+
+static inline void tensor_set_data(struct ggml_tensor * tensor, void * data) {
+    tensor->__data[0] = data;
+}
+
+// Model loading specific function - bypasses normal tensor_set_data logic
+static inline void tensor_set_data_with_numa_mirrors(struct ggml_tensor * tensor,
+                                                     void * primary_data,
+                                                     void ** numa_node_data,
+                                                     int numa_node_count) {
+    // Set primary data (node 0)
+    tensor->__data[0] = primary_data;
+
+    // Set NUMA mirrors for other nodes
+    for (int node = 1; node < numa_node_count && node < GGML_NUMA_MAX_NODES; node++) {
+        tensor->__data[node] = numa_node_data[node];
+    }
+
+    // Clear remaining slots
+    for (int node = numa_node_count; node < GGML_NUMA_MAX_NODES; node++) {
+        tensor->__data[node] = NULL;
+    }
+}
+```
+
+Thread-local variables at OMP thread-creation time in ggml-cpu.c:
+```c
+
+```
+
+First-touch allocation at model weight loading time in llama-mmap.cpp:
+```c
+// NUMA mirror logic: allocate and populate model weights on each NUMA node
+struct numa_mapping {
+    void* addr;
+    size_t size;
+};
+std::vector<numa_mapping> numa_mappings;
+
+// NUMA allocation using first-touch approach with thread affinity binding
+void* numa_alloc_first_touch(size_t size, int node) {
+    // Define SIMD alignment (same as ggml_aligned_malloc)
+#if defined(__s390x__)
+    const size_t alignment = 256;
+#else
+    const size_t alignment = 64;
+#endif
+
+    // Bind current thread to the target NUMA node for first-touch
+    struct bitmask* old_mask = numa_get_run_node_mask();
+    if (numa_run_on_node(node) != 0) {
+        LLAMA_LOG_DEBUG("Warning: could not bind thread to NUMA node %d: %s\n", node, strerror(errno));
+        // Continue anyway - might still work
+    }
+
+    // Use posix_memalign for SIMD alignment
+    void* ptr = nullptr;
+    int ret = posix_memalign(&ptr, alignment, size);
+    if (ret != 0) {
+        LLAMA_LOG_DEBUG("posix_memalign failed for %zu bytes with alignment %zu: %s\n",
+                        size, alignment, strerror(ret));
+        // Restore original thread binding
+        if (old_mask) {
+            numa_run_on_node_mask(old_mask);
+            numa_free_nodemask(old_mask);
+        }
+        return nullptr;
+    }
+
+    // First-touch: touch every page to ensure physical allocation on current node
+    volatile char* mem = (volatile char*)ptr;
+    const size_t page_size = sysconf(_SC_PAGESIZE);
+    for (size_t i = 0; i < size; i += page_size) {
+        mem[i] = 0; // First touch allocates the page on current NUMA node
+    }
+
+    // Restore original thread binding
+    if (old_mask) {
+        numa_run_on_node_mask(old_mask);
+        numa_free_nodemask(old_mask);
+    }
+
+    LLAMA_LOG_DEBUG("✅ First-touch allocation: %zu bytes for node %d at %p (SIMD aligned to %zu bytes)\n",
+                    size, node, ptr, alignment);
+    return ptr;
+}
+
+void mmap_numa_mirror(struct llama_file * file) {
+    int num_nodes = numa_num_configured_nodes();
+    if (num_nodes <= 1) {
+        throw std::runtime_error("NUMA mirror mode requires multiple NUMA nodes");
+    }
+
+    LLAMA_LOG_DEBUG("NUMA mirroring enabled - allocating %.2f MB on each of %d nodes using first-touch\n",
+                    file->size() / (1024.0 * 1024.0), num_nodes);
+
+    size_t total_size = file->size();
+    for (int node = 0; node < num_nodes; ++node) {
+        LLAMA_LOG_DEBUG("NUMA: Allocating on node %d using first-touch approach\n", node);
+
+        void* node_mem = numa_alloc_first_touch(total_size, node);
+        if (!node_mem) {
+            for (const auto& mapping : numa_mappings) {
+                free(mapping.addr); // Use free() for posix_memalign allocated memory
+            }
+            throw std::runtime_error("NUMA mirror allocation failed");
+        }
+
+        // VERIFICATION: Check that memory was actually allocated on the expected NUMA node
+        int actual_node = -1;
+        if (get_mempolicy(&actual_node, NULL, 0, node_mem, MPOL_F_NODE | MPOL_F_ADDR) == 0) {
+            LLAMA_LOG_DEBUG("NUMA: Memory at %p allocated on node %d (expected %d)\n",
+                            node_mem, actual_node, node);
+            if (actual_node != node) {
+                LLAMA_LOG_WARN("NUMA: WARNING: Memory allocated on wrong node! Expected %d, got %d\n",
+                               node, actual_node);
+            } else {
+                LLAMA_LOG_DEBUG("NUMA: ✅ First-touch succeeded - memory correctly placed on node %d\n", node);
+            }
+        } else {
+            LLAMA_LOG_WARN("NUMA: Could not verify allocation node for %p: %s\n",
+                           node_mem, strerror(errno));
+        }
+
+        file->seek(0, SEEK_SET);
+        file->read_raw(node_mem, total_size);
+        numa_mappings.push_back({node_mem, total_size});
+
+        LLAMA_LOG_DEBUG("NUMA: Successfully allocated and populated %.2f MB on node %d at %p\n",
+                        total_size / (1024.0 * 1024.0), node, node_mem);
+    }
+    addr = numa_mappings.empty() ? nullptr : numa_mappings[0].addr;
 }
-    return ggml_numa_get_tensor_data(tensor); // NUMA-aware routing
-}
 ```
 
 This optimization ensures minimal overhead for intermediate computation tensors while enabling NUMA routing for model weights.
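The ggml-cpu.c block in the added text above is left empty in this revision. As a rough sketch of the mechanism it refers to (not the commit's code; it assumes threads are already pinned and that `sched_getcpu()` and libnuma are available), each worker thread could record its node in the thread-local variable once at startup:

```c
// Rough sketch only - not taken from ggml-cpu.c in this commit.
// Each compute thread records the NUMA node it runs on so that
// tensor_data() can return the node-local mirror of model weights.
#define _GNU_SOURCE
#include <sched.h>   // sched_getcpu()
#include <numa.h>    // numa_available(), numa_node_of_cpu(); link with -lnuma

// In the real tree this is defined once inside ggml; defined here only to
// keep the sketch self-contained.
__thread int ggml_current_numa_node = 0;

static void ggml_thread_bind_numa_node(void) {
    if (numa_available() < 0) {
        return; // libnuma unusable: keep node 0, tensor_data() falls back to __data[0]
    }
    int cpu  = sched_getcpu();
    int node = (cpu >= 0) ? numa_node_of_cpu(cpu) : -1;
    if (node >= 0) {
        ggml_current_numa_node = node;
    }
}
```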

fix_tensor_data.py

Lines changed: 0 additions & 64 deletions
This file was deleted.

fix_tensor_data_conservative.py

Lines changed: 0 additions & 74 deletions
This file was deleted.

ggml/include/ggml.h

Lines changed: 2 additions & 10 deletions
@@ -651,9 +651,6 @@ extern "C" {
         size_t view_offs;
 
         union {
-#ifdef __NVCC__
-            void * data;
-#endif
             void * __data[GGML_NUMA_MAX_NODES];
         };

@@ -666,9 +663,9 @@ extern "C" {
 
     static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
 
-    // Tensor data accessor functions for NUMA compatibility
+    // Tensor data accessor functions for NUMA model mirroring compatibility:
 
-    // External thread-local variable set by NUMA coordinator
+    // External thread-local variable set at OMP threadpool creation time
     extern __thread int ggml_current_numa_node;
 
     static inline void * tensor_data(const struct ggml_tensor * tensor) {
@@ -708,11 +705,6 @@ extern "C" {
         for (int node = numa_node_count; node < GGML_NUMA_MAX_NODES; node++) {
             tensor->__data[node] = NULL;
         }
-
-#ifdef GGML_NUMA_DEBUG_VERBOSE
-        printf("✅ NUMA SETUP COMPLETE: %s with %d mirrors\n", tensor->name, numa_node_count - 1);
-        fflush(stdout);
-#endif
     }
 
     // Abort callback
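The instructions file edited above notes that code should go through `tensor_data()` / `tensor_set_data()` rather than touching `tensor->data`. A small usage sketch against the accessors declared in this header (the helper itself is hypothetical, not part of the diff):

```c
// Usage sketch: reads go through tensor_data(), which may resolve to the
// mirror on the calling thread's NUMA node; writes go through
// tensor_set_data(), which updates the primary pointer, __data[0].
#include <string.h>
#include "ggml.h"

static void copy_payload_numa_aware(struct ggml_tensor * dst,
                                    const struct ggml_tensor * src,
                                    void * dst_buf) {
    memcpy(dst_buf, tensor_data(src), ggml_nbytes(src));
    tensor_set_data(dst, dst_buf);
}
```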
