Commit 48d8d59

cleanup more logging, add impl details for LLM agent
1 parent 6d309d5 commit 48d8d59

File tree

2 files changed: +130 -29 lines changed


.github/instructions/numa-mirroring-implementation.md

Lines changed: 118 additions & 17 deletions
@@ -130,11 +130,13 @@ cmake --build build --parallel
 ## Implementation Details
 
 ### Tensor Data Access Optimization
-The `ggml_tensor` struct in `ggml.h` has been updated to no longer have a `data` field. This has been renamed to a `__data[]` array to hold pointers to multiple memory locations, with the index corresponding to the index of a local Numa node.
 
-Instead of directly addressing `tensor->data`, instead you do `tensor_data(tensor)`. And setting is done with `tensor_set_data()`. These are two new macros in `ggml.h`.
 
-The `tensor_data()` function in `ggml.h` has been optimized with a fast path:
+In `ggml.h`:
+
+The `ggml_tensor` struct no longer has a `data` field. It has been replaced by a `__data[]` array that holds pointers to multiple memory locations, with the index corresponding to the local NUMA node.
+
+Instead of addressing `tensor->data` directly, there are two new macros: `tensor_data(tensor)` for getting and `tensor_set_data()` for setting. The `tensor_data()` accessor in `ggml.h` has been optimized with a fast path.
 ```c
 // Tensor data accessor functions for NUMA model mirroring compatibility:
 
@@ -181,13 +183,112 @@ The `tensor_data()` function in `ggml.h` has been optimized with a fast path:
 }
 ```
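
The hunk above shows only the header comment and the closing brace of the accessor block, so the full definitions are not visible in this diff. As a rough, non-authoritative sketch of how a per-node getter with a node-0 fast path could be shaped (the `__data[]` field, `tensor_data()`, `tensor_set_data()` and `ggml_current_numa_node` names come from this commit; the struct name, array size, fallback logic and the use of inline functions instead of macros are illustrative assumptions):

```c
// Illustrative sketch only - not the committed ggml.h code.
#include <stddef.h>

#define EXAMPLE_NUMA_MAX_NODES 8

// In the commit this is a thread-local defined in ggml-cpu.c; defined here
// only so the sketch is self-contained.
static __thread int ggml_current_numa_node = 0;

struct example_tensor {
    void * __data[EXAMPLE_NUMA_MAX_NODES];   // one pointer per NUMA node mirror
};

// Getter: fast path falls back to the primary copy (index 0) for unbound
// threads or when no mirror exists on the caller's node.
static inline void * tensor_data(const struct example_tensor * t) {
    const int node = ggml_current_numa_node;
    if (node <= 0 || node >= EXAMPLE_NUMA_MAX_NODES || t->__data[node] == NULL) {
        return t->__data[0];
    }
    return t->__data[node];
}

// Setter: updates the primary copy; per-node mirrors are populated separately.
static inline void tensor_set_data(struct example_tensor * t, void * data) {
    t->__data[0] = data;
}
```

Call sites then read through `tensor_data(tensor)` and write through `tensor_set_data()` instead of touching `tensor->data` directly.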
 
-Thread-local variables at OMP thread-creation time in ggml-cpu.c:
+In `ggml-cpu.c`: Thread-local variables at OMP thread-creation time
 ```c
+// External thread-local variable for NUMA node binding
+extern __thread int ggml_current_numa_node;
+
+// Thread-local NUMA node assignment for OpenMP threads
+// Using static initialization to avoid syscalls in hot paths
+static __thread int ggml_thread_numa_node = -1;
+static __thread bool ggml_thread_numa_initialized = false;
+```
 
+In `ggml-cpu.c`: Bind an OMP thread to its NUMA node at creation time
+```c
+if (n_threads > 1) {
+    #pragma omp parallel num_threads(n_threads)
+    {
+        // Bind OpenMP threads to NUMA nodes in round-robin fashion
+        // This must be done early in the parallel region before any work
+        ggml_openmp_bind_thread_to_numa_node(omp_get_thread_num(), omp_get_num_threads());
 ```
 
-First-touch allocation at model weight loading time in llama-mmap.cpp:
+In `ggml-cpu.c`: NUMA detection and binding logic
 ```c
+bool ggml_is_numa(void) {
+    // Return true if:
+    // 1. Multiple physical NUMA nodes are present, OR
+    // 2. User explicitly requested NUMA mirror strategy (--numa mirror)
+    return g_state.numa.n_nodes > 1 ||
+           g_state.numa.numa_strategy == GGML_NUMA_STRATEGY_MIRROR;
+}
+
+// Static caching for NUMA thread binding to avoid syscalls in hot OpenMP paths
+static void ggml_openmp_bind_thread_to_numa_node(int thread_id, int n_threads) {
+    // Cache strategy check to avoid repeated calls
+    static bool strategy_checked = false;
+    static bool is_numa_mirror = false;
+    static int num_numa_nodes = 0;
+
+    if (!strategy_checked) {
+        is_numa_mirror = (g_state.numa.numa_strategy == GGML_NUMA_STRATEGY_MIRROR);
+        if (is_numa_mirror) {
+            num_numa_nodes = numa_max_node() + 1;
+        }
+        strategy_checked = true;
+    }
+
+    // Only apply binding in NUMA mirror mode with multiple nodes
+    if (!is_numa_mirror || num_numa_nodes <= 1) {
+        return;
+    }
+
+    // Check if this thread is already initialized to avoid repeated binding
+    if (ggml_thread_numa_initialized) {
+        return;
+    }
+
+    // Round-robin assignment of threads to NUMA nodes
+    int target_numa_node = thread_id % num_numa_nodes;
+
+    // Cache CPU masks statically to avoid repeated numa_allocate_cpumask() calls
+    static struct bitmask *node_cpumasks[GGML_NUMA_MAX_NODES] = {0};
+    static bool cpumasks_initialized = false;
+    static cpu_set_t node_cpusets[GGML_NUMA_MAX_NODES];
+    static bool cpusets_valid[GGML_NUMA_MAX_NODES] = {0};
+
+    if (!cpumasks_initialized) {
+        for (int node = 0; node < num_numa_nodes && node < GGML_NUMA_MAX_NODES; node++) {
+            node_cpumasks[node] = numa_allocate_cpumask();
+            if (node_cpumasks[node] && numa_node_to_cpus(node, node_cpumasks[node]) == 0) {
+                // Convert NUMA bitmask to cpu_set_t for faster thread binding
+                CPU_ZERO(&node_cpusets[node]);
+                for (int cpu = 0; cpu < numa_num_possible_cpus(); cpu++) {
+                    if (numa_bitmask_isbitset(node_cpumasks[node], cpu)) {
+                        CPU_SET(cpu, &node_cpusets[node]);
+                    }
+                }
+                cpusets_valid[node] = true;
+            }
+        }
+        cpumasks_initialized = true;
+    }
+
+    // Bind thread if we have a valid CPU set for the target node
+    if (target_numa_node < GGML_NUMA_MAX_NODES && cpusets_valid[target_numa_node]) {
+        if (sched_setaffinity(0, sizeof(cpu_set_t), &node_cpusets[target_numa_node]) == 0) {
+            // Set memory allocation preference and thread-local node assignment
+            numa_set_preferred(target_numa_node);
+            ggml_thread_numa_node = target_numa_node;
+            ggml_thread_numa_initialized = true;
+
+            // Update the global thread-local variable for tensor data access
+            ggml_current_numa_node = target_numa_node;
+
+            // Debug output using standard GGML logging
+            GGML_LOG_DEBUG("NUMA: Bound OpenMP thread %d to NUMA node %d (total threads: %d)\n",
+                           thread_id, target_numa_node, n_threads);
+        }
+    }
+}
+```
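
For reference, the round-robin assignment above is just `thread_id % num_numa_nodes`. A standalone toy program (example values only, not part of the commit) makes the resulting mapping concrete:

```c
#include <stdio.h>

// Illustration of the round-robin thread-to-node assignment used above
// (thread_id % num_numa_nodes); node and thread counts are example values.
int main(void) {
    const int num_numa_nodes = 2;
    const int n_threads      = 6;
    for (int thread_id = 0; thread_id < n_threads; thread_id++) {
        printf("OpenMP thread %d -> NUMA node %d\n",
               thread_id, thread_id % num_numa_nodes);
    }
    return 0;
}
```

Each bound thread then publishes its node in `ggml_current_numa_node`, which is what `tensor_data()` consults to pick the local weight mirror.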
+
+In `llama-mmap.cpp`: First-touch allocation at model weight loading time
+```c
+struct llama_mmap::impl {
+#ifdef _POSIX_MAPPED_FILES
+    std::vector<std::pair<size_t, size_t>> mapped_fragments;
     // NUMA mirror logic: allocate and populate model weights on each NUMA node
     struct numa_mapping {
         void* addr;
@@ -207,15 +308,15 @@ First-touch allocation at model weight loading time in llama-mmap.cpp:
         // Bind current thread to the target NUMA node for first-touch
         struct bitmask* old_mask = numa_get_run_node_mask();
         if (numa_run_on_node(node) != 0) {
-            LLAMA_LOG_DEBUG("Warning: could not bind thread to NUMA node %d: %s\n", node, strerror(errno));
+            LLAMA_LOG_DEBUG("NUMA MIRRORING: Warning: could not bind thread to NUMA node %d: %s\n", node, strerror(errno));
             // Continue anyway - might still work
         }
 
         // Use posix_memalign for SIMD alignment
         void* ptr = nullptr;
         int ret = posix_memalign(&ptr, alignment, size);
         if (ret != 0) {
-            LLAMA_LOG_DEBUG("posix_memalign failed for %zu bytes with alignment %zu: %s\n",
+            LLAMA_LOG_DEBUG("NUMA MIRRORING: posix_memalign failed for %zu bytes with alignment %zu: %s\n",
                             size, alignment, strerror(ret));
             // Restore original thread binding
             if (old_mask) {
@@ -238,23 +339,23 @@ First-touch allocation at model weight loading time in llama-mmap.cpp:
             numa_free_nodemask(old_mask);
         }
 
-        LLAMA_LOG_DEBUG(" First-touch allocation: %zu bytes for node %d at %p (SIMD aligned to %zu bytes)\n",
+        LLAMA_LOG_DEBUG("NUMA MIRRORING: First-touch allocation: %zu bytes for node %d at %p (SIMD aligned to %zu bytes)\n",
                         size, node, ptr, alignment);
         return ptr;
     }
 
     void mmap_numa_mirror(struct llama_file * file) {
         int num_nodes = numa_num_configured_nodes();
         if (num_nodes <= 1) {
-            throw std::runtime_error("NUMA mirror mode requires multiple NUMA nodes");
+            throw std::runtime_error("NUMA MIRRORING: NUMA mirror mode requires multiple NUMA nodes");
         }
 
-        LLAMA_LOG_DEBUG("NUMA mirroring enabled - allocating %.2f MB on each of %d nodes using first-touch\n",
+        LLAMA_LOG_DEBUG("NUMA MIRRORING: NUMA mirroring enabled - allocating %.2f MB on each of %d nodes using first-touch\n",
                         file->size() / (1024.0 * 1024.0), num_nodes);
 
         size_t total_size = file->size();
         for (int node = 0; node < num_nodes; ++node) {
-            LLAMA_LOG_DEBUG("NUMA: Allocating on node %d using first-touch approach\n", node);
+            LLAMA_LOG_DEBUG("NUMA MIRRORING: Allocating on node %d using first-touch approach\n", node);
 
             void* node_mem = numa_alloc_first_touch(total_size, node);
             if (!node_mem) {
@@ -267,24 +368,24 @@ First-touch allocation at model weight loading time in llama-mmap.cpp:
             // VERIFICATION: Check that memory was actually allocated on the expected NUMA node
             int actual_node = -1;
             if (get_mempolicy(&actual_node, NULL, 0, node_mem, MPOL_F_NODE | MPOL_F_ADDR) == 0) {
-                LLAMA_LOG_DEBUG("NUMA: Memory at %p allocated on node %d (expected %d)\n",
+                LLAMA_LOG_DEBUG("NUMA MIRRORING: Memory at %p allocated on node %d (expected %d)\n",
                                 node_mem, actual_node, node);
                 if (actual_node != node) {
-                    LLAMA_LOG_WARN("NUMA: WARNING: Memory allocated on wrong node! Expected %d, got %d\n",
+                    LLAMA_LOG_WARN("NUMA MIRRORING: WARNING: Memory allocated on wrong node! Expected %d, got %d\n",
                                    node, actual_node);
                 } else {
-                    LLAMA_LOG_DEBUG("NUMA: ✅ First-touch succeeded - memory correctly placed on node %d\n", node);
+                    LLAMA_LOG_DEBUG("NUMA MIRRORING: First-touch succeeded - memory correctly placed on node %d\n", node);
                 }
             } else {
-                LLAMA_LOG_WARN("NUMA: Could not verify allocation node for %p: %s\n",
+                LLAMA_LOG_WARN("NUMA MIRRORING: Could not verify allocation node for %p: %s\n",
                                node_mem, strerror(errno));
             }
 
             file->seek(0, SEEK_SET);
             file->read_raw(node_mem, total_size);
             numa_mappings.push_back({node_mem, total_size});
-
-            LLAMA_LOG_DEBUG("NUMA: Successfully allocated and populated %.2f MB on node %d at %p\n",
+
+            LLAMA_LOG_DEBUG("NUMA MIRRORING: Successfully allocated and populated %.2f MB on node %d at %p\n",
                             total_size / (1024.0 * 1024.0), node, node_mem);
         }
         addr = numa_mappings.empty() ? nullptr : numa_mappings[0].addr;
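
The allocation path above relies on the first-touch policy: once the thread is bound to the target node, physical pages are placed on that node the moment they are first written, which in the committed code happens during `file->read_raw(node_mem, total_size)`. A minimal sketch of that idea, assuming libnuma and omitting the error handling and binding restoration the committed code performs (the helper name is hypothetical):

```c
#include <numa.h>      // link with -lnuma
#include <stdlib.h>
#include <string.h>

// Minimal sketch of first-touch placement: run on the target node, reserve
// aligned virtual memory, then write it so the kernel backs the pages with
// physical memory on that node.
static void * example_first_touch_alloc(size_t size, int node) {
    numa_run_on_node(node);                  // page faults will prefer this node

    void * ptr = NULL;
    if (posix_memalign(&ptr, 64, size) != 0) {
        return NULL;                         // only virtual reservation so far
    }

    memset(ptr, 0, size);                    // "first touch": pages land on `node`
    return ptr;
}
```

In the commit, the touch and the population are one step: reading the model file into the buffer both places the pages and fills them with the weights for that node's mirror.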

src/llama-mmap.cpp

Lines changed: 12 additions & 12 deletions
@@ -313,15 +313,15 @@ struct llama_mmap::impl {
     // Bind current thread to the target NUMA node for first-touch
     struct bitmask* old_mask = numa_get_run_node_mask();
     if (numa_run_on_node(node) != 0) {
-        LLAMA_LOG_DEBUG("Warning: could not bind thread to NUMA node %d: %s\n", node, strerror(errno));
+        LLAMA_LOG_DEBUG("NUMA MIRRORING: Warning: could not bind thread to NUMA node %d: %s\n", node, strerror(errno));
         // Continue anyway - might still work
     }
 
     // Use posix_memalign for SIMD alignment
     void* ptr = nullptr;
     int ret = posix_memalign(&ptr, alignment, size);
     if (ret != 0) {
-        LLAMA_LOG_DEBUG("posix_memalign failed for %zu bytes with alignment %zu: %s\n",
+        LLAMA_LOG_DEBUG("NUMA MIRRORING: posix_memalign failed for %zu bytes with alignment %zu: %s\n",
                         size, alignment, strerror(ret));
         // Restore original thread binding
         if (old_mask) {
@@ -344,23 +344,23 @@ struct llama_mmap::impl {
         numa_free_nodemask(old_mask);
     }
 
-    LLAMA_LOG_DEBUG(" First-touch allocation: %zu bytes for node %d at %p (SIMD aligned to %zu bytes)\n",
+    LLAMA_LOG_DEBUG("NUMA MIRRORING: First-touch allocation: %zu bytes for node %d at %p (SIMD aligned to %zu bytes)\n",
                     size, node, ptr, alignment);
     return ptr;
 }
 
 void mmap_numa_mirror(struct llama_file * file) {
     int num_nodes = numa_num_configured_nodes();
     if (num_nodes <= 1) {
-        throw std::runtime_error("NUMA mirror mode requires multiple NUMA nodes");
+        throw std::runtime_error("NUMA MIRRORING: NUMA mirror mode requires multiple NUMA nodes");
     }
 
-    LLAMA_LOG_DEBUG("NUMA mirroring enabled - allocating %.2f MB on each of %d nodes using first-touch\n",
+    LLAMA_LOG_DEBUG("NUMA MIRRORING: NUMA mirroring enabled - allocating %.2f MB on each of %d nodes using first-touch\n",
                     file->size() / (1024.0 * 1024.0), num_nodes);
 
     size_t total_size = file->size();
     for (int node = 0; node < num_nodes; ++node) {
-        LLAMA_LOG_DEBUG("NUMA: Allocating on node %d using first-touch approach\n", node);
+        LLAMA_LOG_DEBUG("NUMA MIRRORING: Allocating on node %d using first-touch approach\n", node);
 
         void* node_mem = numa_alloc_first_touch(total_size, node);
         if (!node_mem) {
@@ -373,24 +373,24 @@ struct llama_mmap::impl {
         // VERIFICATION: Check that memory was actually allocated on the expected NUMA node
         int actual_node = -1;
         if (get_mempolicy(&actual_node, NULL, 0, node_mem, MPOL_F_NODE | MPOL_F_ADDR) == 0) {
-            LLAMA_LOG_DEBUG("NUMA: Memory at %p allocated on node %d (expected %d)\n",
+            LLAMA_LOG_DEBUG("NUMA MIRRORING: Memory at %p allocated on node %d (expected %d)\n",
                             node_mem, actual_node, node);
             if (actual_node != node) {
-                LLAMA_LOG_WARN("NUMA: WARNING: Memory allocated on wrong node! Expected %d, got %d\n",
+                LLAMA_LOG_WARN("NUMA MIRRORING: WARNING: Memory allocated on wrong node! Expected %d, got %d\n",
                                node, actual_node);
             } else {
-                LLAMA_LOG_DEBUG("NUMA: ✅ First-touch succeeded - memory correctly placed on node %d\n", node);
+                LLAMA_LOG_DEBUG("NUMA MIRRORING: First-touch succeeded - memory correctly placed on node %d\n", node);
             }
         } else {
-            LLAMA_LOG_WARN("NUMA: Could not verify allocation node for %p: %s\n",
+            LLAMA_LOG_WARN("NUMA MIRRORING: Could not verify allocation node for %p: %s\n",
                            node_mem, strerror(errno));
         }
 
         file->seek(0, SEEK_SET);
         file->read_raw(node_mem, total_size);
         numa_mappings.push_back({node_mem, total_size});
-
-        LLAMA_LOG_DEBUG("NUMA: Successfully allocated and populated %.2f MB on node %d at %p\n",
+
+        LLAMA_LOG_DEBUG("NUMA MIRRORING: Successfully allocated and populated %.2f MB on node %d at %p\n",
                         total_size / (1024.0 * 1024.0), node, node_mem);
     }
     addr = numa_mappings.empty() ? nullptr : numa_mappings[0].addr;
