Commit dccea3c

NUMA mirroring implementation with inference performance boost
- Achieved 5% inference speed improvement (14.6 -> 15.3 t/s)
- Clean explicit NUMA setup during model loading
- Ultra-minimal hot path with thread-local NUMA node access
- Working NUMA mirrors for all model weights
- Performance: text generation improved, prompt processing needs optimization

Performance results (Qwen3-30B-A3B):
- Text generation: 14.6 -> 15.3 t/s (+5% improvement)
- Prompt processing: 176 -> 152 t/s (14% regression, needs investigation)

Technical implementation:
- tensor_data(): O(1) NUMA-aware access via thread-local ggml_current_numa_node
- tensor_set_data_with_numa_mirrors(): explicit NUMA setup for model weights
- NUMA coordinator: thread binding and memory locality
- Clean separation: model loading (explicit setup) vs inference (fast access)
1 parent aa0c461 commit dccea3c

68 files changed (+2618, -1151 lines)
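The changes below split the work into a one-time, explicit setup at model load and a lock-free read on the inference hot path. A rough sketch of the intended load-time flow, assuming libnuma: load_tensor_numa() and its arguments are hypothetical glue, while tensor_set_data_with_numa_mirrors(), tensor_data(), ggml_numa_node_count(), GGML_NUMA_MAX_NODES, and numa_alloc_onnode() are the pieces this commit introduces or relies on.

    // Hypothetical loader-side glue (not part of this commit's diffs):
    // copy each weight tensor onto every NUMA node once, then register the
    // mirrors; inference threads later read via tensor_data() at no extra cost.
    #include <numa.h>
    #include <string.h>

    static void load_tensor_numa(struct ggml_tensor * t, void * weights, size_t nbytes) {
        int n_nodes = ggml_numa_node_count();
        void * mirrors[GGML_NUMA_MAX_NODES] = { NULL };
        for (int node = 1; node < n_nodes && node < GGML_NUMA_MAX_NODES; node++) {
            mirrors[node] = numa_alloc_onnode(nbytes, node); // node-local allocation
            if (mirrors[node] != NULL) {
                memcpy(mirrors[node], weights, nbytes);      // duplicate the weights
            }
        }
        // node 0 keeps the primary copy; mirrors are registered exactly once.
        // A NULL mirror is safe: tensor_data() falls back to node 0.
        tensor_set_data_with_numa_mirrors(t, weights, mirrors, n_nodes);
    }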

common/arg.cpp

Lines changed: 2 additions & 0 deletions
@@ -2495,12 +2495,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         "- distribute: spread execution evenly over all nodes\n"
         "- isolate: only spawn threads on CPUs on the node that execution started on\n"
         "- numactl: use the CPU map provided by numactl\n"
+        "- mirror: enable NUMA-aware model mirroring\n"
         "if run without this previously, it is recommended to drop the system page cache before using this\n"
         "see https://github.com/ggml-org/llama.cpp/issues/1437",
         [](common_params & params, const std::string & value) {
             /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
             else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
             else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
+            else if (value == "mirror") { params.numa = GGML_NUMA_STRATEGY_MIRROR; }
             else { throw std::invalid_argument("invalid value"); }
         }
     ).set_env("LLAMA_ARG_NUMA"));
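With the two additions above, the new strategy is selected like the existing ones. An illustrative invocation (the binary name and model path are examples, not taken from this diff):

    llama-cli -m models/Qwen3-30B-A3B.gguf --numa mirror -p "Hello"
    # or via the environment variable registered by set_env above:
    LLAMA_ARG_NUMA=mirror llama-cli -m models/Qwen3-30B-A3B.gguf -p "Hello"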

common/common.cpp

Lines changed: 3 additions & 3 deletions
@@ -1489,7 +1489,7 @@ static common_control_vector_data common_control_vector_load_one(const common_co
     // extend if necessary - do not store data for layer 0 (it's not used)
     result.data.resize(std::max(result.data.size(), static_cast<size_t>(result.n_embd * layer_idx)), 0.0f);

-    const float * src = (const float *) tensor->data;
+    const float * src = (const float *) tensor_data(tensor);
     float * dst = result.data.data() + result.n_embd * (layer_idx - 1); // layer 1 at [0]
     for (int j = 0; j < result.n_embd; j++) {
         dst[j] += src[j] * load_info.strength; // allows multiple directions for same layer in same file

@@ -1548,8 +1548,8 @@ ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std
     ggml_opt_dataset_t result = ggml_opt_dataset_init(
         GGML_TYPE_I32, GGML_TYPE_I32, ne_datapoint, ne_datapoint, ndata, /*ndata_shard =*/ 1);

-    llama_token * data = (llama_token *) ggml_opt_dataset_data(result)->data;
-    llama_token * labels = (llama_token *) ggml_opt_dataset_labels(result)->data;
+    llama_token * data = (llama_token *) tensor_data(ggml_opt_dataset_data(result));
+    llama_token * labels = (llama_token *) tensor_data(ggml_opt_dataset_labels(result));

     for (int64_t idata = 0; idata < ndata; ++idata) {
         memcpy(data + idata*ne_datapoint, tokens.data() + idata*stride + 0, ne_datapoint*sizeof(llama_token));

examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp

Lines changed: 2 additions & 2 deletions
@@ -408,12 +408,12 @@ static void init_model(struct my_llama_model * model) {
 }

 static float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
-    float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
+    float * ptr = (float *) ((char *) tensor_data(tensor) + i0*tensor->nb[0] + i1*tensor->nb[1]);
     return *ptr;
 }

 static int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
-    int32_t * ptr = (int32_t *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
+    int32_t * ptr = (int32_t *) ((char *) tensor_data(tensor) + i0*tensor->nb[0] + i1*tensor->nb[1]);
     return *ptr;
 }

examples/eval-callback/eval-callback.cpp

Lines changed: 1 addition & 1 deletion
@@ -153,7 +153,7 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
     }

     if (!ggml_is_quantized(t->type)) {
-        uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
+        uint8_t * data = is_host ? (uint8_t *) tensor_data(t) : cb_data->data.data();
         ggml_print_tensor(data, t->type, t->ne, t->nb, 3);
     }

examples/gguf-hash/gguf-hash.cpp

Lines changed: 1 addition & 1 deletion
@@ -336,7 +336,7 @@ static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
     const char * name = gguf_get_tensor_name(ctx, i);
     struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
     auto n_bytes = ggml_nbytes(cur);
-    auto *raw_data = cur->data;
+    auto *raw_data = tensor_data(cur);
     const std::string tensor_layer_name = fname + ":" + name;

     if (hash_params.xxh64) {

examples/gguf/gguf.cpp

Lines changed: 4 additions & 4 deletions
@@ -63,7 +63,7 @@ static bool gguf_ex_write(const std::string & fname) {
     ggml_set_name(cur, name.c_str());

     {
-        float * data = (float *) cur->data;
+        float * data = (float *) tensor_data(cur);
         for (int j = 0; j < ggml_nelements(cur); ++j) {
             data[j] = 100 + i;
         }

@@ -201,10 +201,10 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) {
     struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);

     printf("%s: tensor[%d]: n_dims = %d, ne = (%d, %d, %d, %d), name = %s, data = %p\n",
-           __func__, i, ggml_n_dims(cur), int(cur->ne[0]), int(cur->ne[1]), int(cur->ne[2]), int(cur->ne[3]), cur->name, cur->data);
+           __func__, i, ggml_n_dims(cur), int(cur->ne[0]), int(cur->ne[1]), int(cur->ne[2]), int(cur->ne[3]), cur->name, tensor_data(cur));

     // print first 10 elements
-    const float * data = (const float *) cur->data;
+    const float * data = (const float *) tensor_data(cur);

     printf("%s data[:10] : ", name);
     for (int j = 0; j < MIN(10, ggml_nelements(cur)); ++j) {

@@ -214,7 +214,7 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) {

     // check data
     if (check_data) {
-        const float * data = (const float *) cur->data;
+        const float * data = (const float *) tensor_data(cur);
         for (int j = 0; j < ggml_nelements(cur); ++j) {
             if (data[j] != 100 + i) {
                 fprintf(stderr, "%s: tensor[%d], data[%d]: found %f, expected %f\n", __func__, i, j, data[j], float(100 + i));

fix_tensor_data.py

Lines changed: 64 additions & 0 deletions
@@ -0,0 +1,64 @@
+#!/usr/bin/env python3
+
+import re
+import sys
+import os
+
+def fix_tensor_data_in_file(filepath):
+    """Fix tensor->data references in a file"""
+    try:
+        with open(filepath, 'r') as f:
+            content = f.read()
+
+        original_content = content
+
+        # Fix simple data access patterns (but not assignments)
+        # Pattern: something->data (but not = something->data)
+        content = re.sub(r'(\w+)->data(?!\s*=)', r'tensor_data(\1)', content)
+
+        # Fix assignments: tensor->data = value -> tensor_set_data(tensor, value)
+        content = re.sub(r'(\w+)->data\s*=\s*([^;]+);', r'tensor_set_data(\1, \2);', content)
+
+        # Fix GGML_ASSERT patterns
+        content = re.sub(r'GGML_ASSERT\(tensor_data\(([^)]+)\)\s*!=\s*NULL', r'GGML_ASSERT(tensor_data(\1) != NULL', content)
+        content = re.sub(r'GGML_ASSERT\(tensor_data\(([^)]+)\)\s*==\s*NULL', r'GGML_ASSERT(tensor_data(\1) == NULL', content)
+        content = re.sub(r'GGML_ASSERT\(tensor_data\(([^)]+)\)', r'GGML_ASSERT(tensor_data(\1)', content)
+
+        # Fix memcpy patterns
+        content = re.sub(r'memcpy\(tensor_data\(([^)]+)\),', r'memcpy(tensor_data(\1),', content)
+        content = re.sub(r'memcpy\(([^,]+),\s*tensor_data\(([^)]+)\),', r'memcpy(\1, tensor_data(\2),', content)
+
+        if content != original_content:
+            with open(filepath, 'w') as f:
+                f.write(content)
+            print(f"Fixed: {filepath}")
+            return True
+        else:
+            print(f"No changes: {filepath}")
+            return False
+
+    except Exception as e:
+        print(f"Error processing {filepath}: {e}")
+        return False
+
+def main():
+    if len(sys.argv) != 2:
+        print("Usage: python fix_tensor_data.py <file_or_directory>")
+        sys.exit(1)
+
+    target = sys.argv[1]
+
+    if os.path.isfile(target):
+        fix_tensor_data_in_file(target)
+    elif os.path.isdir(target):
+        for root, dirs, files in os.walk(target):
+            for file in files:
+                if file.endswith(('.c', '.cpp', '.h', '.hpp')):
+                    filepath = os.path.join(root, file)
+                    fix_tensor_data_in_file(filepath)
+    else:
+        print(f"Error: {target} is not a valid file or directory")
+        sys.exit(1)
+
+if __name__ == "__main__":
+    main()
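Usage follows the script's own help text (the paths here are illustrative). Note that the first substitution rewrites any identifier followed by ->data, tensor or not, so reviewing the resulting diff before committing is advisable; the conservative variant below exists for exactly that reason.

    python fix_tensor_data.py ggml/src/ggml.c   # rewrite a single file in place
    python fix_tensor_data.py src               # or walk a tree of .c/.cpp/.h/.hpp files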

fix_tensor_data_conservative.py

Lines changed: 74 additions & 0 deletions
@@ -0,0 +1,74 @@
+#!/usr/bin/env python3
+
+import re
+import sys
+import os
+
+def fix_tensor_data_in_file(filepath):
+    """Fix tensor->data references in a file, but only for actual tensor variables"""
+    try:
+        with open(filepath, 'r') as f:
+            content = f.read()
+
+        original_content = content
+
+        # More conservative approach - only fix patterns where we're confident it's a tensor
+        # Look for common tensor variable names and patterns
+
+        # Fix: tensor->data -> tensor_data(tensor)
+        content = re.sub(r'\btensor->data\b(?!\s*=)', r'tensor_data(tensor)', content)
+        content = re.sub(r'\bsrc->data\b(?!\s*=)', r'tensor_data(src)', content)
+        content = re.sub(r'\bdst->data\b(?!\s*=)', r'tensor_data(dst)', content)
+        content = re.sub(r'\bsrc0->data\b(?!\s*=)', r'tensor_data(src0)', content)
+        content = re.sub(r'\bsrc1->data\b(?!\s*=)', r'tensor_data(src1)', content)
+        content = re.sub(r'\bnode->data\b(?!\s*=)', r'tensor_data(node)', content)
+        content = re.sub(r'\bt->data\b(?!\s*=)', r'tensor_data(t)', content)
+        content = re.sub(r'\bleaf->data\b(?!\s*=)', r'tensor_data(leaf)', content)
+        content = re.sub(r'\bview_src->data\b(?!\s*=)', r'tensor_data(view_src)', content)
+        content = re.sub(r'\bgrad_acc->data\b(?!\s*=)', r'tensor_data(grad_acc)', content)
+        content = re.sub(r'\binput->data\b(?!\s*=)', r'tensor_data(input)', content)
+        content = re.sub(r'\bparent->data\b(?!\s*=)', r'tensor_data(parent)', content)
+        content = re.sub(r'\bids->data\b(?!\s*=)', r'tensor_data(ids)', content)
+
+        # Fix assignments: tensor->data = value -> tensor_set_data(tensor, value)
+        content = re.sub(r'\btensor->data\s*=\s*([^;]+);', r'tensor_set_data(tensor, \1);', content)
+        content = re.sub(r'\bsrc->data\s*=\s*([^;]+);', r'tensor_set_data(src, \1);', content)
+        content = re.sub(r'\bdst->data\s*=\s*([^;]+);', r'tensor_set_data(dst, \1);', content)
+        content = re.sub(r'\bnode->data\s*=\s*([^;]+);', r'tensor_set_data(node, \1);', content)
+        content = re.sub(r'\bt->data\s*=\s*([^;]+);', r'tensor_set_data(t, \1);', content)
+        content = re.sub(r'\bnew_tensor->data\s*=\s*([^;]+);', r'tensor_set_data(new_tensor, \1);', content)
+
+        if content != original_content:
+            with open(filepath, 'w') as f:
+                f.write(content)
+            print(f"Fixed: {filepath}")
+            return True
+        else:
+            print(f"No changes: {filepath}")
+            return False
+
+    except Exception as e:
+        print(f"Error processing {filepath}: {e}")
+        return False
+
+def main():
+    if len(sys.argv) != 2:
+        print("Usage: python fix_tensor_data.py <file_or_directory>")
+        sys.exit(1)
+
+    target = sys.argv[1]
+
+    if os.path.isfile(target):
+        fix_tensor_data_in_file(target)
+    elif os.path.isdir(target):
+        for root, dirs, files in os.walk(target):
+            for file in files:
+                if file.endswith(('.c', '.cpp', '.h', '.hpp')):
+                    filepath = os.path.join(root, file)
+                    fix_tensor_data_in_file(filepath)
+    else:
+        print(f"Error: {target} is not a valid file or directory")
+        sys.exit(1)
+
+if __name__ == "__main__":
+    main()
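The difference between the two scripts is easiest to see on a non-tensor field. A small self-contained demonstration (the input line is hypothetical):

    import re

    line = "memcpy(out, buf->data, n);"  # buf is not a ggml tensor here

    # broad pattern from fix_tensor_data.py: rewrites ANY identifier->data read
    print(re.sub(r'(\w+)->data(?!\s*=)', r'tensor_data(\1)', line))
    # -> memcpy(out, tensor_data(buf), n);   (over-eager)

    # conservative pattern above: only whitelisted tensor names are touched
    print(re.sub(r'\btensor->data\b(?!\s*=)', r'tensor_data(tensor)', line))
    # -> memcpy(out, buf->data, n);          (left alone)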

ggml/CMakeLists.txt

Lines changed: 31 additions & 0 deletions
@@ -198,6 +198,8 @@ set (GGML_METAL_MACOSX_VERSION_MIN "" CACHE STRING
     "ggml: metal minimum macOS version")
 set (GGML_METAL_STD "" CACHE STRING "ggml: metal standard version (-std flag)")
 option(GGML_OPENMP "ggml: use OpenMP" ON)
+option(GGML_NUMA_MIRROR "ggml: support numa aware tensor data" OFF)
+option(GGML_NUMA "ggml: support numa aware tensor data (synonym for GGML_NUMA_MIRROR)" OFF)
 option(GGML_RPC "ggml: use RPC" OFF)
 option(GGML_SYCL "ggml: use SYCL" OFF)
 option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF)

@@ -378,6 +380,35 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ggml-config.cmake
     ${CMAKE_CURRENT_BINARY_DIR}/ggml-version.cmake
     DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/ggml)

+# Make GGML_NUMA and GGML_NUMA_MIRROR synonyms
+if (GGML_NUMA AND NOT GGML_NUMA_MIRROR)
+    set(GGML_NUMA_MIRROR ON)
+endif()
+if (GGML_NUMA_MIRROR AND NOT GGML_NUMA)
+    set(GGML_NUMA ON)
+endif()
+
+if (GGML_NUMA_MIRROR)
+    find_library(NUMA_LIBRARY NAMES numa)
+    if (NOT NUMA_LIBRARY)
+        message(FATAL_ERROR "libnuma is not found")
+    endif()
+    message(STATUS "libnuma: ${NUMA_LIBRARY}")
+
+    message(STATUS
+        "-----------------\n"
+        "Enabling GGML_NUMA_MIRROR (GGML_NUMA compatibility enabled)\n"
+        "Uses numa_alloc_onnode() for reliable NUMA-aware memory allocation")
+    message(STATUS
+        "-----------------")
+
+    foreach(lib "ggml" "ggml-base")
+        target_compile_definitions(${lib} PUBLIC GGML_NUMA_MIRROR)
+        target_compile_definitions(${lib} PUBLIC GGML_NUMA)
+        target_link_libraries(${lib} PUBLIC ${NUMA_LIBRARY})
+    endforeach()
+endif()
+
 if (MSVC)
     set(MSVC_WARNING_FLAGS
         /wd4005 # Macro redefinition
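A typical configure invocation for the new options (libnuma and its development headers must be installed, otherwise the find_library() check above fails fatally; out-of-tree build assumed):

    cmake -B build -DGGML_NUMA_MIRROR=ON   # -DGGML_NUMA=ON is a synonym
    cmake --build build --config Release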

ggml/include/ggml.h

Lines changed: 79 additions & 0 deletions
@@ -221,6 +221,13 @@
 #define GGML_MAX_N_THREADS 512
 #define GGML_MAX_OP_PARAMS 64

+#ifdef GGML_NUMA_MIRROR
+// maximum number of NUMA nodes for tensor data mirroring
+#define GGML_NUMA_MAX_NODES 8
+#include <numaif.h>
+#include <string.h>
+#endif
+
 #ifndef GGML_MAX_NAME
 #    define GGML_MAX_NAME 64
 #endif

@@ -645,17 +652,86 @@ extern "C" {
         struct ggml_tensor * view_src;
         size_t view_offs;

+#ifdef GGML_NUMA_MIRROR
+        union {
+#ifdef __NVCC__
+            void * data;
+#endif
+            void * __data[GGML_NUMA_MAX_NODES];
+        };
+#else
         void * data;
+#endif

         char name[GGML_MAX_NAME];

         void * extra; // extra things e.g. for ggml-cuda.cu

+#ifdef GGML_NUMA_MIRROR
+        char padding[12]; // Adjusted for expanded __data array
+#else
         char padding[8];
+#endif
     };

     static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);

+    // Tensor data accessor functions for NUMA compatibility
+
+#ifdef GGML_NUMA_MIRROR
+    // External thread-local variable set by NUMA coordinator
+    extern __thread int ggml_current_numa_node;
+
+    static inline void * tensor_data(const struct ggml_tensor * tensor) {
+        int numa_node = ggml_current_numa_node;
+
+        if (numa_node >= 0 && numa_node < GGML_NUMA_MAX_NODES
+            && tensor->__data[numa_node] != NULL) {
+            return tensor->__data[numa_node];
+        }
+
+        return tensor->__data[0];
+    }
+
+    static inline void tensor_set_data(struct ggml_tensor * tensor, void * data) {
+        tensor->__data[0] = data;
+    }
+
+#ifdef GGML_NUMA_MIRROR
+    // Model loading specific function - bypasses normal tensor_set_data logic
+    static inline void tensor_set_data_with_numa_mirrors(struct ggml_tensor * tensor,
+                                                         void * primary_data,
+                                                         void ** numa_node_data,
+                                                         int numa_node_count) {
+        // Set primary data (node 0)
+        tensor->__data[0] = primary_data;
+
+        // Set NUMA mirrors for other nodes
+        for (int node = 1; node < numa_node_count && node < GGML_NUMA_MAX_NODES; node++) {
+            tensor->__data[node] = numa_node_data[node];
+        }
+
+        // Clear remaining slots
+        for (int node = numa_node_count; node < GGML_NUMA_MAX_NODES; node++) {
+            tensor->__data[node] = NULL;
+        }
+
+#ifdef GGML_NUMA_DEBUG_VERBOSE
+        printf("✅ NUMA SETUP COMPLETE: %s with %d mirrors\n", tensor->name, numa_node_count - 1);
+        fflush(stdout);
+#endif
+    }
+#endif
+#else
+    static inline void * tensor_data(const struct ggml_tensor * tensor) {
+        return tensor->data;
+    }
+
+    static inline void tensor_set_data(struct ggml_tensor * tensor, void * data) {
+        tensor->data = data;
+    }
+#endif
+
     // Abort callback
     // If not NULL, called before ggml computation
     // If it returns true, the computation is aborted

@@ -2541,6 +2617,9 @@ extern "C" {
     GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads);
     GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);

+    // NUMA functions
+    GGML_API int ggml_numa_node_count(void);
+
 #ifdef __cplusplus
 }
 #endif
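The coordinator that sets ggml_current_numa_node is not among the hunks shown here. A sketch of how a worker thread might bind itself to a node and publish it thread-locally, assuming libnuma on Linux (bind_worker_to_node is a hypothetical name):

    #include <numa.h>

    extern __thread int ggml_current_numa_node; // declared in ggml.h above

    // Pin the calling thread to one node's CPUs, then record the node
    // thread-locally so tensor_data() resolves to the node-local mirror.
    static void bind_worker_to_node(int node) {
        struct bitmask * cpus = numa_allocate_cpumask();
        if (numa_node_to_cpus(node, cpus) == 0) {
            numa_sched_setaffinity(0, cpus); // pid 0 = calling thread
        }
        numa_free_cpumask(cpus);
        ggml_current_numa_node = node;
    }

Each thread-pool worker would run something like this once at startup, for example assigned round-robin across ggml_numa_node_count() nodes; after that, every tensor_data() call on that thread reads the local mirror.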
