
Commit f57ea5f

Much better thread and numa node handling. New options: --cpu-no-hyperthreading, --cpu-no-efficiency-cores
1 parent fa72aa3 commit f57ea5f

File tree

8 files changed, +755 -102 lines changed


.gitignore

Lines changed: 1 addition & 0 deletions

@@ -146,3 +146,4 @@ poetry.toml
 # Local scripts
 /run-vim.sh
 /run-chat.sh
+Testing/Temporary/CTestCostData.txt

common/arg.cpp

Lines changed: 22 additions & 0 deletions

@@ -1386,6 +1386,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.cpuparams_batch.strict_cpu = value;
         }
     ));
+    add_opt(common_arg(
+        {"--cpu-no-hyperthreading"},
+        "disable hyperthreading/SMT for math operations (use only physical cores)",
+        [](common_params & params) {
+            params.cpuparams.use_hyperthreading = false;
+        }
+    ));
+    add_opt(common_arg(
+        {"--cpu-no-efficiency-cores"},
+        "disable efficiency cores (E-cores) for math operations (use only performance cores)",
+        [](common_params & params) {
+            params.cpuparams.use_efficiency_cores = false;
+        }
+    ));
+    add_opt(common_arg(
+        {"--cpu-topology"},
+        "print detailed CPU topology information and exit",
+        [](common_params & /*params*/) {
+            cpu_print_topology_info();
+            exit(0);
+        }
+    ));
     add_opt(common_arg(
         {"--prio-batch"}, "N",
         string_format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams_batch.priority),
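
The three options only flip fields on params.cpuparams (or, for --cpu-topology, print and exit). As a rough usage sketch — the binary name and model path are placeholders, not something this commit prescribes:

    ./llama-cli -m model.gguf --cpu-topology              # print detected P-cores, E-cores and SMT groups, then exit
    ./llama-cli -m model.gguf --cpu-no-hyperthreading     # one math thread per physical core
    ./llama-cli -m model.gguf --cpu-no-efficiency-cores   # math threads on performance cores only

The same defaults can also be overridden without flags via the LLAMA_CPU_NO_HYPERTHREADING and LLAMA_CPU_NO_EFFICIENCY_CORES environment variables checked in cpu_get_num_math() in the next file.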

common/common.cpp

Lines changed: 197 additions & 12 deletions

@@ -121,6 +121,8 @@ int32_t cpu_get_num_physical_cores() {
 
 #if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
 #include <pthread.h>
+#include <map>
+#include <set>
 
 static void cpuid(unsigned leaf, unsigned subleaf,
                   unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx) {
@@ -152,19 +154,116 @@ static bool is_running_on_efficiency_core(void) {
     return core_type == intel_atom;
 }
 
-static int cpu_count_math_cpus(int n_cpu) {
-    int result = 0;
-    for (int cpu = 0; cpu < n_cpu; ++cpu) {
-        if (pin_cpu(cpu)) {
-            return -1;
+// Structure to hold detailed CPU topology information
+struct cpu_topology_info {
+    int total_logical_cpus;
+    int total_physical_cores;
+    int performance_cores;
+    int efficiency_cores;
+    std::vector<std::vector<int>> core_siblings; // Groups of hyperthreaded CPUs
+    std::vector<int> performance_cpus;           // CPU IDs that are performance cores
+    std::vector<int> efficiency_cpus;            // CPU IDs that are efficiency cores
+};
+
+static cpu_topology_info detect_cpu_topology() {
+    cpu_topology_info info = {};
+    info.total_logical_cpus = sysconf(_SC_NPROCESSORS_ONLN);
+
+    // Map to group CPUs by their thread siblings
+    std::map<std::string, std::vector<int>> sibling_groups;
+
+    // Read topology information for each CPU
+    for (int cpu = 0; cpu < info.total_logical_cpus; ++cpu) {
+        // Read thread siblings to identify hyperthreading groups
+        std::ifstream siblings_file("/sys/devices/system/cpu/cpu" + std::to_string(cpu) + "/topology/thread_siblings_list");
+        if (siblings_file.is_open()) {
+            std::string siblings_str;
+            std::getline(siblings_file, siblings_str);
+            sibling_groups[siblings_str].push_back(cpu);
         }
-        if (is_running_on_efficiency_core()) {
-            continue; // efficiency cores harm lockstep threading
+
+        // Test if this CPU is a performance or efficiency core
+        if (pin_cpu(cpu) == 0) {
+            if (is_running_on_efficiency_core()) {
+                info.efficiency_cpus.push_back(cpu);
+            } else {
+                info.performance_cpus.push_back(cpu);
+            }
         }
-        ++cpu; // hyperthreading isn't useful for linear algebra
-        ++result;
     }
-    return result;
+
+    // Convert sibling groups to core_siblings vector
+    for (const auto & group : sibling_groups) {
+        info.core_siblings.push_back(group.second);
+    }
+
+    info.total_physical_cores = info.core_siblings.size();
+    info.performance_cores    = info.performance_cpus.size();
+    info.efficiency_cores     = info.efficiency_cpus.size();
+
+    return info;
+}
+
+static int cpu_count_math_cpus(int n_cpu, bool use_hyperthreading = false, bool use_efficiency_cores = false) {
+    GGML_UNUSED(n_cpu);
+    cpu_topology_info topo = detect_cpu_topology();
+
+    std::vector<int> selected_cpus;
+
+    // First, select which types of cores to use
+    std::vector<int> candidate_cpus;
+    if (!use_efficiency_cores) {
+        // Use only performance cores
+        candidate_cpus = topo.performance_cpus;
+    } else {
+        // Use all cores
+        candidate_cpus.reserve(topo.total_logical_cpus);
+        candidate_cpus.insert(candidate_cpus.end(), topo.performance_cpus.begin(), topo.performance_cpus.end());
+        candidate_cpus.insert(candidate_cpus.end(), topo.efficiency_cpus.begin(), topo.efficiency_cpus.end());
+    }
+
+    if (use_hyperthreading) {
+        // Use all candidate CPUs
+        selected_cpus = candidate_cpus;
+    } else {
+        // Select only one CPU per physical core
+        std::set<size_t> used_cores; // hashes of sibling groups already represented
+        for (int cpu : candidate_cpus) {
+            // Find which core group this CPU belongs to
+            for (const auto & core_group : topo.core_siblings) {
+                if (std::find(core_group.begin(), core_group.end(), cpu) != core_group.end()) {
+                    // Use a hash of the core group to identify unique cores
+                    std::string core_id;
+                    for (int sibling : core_group) {
+                        core_id += std::to_string(sibling) + ",";
+                    }
+                    size_t core_hash = std::hash<std::string>{}(core_id);
+
+                    if (used_cores.find(core_hash) == used_cores.end()) {
+                        selected_cpus.push_back(cpu);
+                        used_cores.insert(core_hash);
+                    }
+                    break;
+                }
+            }
+        }
+    }
+
+    // Validate selected CPUs by attempting to pin to them
+    int valid_count = 0;
+    cpu_set_t original_affinity;
+    pthread_getaffinity_np(pthread_self(), sizeof(original_affinity), &original_affinity);
+
+    for (int cpu : selected_cpus) {
+        if (pin_cpu(cpu) == 0) {
+            valid_count++;
+        }
+    }
+
+    // Restore original affinity
+    pthread_setaffinity_np(pthread_self(), sizeof(original_affinity), &original_affinity);
+
+    return valid_count;
 }
 
 #endif // __x86_64__ && __linux__
@@ -178,10 +277,40 @@ int32_t cpu_get_num_math() {
     if (n_cpu < 1) {
         return cpu_get_num_physical_cores();
     }
+
+    if (is_hybrid_cpu()) {
+        cpu_set_t affinity;
+        if (!pthread_getaffinity_np(pthread_self(), sizeof(affinity), &affinity)) {
+            // Default behavior: use hyperthreading and efficiency cores for math
+            // This can be overridden by environment variables or command-line options
+            bool use_hyperthreading = std::getenv("LLAMA_CPU_NO_HYPERTHREADING") == nullptr;
+            bool use_efficiency_cores = std::getenv("LLAMA_CPU_NO_EFFICIENCY_CORES") == nullptr;
+
+            int result = cpu_count_math_cpus(n_cpu, use_hyperthreading, use_efficiency_cores);
+            pthread_setaffinity_np(pthread_self(), sizeof(affinity), &affinity);
+            if (result > 0) {
+                return result;
+            }
+        }
+    }
+#endif
+    return cpu_get_num_physical_cores();
+}
+
+/**
+ * Returns number of CPUs on system that are useful for math, respecting cpu_params.
+ */
+int32_t cpu_get_num_math_from_params(const cpu_params & params) {
+#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
+    int n_cpu = sysconf(_SC_NPROCESSORS_ONLN);
+    if (n_cpu < 1) {
+        return cpu_get_num_physical_cores();
+    }
+
     if (is_hybrid_cpu()) {
         cpu_set_t affinity;
         if (!pthread_getaffinity_np(pthread_self(), sizeof(affinity), &affinity)) {
-            int result = cpu_count_math_cpus(n_cpu);
+            int result = cpu_count_math_cpus(n_cpu, params.use_hyperthreading, params.use_efficiency_cores);
             pthread_setaffinity_np(pthread_self(), sizeof(affinity), &affinity);
             if (result > 0) {
                 return result;
@@ -192,6 +321,62 @@ int32_t cpu_get_num_math() {
     return cpu_get_num_physical_cores();
 }
 
+/**
+ * Print CPU topology information for debugging
+ */
+void cpu_print_topology_info() {
+#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
+    if (is_hybrid_cpu()) {
+        cpu_topology_info topo = detect_cpu_topology();
+
+        printf("CPU Topology Information:\n");
+        printf("  Total logical CPUs: %d\n", topo.total_logical_cpus);
+        printf("  Total physical cores: %d\n", topo.total_physical_cores);
+        printf("  Performance cores: %d\n", topo.performance_cores);
+        printf("  Efficiency cores: %d\n", topo.efficiency_cores);
+
+        printf("  Performance CPU IDs: ");
+        for (size_t i = 0; i < topo.performance_cpus.size(); ++i) {
+            if (i > 0) printf(", ");
+            printf("%d", topo.performance_cpus[i]);
+        }
+        printf("\n");
+
+        if (!topo.efficiency_cpus.empty()) {
+            printf("  Efficiency CPU IDs: ");
+            for (size_t i = 0; i < topo.efficiency_cpus.size(); ++i) {
+                if (i > 0) printf(", ");
+                printf("%d", topo.efficiency_cpus[i]);
+            }
+            printf("\n");
+        }
+
+        printf("  Core sibling groups (hyperthreading):\n");
+        for (size_t i = 0; i < topo.core_siblings.size(); ++i) {
+            printf("    Core %zu: ", i);
+            for (size_t j = 0; j < topo.core_siblings[i].size(); ++j) {
+                if (j > 0) printf(", ");
+                printf("%d", topo.core_siblings[i][j]);
+            }
+            printf("\n");
+        }
+
+        // Show what would be selected with different options
+        printf("\n  Thread count recommendations:\n");
+        printf("    Default (P-cores + hyperthreading): %d\n", cpu_count_math_cpus(topo.total_logical_cpus, true, false));
+        printf("    Without hyperthreading: %d\n", cpu_count_math_cpus(topo.total_logical_cpus, false, false));
+        printf("    With E-cores (+ HT): %d\n", cpu_count_math_cpus(topo.total_logical_cpus, true, true));
+        printf("    With E-cores (no HT): %d\n", cpu_count_math_cpus(topo.total_logical_cpus, false, true));
+    } else {
+        printf("CPU Topology: Non-hybrid CPU detected\n");
+        printf("  Physical cores: %d\n", cpu_get_num_physical_cores());
+        printf("  Logical CPUs: %d\n", (int)std::thread::hardware_concurrency());
+    }
+#else
+    printf("CPU topology detection not available on this platform\n");
+#endif
+}
+
 // Helper for setting process priority
 
 #if defined(_WIN32)
@@ -258,7 +443,7 @@ void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model)
     if (role_model != nullptr) {
         cpuparams = *role_model;
     } else {
-        cpuparams.n_threads = cpu_get_num_math();
+        cpuparams.n_threads = cpu_get_num_math_from_params(cpuparams);
     }
 }
 
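To make the sysfs logic in detect_cpu_topology() concrete, here is a small standalone sketch of the same grouping idea; it is illustrative only (not code from this commit) and assumes a Linux system that exposes /sys/devices/system/cpu/cpuN/topology/thread_siblings_list:

    // Illustrative sketch: group logical CPUs into physical cores via sysfs,
    // mirroring what detect_cpu_topology() does in common/common.cpp.
    #include <cstdio>
    #include <fstream>
    #include <map>
    #include <string>
    #include <vector>
    #include <unistd.h>

    int main() {
        int n_cpu = sysconf(_SC_NPROCESSORS_ONLN);
        // CPUs that report the same thread_siblings_list share one physical core.
        std::map<std::string, std::vector<int>> sibling_groups;

        for (int cpu = 0; cpu < n_cpu; ++cpu) {
            std::ifstream f("/sys/devices/system/cpu/cpu" + std::to_string(cpu) + "/topology/thread_siblings_list");
            std::string siblings;
            if (f && std::getline(f, siblings)) {
                sibling_groups[siblings].push_back(cpu);
            }
        }

        printf("logical CPUs: %d, physical cores: %zu\n", n_cpu, sibling_groups.size());
        for (const auto & group : sibling_groups) {
            printf("  siblings [%s]:", group.first.c_str());
            for (int cpu : group.second) {
                printf(" %d", cpu);
            }
            printf("\n");
        }
        return 0;
    }

CPUs that report the same siblings string share a physical core, so the number of distinct groups is roughly the thread count --cpu-no-hyperthreading aims for (minus any E-cores when --cpu-no-efficiency-cores is also set).
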
common/common.h

Lines changed: 4 additions & 0 deletions

@@ -55,10 +55,14 @@ struct cpu_params {
     enum ggml_sched_priority priority = GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
     bool strict_cpu = false; // Use strict CPU placement
     uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
+    bool use_hyperthreading = true; // Use hyperthreading/SMT for math operations (enabled by default)
+    bool use_efficiency_cores = true; // Use efficiency cores (E-cores) for math operations (enabled by default)
 };
 
 int32_t cpu_get_num_physical_cores();
 int32_t cpu_get_num_math();
+int32_t cpu_get_num_math_from_params(const cpu_params & params);
+void cpu_print_topology_info();
 
 //
 // Common params
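
Both new fields default to true, so existing callers see no behaviour change until a flag or environment variable flips them. A hypothetical caller-side sketch (the helper name is made up; only the declarations added above are assumed):

    // Hypothetical helper: map the two new switches onto cpu_params and ask
    // for a thread count, using only the API declared in common/common.h above.
    #include "common.h"

    static int32_t resolve_math_threads(bool no_hyperthreading, bool no_efficiency_cores) {
        cpu_params params;                                  // other fields keep their defaults
        params.use_hyperthreading   = !no_hyperthreading;   // mirrors --cpu-no-hyperthreading
        params.use_efficiency_cores = !no_efficiency_cores; // mirrors --cpu-no-efficiency-cores
        return cpu_get_num_math_from_params(params);        // topology-aware on Linux/x86_64, physical-core count elsewhere
    }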

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 39 additions & 12 deletions

@@ -2853,7 +2853,15 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 #ifdef GGML_NUMA_MIRROR
     if (GGML_UNLIKELY(ggml_current_numa_node == -1)) {
         int thread_id = state->ith;
-
+        int n_threads = atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed);
+
+        // Distribute threads evenly across NUMA nodes first, then assign CPUs within each node
+        int num_numa_nodes = numa_num_configured_nodes();
+        if (num_numa_nodes <= 0) num_numa_nodes = 1;
+
+        // Calculate which NUMA node this thread should use
+        int target_numa_node = thread_id % num_numa_nodes;
+
         bool cpumask[GGML_MAX_N_THREADS];
         memset(cpumask, 0, sizeof(bool) * GGML_MAX_N_THREADS);
         for (int i = 0; i < GGML_MAX_N_THREADS; ++i) {
@@ -2863,17 +2871,34 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
         }
 
         int cpuid = -1;
-        bool local_mask[GGML_MAX_N_THREADS];
-        int iter = 0;
-        for (int j = 0; j < thread_id; ++j) {
-            ggml_thread_cpumask_next(cpumask, local_mask, true, &iter);
+
+        // Try to find a CPU on the target NUMA node
+        struct bitmask* node_cpus = numa_allocate_cpumask();
+        if (numa_node_to_cpus(target_numa_node, node_cpus) == 0) {
+            // Find the first available CPU on the target NUMA node that's also in our allowed set
+            for (int i = 0; i < GGML_MAX_N_THREADS; ++i) {
+                if (cpumask[i] && numa_bitmask_isbitset(node_cpus, i)) {
+                    cpuid = i;
+                    break;
+                }
+            }
         }
-        memset(local_mask, 0, sizeof(bool) * GGML_MAX_N_THREADS);
-        ggml_thread_cpumask_next(cpumask, local_mask, true, &iter);
-        for (int i = 0; i < GGML_MAX_N_THREADS; ++i) {
-            if (local_mask[i]) {
-                cpuid = i;
-                break;
+        numa_free_cpumask(node_cpus);
+
+        // Fallback: if we couldn't find a CPU on the target node, use the original algorithm
+        if (cpuid == -1) {
+            bool local_mask[GGML_MAX_N_THREADS];
+            int iter = 0;
+            for (int j = 0; j < thread_id; ++j) {
+                ggml_thread_cpumask_next(cpumask, local_mask, true, &iter);
+            }
+            memset(local_mask, 0, sizeof(bool) * GGML_MAX_N_THREADS);
+            ggml_thread_cpumask_next(cpumask, local_mask, true, &iter);
+            for (int i = 0; i < GGML_MAX_N_THREADS; ++i) {
+                if (local_mask[i]) {
+                    cpuid = i;
+                    break;
+                }
             }
         }
 
@@ -2891,8 +2916,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
         struct bitmask* mask = numa_bitmask_alloc(numa_num_configured_nodes());
         numa_bitmask_setbit(mask, ggml_current_numa_node);
         numa_set_membind(mask);
+        numa_bitmask_free(mask);
 
-        GGML_LOG_INFO("thread_id = %02d, node = %d, cpuid = %02d\n", thread_id, ggml_current_numa_node, cpuid);
+        GGML_LOG_INFO("thread_id = %02d, target_node = %d, actual_node = %d, cpuid = %02d, n_threads = %d\n",
+                      thread_id, target_numa_node, ggml_current_numa_node, cpuid, n_threads);
     }
 #endif // GGML_NUMA_MIRROR
 
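The placement rule in the ggml-cpu.c hunks reduces to: target node = thread_id % numa_num_configured_nodes(), then the first permitted CPU that libnuma reports for that node, with the old cpumask walk kept as a fallback. A standalone sketch of just that selection (illustrative, not code from this commit; assumes libnuma is installed and ignores the thread pool's cpumask; build with -lnuma):

    // Illustrative sketch of the round-robin NUMA placement used in the diff:
    // thread -> node by modulo, then the first CPU libnuma lists for that node.
    #include <cstdio>
    #include <numa.h>

    static int pick_cpu_for_thread(int thread_id) {
        int num_nodes = numa_num_configured_nodes();
        if (num_nodes <= 0) num_nodes = 1;
        int target_node = thread_id % num_nodes;       // spread threads across nodes first

        int cpuid = -1;
        struct bitmask * node_cpus = numa_allocate_cpumask();
        if (numa_node_to_cpus(target_node, node_cpus) == 0) {
            for (int i = 0; i < numa_num_configured_cpus(); ++i) {
                if (numa_bitmask_isbitset(node_cpus, i)) { // first CPU belonging to the target node
                    cpuid = i;
                    break;
                }
            }
        }
        numa_free_cpumask(node_cpus);
        return cpuid;                                  // -1: caller falls back to another strategy
    }

    int main() {
        if (numa_available() < 0) {
            printf("libnuma reports no NUMA support on this system\n");
            return 0;
        }
        for (int t = 0; t < 8; ++t) {
            printf("thread %d -> cpu %d\n", t, pick_cpu_for_thread(t));
        }
        return 0;
    }

The committed code additionally intersects the node's CPUs with the thread pool's allowed cpumask and, if that intersection is empty, reuses the previous ggml_thread_cpumask_next() walk.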