experimental fixes for --threads and numa

dbsanfte · dbsanfte · commit 92593e72efa7 · 2025-07-31T11:42:53.000+01:00
diff --git a/common/arg.cpp b/common/arg.cpp
@@ -1386,6 +1386,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.cpuparams_batch.strict_cpu = value;
         }
     ));
+    add_opt(common_arg(
+        {"--no-hyperthreading"}, // plain flag: no value hint, so the handler takes only common_params
+        "disable hyperthreading/SMT for math operations (use only physical cores)",
+        [](common_params & params) {
+            params.cpuparams.use_hyperthreading = false;
+        }
+    ));
+    add_opt(common_arg(
+        {"--use-efficiency-cores"}, // plain flag: no value hint
+        "use efficiency cores (E-cores) for math operations (may degrade performance)",
+        [](common_params & params) {
+            params.cpuparams.use_efficiency_cores = true;
+        }
+    ));
+    add_opt(common_arg(
+        {"--cpu-topology"}, // plain flag: prints the report and terminates the process
+        "print detailed CPU topology information and exit",
+        [](common_params &) {
+            cpu_print_topology_info();
+            exit(0);
+        }
+    ));
     add_opt(common_arg(
         {"--prio-batch"}, "N",
         string_format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams_batch.priority),
diff --git a/common/common.cpp b/common/common.cpp
@@ -121,6 +121,8 @@ int32_t cpu_get_num_physical_cores() {
 
 #if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
 #include <pthread.h>
+#include <map>
+#include <set>
 
 static void cpuid(unsigned leaf, unsigned subleaf,
                   unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx) {
@@ -152,19 +154,115 @@ static bool is_running_on_efficiency_core(void) {
     return core_type == intel_atom;
 }
 
-static int cpu_count_math_cpus(int n_cpu) {
-    int result = 0;
-    for (int cpu = 0; cpu < n_cpu; ++cpu) {
-        if (pin_cpu(cpu)) {
-            return -1;
+// Snapshot of the CPU topology as gathered by detect_cpu_topology()
+struct cpu_topology_info {
+    int total_logical_cpus;   // sysconf(_SC_NPROCESSORS_ONLN) at detection time
+    int total_physical_cores; // number of sibling groups, i.e. core_siblings.size()
+    int performance_cores;    // performance_cpus.size(): counts logical CPUs, not physical cores
+    int efficiency_cores;     // efficiency_cpus.size(): counts logical CPUs, not physical cores
+    std::vector<std::vector<int>> core_siblings; // Logical CPU ids grouped by identical thread_siblings_list content
+    std::vector<int> performance_cpus;           // CPU IDs that did not report as efficiency cores when probed
+    std::vector<int> efficiency_cpus;            // CPU IDs that reported as efficiency cores when probed
+};
+
+// Probe every online logical CPU: group SMT siblings via sysfs and classify each CPU as P- or E-core.
+// Probing pins the calling thread to each CPU in turn; its affinity mask is restored before returning.
+static cpu_topology_info detect_cpu_topology() {
+    cpu_topology_info info = {};
+    info.total_logical_cpus = sysconf(_SC_NPROCESSORS_ONLN);
+    cpu_set_t saved_affinity;
+    const bool affinity_saved = pthread_getaffinity_np(pthread_self(), sizeof(saved_affinity), &saved_affinity) == 0;
+
+    std::map<std::string, std::vector<int>> sibling_groups; // thread_siblings_list text -> logical CPU ids
+    for (int cpu = 0; cpu < info.total_logical_cpus; ++cpu) {
+        std::ifstream siblings_file("/sys/devices/system/cpu/cpu" + std::to_string(cpu) + "/topology/thread_siblings_list");
+        if (siblings_file.is_open()) {
+            std::string siblings_str;
+            std::getline(siblings_file, siblings_str);
+            sibling_groups[siblings_str].push_back(cpu);
         }
-        if (is_running_on_efficiency_core()) {
-            continue; // efficiency cores harm lockstep threading
+
+        // Test if this CPU is a performance or efficiency core (only possible while running on it)
+        if (pin_cpu(cpu) == 0) {
+            if (is_running_on_efficiency_core()) {
+                info.efficiency_cpus.push_back(cpu);
+            } else {
+                info.performance_cpus.push_back(cpu);
+            }
         }
-        ++cpu; // hyperthreading isn't useful for linear algebra
-        ++result;
     }
-    return result;
+    // Undo the probing pins so the caller keeps its original CPU affinity
+    if (affinity_saved) {
+        pthread_setaffinity_np(pthread_self(), sizeof(saved_affinity), &saved_affinity);
+    }
+    for (const auto& group : sibling_groups) {
+        info.core_siblings.push_back(group.second); // one entry per distinct sibling list
+    }
+    info.total_physical_cores = static_cast<int>(info.core_siblings.size());
+    info.performance_cores = static_cast<int>(info.performance_cpus.size());
+    info.efficiency_cores = static_cast<int>(info.efficiency_cpus.size());
+    return info;
+}
+
+// Count the logical CPUs that should run math threads on a hybrid CPU.
+//   n_cpu                - unused; the CPU count comes from detect_cpu_topology()
+//   use_hyperthreading   - false: count at most one logical CPU per physical core
+//   use_efficiency_cores - true: count E-cores in addition to P-cores
+// Returns how many of the selected CPUs the calling thread could be pinned to.
+// The caller's affinity mask is captured before any probing and restored before returning.
+static int cpu_count_math_cpus(int n_cpu, bool use_hyperthreading = false, bool use_efficiency_cores = false) {
+    (void) n_cpu;
+
+    // Capture the affinity first: detect_cpu_topology() and the validation loop both re-pin this thread
+    cpu_set_t original_affinity;
+    const bool affinity_saved = pthread_getaffinity_np(pthread_self(), sizeof(original_affinity), &original_affinity) == 0;
+
+    const cpu_topology_info topo = detect_cpu_topology();
+
+    // First, select which types of cores to use: P-cores always, E-cores only on request
+    std::vector<int> candidate_cpus = topo.performance_cpus;
+    if (use_efficiency_cores) {
+        candidate_cpus.insert(candidate_cpus.end(), topo.efficiency_cpus.begin(), topo.efficiency_cpus.end());
+    }
+
+    std::vector<int> selected_cpus;
+    if (use_hyperthreading) {
+        // Use all candidate CPUs
+        selected_cpus = candidate_cpus;
+    } else {
+        // Select only one CPU per physical core. A core is identified by its index in
+        // topo.core_siblings, which is exact (a hash of the sibling list stored as int is not).
+        std::set<size_t> used_cores;
+        for (int cpu : candidate_cpus) {
+            // Find which core group this CPU belongs to
+            for (size_t core = 0; core < topo.core_siblings.size(); ++core) {
+                const std::vector<int> & core_group = topo.core_siblings[core];
+                if (std::find(core_group.begin(), core_group.end(), cpu) == core_group.end()) {
+                    continue;
+                }
+                // insert() reports whether the core was new, so only its first sibling is selected
+                if (used_cores.insert(core).second) {
+                    selected_cpus.push_back(cpu);
+                }
+                break;
+            }
+        }
+    }
+
+    // Validate selected CPUs by attempting to pin to them (a pin_cpu() result of 0 is treated as success)
+    int valid_count = 0;
+    for (int cpu : selected_cpus) {
+        if (pin_cpu(cpu) == 0) {
+            valid_count++;
+        }
+    }
+
+    // Restore original affinity, but only if it was captured successfully
+    if (affinity_saved) {
+        pthread_setaffinity_np(pthread_self(), sizeof(original_affinity), &original_affinity);
+    }
+
+    return valid_count;
 }
 
 #endif // __x86_64__ && __linux__
@@ -178,10 +276,40 @@ int32_t cpu_get_num_math() {
     if (n_cpu < 1) {
         return cpu_get_num_physical_cores();
     }
+    
+    if (is_hybrid_cpu()) {
+        cpu_set_t affinity;
+        if (!pthread_getaffinity_np(pthread_self(), sizeof(affinity), &affinity)) {
+            // Default behavior: use hyperthreading but not efficiency cores for math
+            // This can be overridden by environment variables or command-line options
+            bool use_hyperthreading = std::getenv("LLAMA_NO_HYPERTHREADING") == nullptr;
+            bool use_efficiency_cores = std::getenv("LLAMA_USE_EFFICIENCY_CORES") != nullptr;
+            
+            int result = cpu_count_math_cpus(n_cpu, use_hyperthreading, use_efficiency_cores);
+            pthread_setaffinity_np(pthread_self(), sizeof(affinity), &affinity);
+            if (result > 0) {
+                return result;
+            }
+        }
+    }
+#endif
+    return cpu_get_num_physical_cores();
+}
+
+/**
+ * Returns number of CPUs on system that are useful for math, respecting cpu_params.
+ */
+int32_t cpu_get_num_math_from_params(const cpu_params & params) {
+#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
+    int n_cpu = sysconf(_SC_NPROCESSORS_ONLN);
+    if (n_cpu < 1) {
+        return cpu_get_num_physical_cores();
+    }
+    
     if (is_hybrid_cpu()) {
         cpu_set_t affinity;
         if (!pthread_getaffinity_np(pthread_self(), sizeof(affinity), &affinity)) {
-            int result = cpu_count_math_cpus(n_cpu);
+            int result = cpu_count_math_cpus(n_cpu, params.use_hyperthreading, params.use_efficiency_cores);
             pthread_setaffinity_np(pthread_self(), sizeof(affinity), &affinity);
             if (result > 0) {
                 return result;
@@ -192,6 +320,62 @@ int32_t cpu_get_num_math() {
     return cpu_get_num_physical_cores();
 }
 
+/**
+ * Print CPU topology information for debugging (written to stdout).
+ * Hybrid x86-64 Linux CPUs get the full report: core counts, P-/E-core CPU ids, sibling groups and
+ * the thread counts cpu_count_math_cpus() yields per option combination; others get a short summary.
+ */
+void cpu_print_topology_info() {
+#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
+    if (!is_hybrid_cpu()) {
+        printf("CPU Topology: Non-hybrid CPU detected\n");
+        printf("  Physical cores: %d\n", cpu_get_num_physical_cores());
+        printf("  Logical CPUs: %d\n", (int)std::thread::hardware_concurrency());
+        return;
+    }
+
+    // Prints the ids as "a, b, c" and terminates the line
+    const auto print_id_list = [](const std::vector<int> & ids) {
+        for (size_t k = 0; k < ids.size(); ++k) {
+            if (k > 0) printf(", ");
+            printf("%d", ids[k]);
+        }
+        printf("\n");
+    };
+
+    cpu_topology_info topo = detect_cpu_topology();
+    printf("CPU Topology Information:\n");
+    printf("  Total logical CPUs: %d\n", topo.total_logical_cpus);
+    printf("  Total physical cores: %d\n", topo.total_physical_cores);
+    printf("  Performance cores: %d\n", topo.performance_cores);
+    printf("  Efficiency cores: %d\n", topo.efficiency_cores);
+
+    printf("  Performance CPU IDs: ");
+    print_id_list(topo.performance_cpus);
+
+    if (!topo.efficiency_cpus.empty()) {
+        printf("  Efficiency CPU IDs: ");
+        print_id_list(topo.efficiency_cpus);
+    }
+
+    printf("  Core sibling groups (hyperthreading):\n");
+    size_t core_index = 0;
+    for (const auto & siblings : topo.core_siblings) {
+        printf("    Core %zu: ", core_index++);
+        print_id_list(siblings);
+    }
+
+    // Show what would be selected with different options
+    printf("\n  Thread count recommendations:\n");
+    printf("    Default (P-cores + hyperthreading): %d\n", cpu_count_math_cpus(topo.total_logical_cpus, true, false));
+    printf("    Without hyperthreading: %d\n", cpu_count_math_cpus(topo.total_logical_cpus, false, false));
+    printf("    With E-cores (+ HT): %d\n", cpu_count_math_cpus(topo.total_logical_cpus, true, true));
+    printf("    With E-cores (no HT): %d\n", cpu_count_math_cpus(topo.total_logical_cpus, false, true));
+#else
+    printf("CPU topology detection not available on this platform\n");
+#endif
+}
+
 // Helper for setting process priority
 
 #if defined(_WIN32)
@@ -258,7 +442,7 @@ void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model)
         if (role_model != nullptr) {
             cpuparams = *role_model;
         } else {
-            cpuparams.n_threads = cpu_get_num_math();
+            cpuparams.n_threads = cpu_get_num_math_from_params(cpuparams);
         }
     }
 
diff --git a/common/common.h b/common/common.h
@@ -55,10 +55,14 @@ struct cpu_params {
     enum ggml_sched_priority  priority   = GGML_SCHED_PRIO_NORMAL;  // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
     bool     strict_cpu                  = false;   // Use strict CPU placement
     uint32_t poll                        = 50;      // Polling (busywait) level (0 - no polling, 100 - mostly polling)
+    bool     use_hyperthreading          = true;    // Use hyperthreading/SMT for math operations (enabled by default)
+    bool     use_efficiency_cores        = false;   // Use efficiency cores (E-cores) for math operations
 };
 
 int32_t cpu_get_num_physical_cores();
 int32_t cpu_get_num_math();
+int32_t cpu_get_num_math_from_params(const cpu_params & params);
+void cpu_print_topology_info();
 
 //
 // Common params
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -2853,7 +2853,15 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 #ifdef GGML_NUMA_MIRROR
     if (GGML_UNLIKELY(ggml_current_numa_node == -1)) {
         int thread_id = state->ith;
-
+        int n_threads = atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed);
+
+        // Distribute threads evenly across NUMA nodes first, then assign CPUs within each node
+        int num_numa_nodes = numa_num_configured_nodes();
+        if (num_numa_nodes <= 0) num_numa_nodes = 1;
+        
+        // Calculate which NUMA node this thread should use
+        int target_numa_node = thread_id % num_numa_nodes;
+        
         bool cpumask[GGML_MAX_N_THREADS];
         memset(cpumask, 0, sizeof(bool) * GGML_MAX_N_THREADS);
         for (int i = 0; i < GGML_MAX_N_THREADS; ++i) {
@@ -2863,17 +2871,34 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
         }
 
         int cpuid = -1;
-        bool local_mask[GGML_MAX_N_THREADS];
-        int iter = 0;
-        for (int j = 0; j < thread_id; ++j) {
-            ggml_thread_cpumask_next(cpumask, local_mask, true, &iter);
+        
+        // Give each thread its own CPU on the target NUMA node: thread k of that node takes the k-th allowed CPU
+        int node_rank = thread_id / num_numa_nodes; // rank among the threads mapped to target_numa_node
+        struct bitmask* node_cpus = numa_allocate_cpumask();
+        if (numa_node_to_cpus(target_numa_node, node_cpus) == 0) {
+            for (int i = 0; i < GGML_MAX_N_THREADS; ++i) {
+                if (cpumask[i] && numa_bitmask_isbitset(node_cpus, i) && node_rank-- == 0) { // skip CPUs of lower ranks
+                    cpuid = i;
+                    break;
+                }
+            }
         }
-        memset(local_mask, 0, sizeof(bool) * GGML_MAX_N_THREADS);
-        ggml_thread_cpumask_next(cpumask, local_mask, true, &iter);
-        for (int i = 0; i < GGML_MAX_N_THREADS; ++i) {
-            if (local_mask[i]) {
-                cpuid = i;
-                break;
+        numa_free_cpumask(node_cpus);
+        
+        // Fallback: if we couldn't find a CPU on the target node, use the original algorithm
+        if (cpuid == -1) {
+            bool local_mask[GGML_MAX_N_THREADS];
+            int iter = 0;
+            for (int j = 0; j < thread_id; ++j) {
+                ggml_thread_cpumask_next(cpumask, local_mask, true, &iter);
+            }
+            memset(local_mask, 0, sizeof(bool) * GGML_MAX_N_THREADS);
+            ggml_thread_cpumask_next(cpumask, local_mask, true, &iter);
+            for (int i = 0; i < GGML_MAX_N_THREADS; ++i) {
+                if (local_mask[i]) {
+                    cpuid = i;
+                    break;
+                }
             }
         }
 
@@ -2891,8 +2916,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
         struct bitmask* mask = numa_bitmask_alloc(numa_num_configured_nodes());
         numa_bitmask_setbit(mask, ggml_current_numa_node);
         numa_set_membind(mask);
+        numa_bitmask_free(mask);
 
-        GGML_LOG_INFO("thread_id = %02d, node = %d, cpuid = %02d\n", thread_id, ggml_current_numa_node, cpuid);
+        GGML_LOG_INFO("thread_id = %02d, target_node = %d, actual_node = %d, cpuid = %02d, n_threads = %d\n", 
+                     thread_id, target_numa_node, ggml_current_numa_node, cpuid, n_threads);
     }
 #endif // GGML_NUMA_MIRROR
 
diff --git a/src/llama-mmap.cpp b/src/llama-mmap.cpp