Commit c665d3c

1) fix CPU detection of physical cores 2) fix tensor_data() access in CUDA
1 parent 435f095 commit c665d3c

File tree

15 files changed, +346 −769 lines changed

.github/instructions/numa-mirroring-implementation.md

Lines changed: 28 additions & 12 deletions
@@ -48,7 +48,6 @@ Clean integration point during model loading where NUMA mirrors are established
 #### `ggml/include/ggml.h`
 **Purpose**: Core tensor data access with NUMA-aware routing
 **Key additions**:
-- `#ifdef GGML_NUMA_MIRROR` conditional compilation blocks
 - NUMA mirror data structures in `ggml_tensor`
 - `tensor_set_data_with_numa_mirrors()` function declaration
 - Optimized `tensor_data()` function with fast path for non-NUMA tensors
@@ -94,9 +93,14 @@ Clean integration point during model loading where NUMA mirrors are established
 ## Build Configuration
 
 ### CMake Configuration
-Enable NUMA mirroring during build:
+Enable OpenMP during build:
 ```bash
-cmake -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NUMA_MIRROR=ON -DCMAKE_C_FLAGS="-march=native" -DCMAKE_CXX_FLAGS="-march=native"
+# Debug config (for debugging, obviously)
+cmake -B build -DCMAKE_BUILD_TYPE=Debug -DGGML_OPENMP=ON
+
+# Release config (for performance testing)
+cmake -B build -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_FLAGS="-march=native" -DCMAKE_CXX_FLAGS="-march=native" -DGGML_OPENMP=ON
+
 cmake --build build --parallel
 ```
 
@@ -106,7 +110,6 @@ cmake --build build --parallel
 - **C++17 compiler**: Modern C++ standard support
 
 ### Compilation Flags
-- `GGML_NUMA_MIRROR=ON`: Enables NUMA mirroring functionality
 - `-march=native`: CPU-specific optimizations (recommended for maximum performance)
 - `CMAKE_BUILD_TYPE=Release`: Optimized release build
 
@@ -127,17 +130,17 @@ cmake --build build --parallel
 ## Implementation Details
 
 ### Tensor Data Access Optimization
+The `ggml_tensor` struct in `ggml.h` no longer has a `data` field. It has been renamed to a `__data[]` array that holds pointers to multiple memory locations, with the index corresponding to the index of a local NUMA node.
+
+Instead of directly addressing `tensor->data`, you now use `tensor_data(tensor)`; setting is done with `tensor_set_data()`. These are two new macros in `ggml.h`.
+
 The `tensor_data()` function in `ggml.h` has been optimized with a fast path:
 ```c
 static inline void * tensor_data(const struct ggml_tensor * tensor) {
-#ifdef GGML_NUMA_MIRROR
     if (tensor->numa_mirror_data == NULL) {
        return tensor->data; // Fast path: no NUMA mirrors
     }
     return ggml_numa_get_tensor_data(tensor); // NUMA-aware routing
-#else
-    return tensor->data;
-#endif
 }
 ```
 
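(Editor's aside, not part of the commit: a minimal sketch of how a call site uses the accessors described above. `tensor_data(tensor)` appears verbatim in the diff; the exact `tensor_set_data()` signature is an assumption based on the surrounding text.)

```c
#include "ggml.h"

// Illustrative only: read through the NUMA-aware accessor instead of tensor->data.
static float read_first_f32(const struct ggml_tensor * t) {
    const float * src = (const float *) tensor_data(t); // routes to the node-local mirror when present
    return src[0];
}

// Assumed setter shape: rebind the tensor to a caller-provided buffer.
static void rebind_tensor(struct ggml_tensor * t, void * buf) {
    tensor_set_data(t, buf);
}
```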
@@ -163,6 +166,19 @@ Use `llama-bench` to measure NUMA benefits:
 ./llama-bench -m model.gguf --numa mirror
 ```
 
+There are models you can use for testing in our .devcontainer folder:
+
+.devcontainer/DeepSeek-R1-0528-UD-IQ3_XXS.gguf
+.devcontainer/gpt-oss-20b-UD-Q4_K_XL.gguf
+.devcontainer/qwen2.5-0.5b-instruct-q8_0.gguf
+.devcontainer/Qwen3-30B-A3B-UD-Q4_K_XL.gguf
+.devcontainer/Qwen3-32B-Q6_K.gguf
+
+Use qwen2.5-0.5b-instruct-q8_0.gguf for a quick verification run, while a bigger, dense model like Qwen3-32B-Q6_K.gguf will be good to test relative speed gains.
+
+If testing with `llama-cli`, always be sure to use the `--no-cnv` switch to prevent it from starting an interactive conversation.
+
+
 ### System Requirements Check
 Verify NUMA topology:
 ```bash
@@ -175,6 +191,7 @@ numactl --hardware
 Future versions may include:
 - Selective tensor mirroring policies
 - Custom NUMA node mapping
+- Limiting GGML threadpools to non-hyperthreaded cores
 
 ## Technical Notes
 
@@ -202,10 +219,9 @@ Future versions may include:
 
 ### Verification
 Confirm NUMA mirroring is working:
-1. Build with `GGML_NUMA_MIRROR=ON`
-2. Run `numactl --hardware` to verify multiple NUMA nodes
-3. Test with `GGML_NUMA_DEBUG=1` for debug output
-4. Compare performance with and without `--numa mirror`
+1. Run `numactl --hardware` to verify multiple NUMA nodes
+2. Test with `--verbose` for debug output
+3. Compare performance with and without `--numa mirror`
 
 ## Conclusion
 
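(Editor's aside, not part of the commit: the testing workflow described in the documentation above, condensed into commands. The binary paths assume the default CMake output directory; prompt and token count are illustrative.)

```bash
# Quick functional check with the small model
./build/bin/llama-cli -m .devcontainer/qwen2.5-0.5b-instruct-q8_0.gguf \
    -p "Hello" -n 32 --no-cnv --numa mirror

# Relative speed comparison on a larger dense model, with and without mirroring
./build/bin/llama-bench -m .devcontainer/Qwen3-32B-Q6_K.gguf --numa mirror
./build/bin/llama-bench -m .devcontainer/Qwen3-32B-Q6_K.gguf
```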
.gitignore

Lines changed: 6 additions & 0 deletions
@@ -148,3 +148,9 @@ poetry.toml
 /run-vim.sh
 /run-chat.sh
 .ccache/
+.devcontainer/devcontainer.json
+.devcontainer/Dockerfile
+.devcontainer/launch.json
+.devcontainer/README.md
+.devcontainer/tasks.json
+.devcontainer/zscaler.crt

common/arg.cpp

Lines changed: 10 additions & 0 deletions
@@ -1517,6 +1517,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.cpuparams.strict_cpu = std::stoul(value);
         }
     ));
+    add_opt(common_arg(
+        {"--cpu-use-hyperthreading"},
+        "use both physical CPU cores and their hyperthread siblings (default: physical cores only)",
+        [](common_params & params) {
+            params.cpuparams.mask_valid = true;
+            if (!cpu_mask_set_physical_cores_with_hyperthreading(params.cpuparams.cpumask)) {
+                LOG_WRN("Failed to detect CPU topology, using all available CPUs\n");
+            }
+        }
+    ));
     add_opt(common_arg(
         {"--prio"}, "N",
         string_format("set process/thread priority : low(-1), normal(0), medium(1), high(2), realtime(3) (default: %d)\n", params.cpuparams.priority),
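(Editor's aside, not part of the commit: a hypothetical invocation of the new option; the model path is a placeholder. Per the help text above, the default CPU mask covers physical cores only and this flag adds the hyperthread siblings.)

```bash
# Default: thread count and affinity derived from physical cores only
./build/bin/llama-cli -m model.gguf -p "test" -n 16 --no-cnv

# Opt in to hyperthread siblings as well
./build/bin/llama-cli -m model.gguf -p "test" -n 16 --no-cnv --cpu-use-hyperthreading
```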

common/common.cpp

Lines changed: 220 additions & 1 deletion
@@ -22,6 +22,7 @@
 #include <iostream>
 #include <iterator>
 #include <regex>
+#include <set>
 #include <sstream>
 #include <string>
 #include <thread>
@@ -116,10 +117,92 @@ int32_t cpu_get_num_physical_cores() {
 
     return num_physical_cores > 0 ? num_physical_cores : default_threads;
 #endif
+    // Try to use accurate topology detection first
+    int32_t topology_cores = cpu_detect_physical_cores_topology();
+    if (topology_cores > 0) {
+        return topology_cores;
+    }
+
+    // Fallback to heuristic if topology detection failed
     unsigned int n_threads = std::thread::hardware_concurrency();
     return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
 }
 
+int32_t cpu_detect_physical_cores_topology() {
+    std::vector<int> physical_cores;
+    if (cpu_get_physical_cores_topology(physical_cores)) {
+        return static_cast<int32_t>(physical_cores.size());
+    }
+    return 0; // Indicate detection failed
+}
+
+bool cpu_get_physical_cores_topology(std::vector<int> & physical_cores) {
+    physical_cores.clear();
+
+#if defined(__linux__) && !defined(__ANDROID__)
+    // Use Linux sysfs topology detection for accurate physical core detection
+    int num_cpus = std::thread::hardware_concurrency();
+    if (num_cpus <= 0) {
+        return false;
+    }
+
+    std::set<int> processed_cpus;
+
+    for (int cpu = 0; cpu < num_cpus; cpu++) {
+        // Skip if we've already processed this CPU as part of another core's siblings
+        if (processed_cpus.count(cpu) > 0) {
+            continue;
+        }
+
+        std::string thread_siblings_path = "/sys/devices/system/cpu/cpu" + std::to_string(cpu) + "/topology/thread_siblings_list";
+        std::ifstream siblings_file(thread_siblings_path);
+
+        if (!siblings_file.is_open()) {
+            // If we can't read topology for this CPU, skip it but don't mark as physical
+            continue;
+        }
+
+        std::string siblings_str;
+        if (std::getline(siblings_file, siblings_str)) {
+            // Parse the comma-separated list of sibling threads
+            std::vector<int> siblings;
+            std::stringstream ss(siblings_str);
+            std::string cpu_str;
+
+            while (std::getline(ss, cpu_str, ',')) {
+                try {
+                    int sibling_cpu = std::stoi(cpu_str);
+                    siblings.push_back(sibling_cpu);
+                } catch (const std::exception &) {
+                    // Skip invalid entries
+                }
+            }
+
+            if (!siblings.empty()) {
+                // Sort siblings to ensure we always pick the lowest-numbered one as primary
+                std::sort(siblings.begin(), siblings.end());
+                int primary_cpu = siblings[0];
+
+                // Only count this as a physical core if it's the current CPU (the lowest-numbered sibling)
+                if (primary_cpu == cpu) {
+                    physical_cores.push_back(primary_cpu);
+                }
+
+                // Mark all siblings as processed so we don't consider them again
+                for (int sibling : siblings) {
+                    processed_cpus.insert(sibling);
+                }
+            }
+        }
+    }
+
+    return !physical_cores.empty();
+#else
+    // Not supported on this platform
+    return false;
+#endif
+}
+
 #if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
 #include <pthread.h>
 
@@ -269,12 +352,148 @@ void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model)
         }
     }
 
-    if (n_set && n_set < cpuparams.n_threads) {
+    // If a CPU mask is set, use the number of set CPUs as the thread count
+    if (cpuparams.mask_valid && n_set > 0) {
+        cpuparams.n_threads = n_set;
+    } else if (n_set && n_set < cpuparams.n_threads) {
         // Not enough set bits, may experience performance issues.
         LOG_WRN("Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set, cpuparams.n_threads);
     }
 }
 
+bool cpu_mask_set_physical_cores_only(bool (&boolmask)[GGML_MAX_N_THREADS]) {
+#ifdef _WIN32
+    // Windows implementation would require different approach
+    LOG_WRN("Physical core detection is not supported on Windows\n");
+    return false;
+#else
+    std::memset(boolmask, false, sizeof(bool) * GGML_MAX_N_THREADS);
+
+    // Use the common topology detection logic
+    std::vector<int> physical_cores;
+    if (!cpu_get_physical_cores_topology(physical_cores)) {
+        // Fallback: if we couldn't detect topology, just use all CPUs
+        int num_cpus = std::thread::hardware_concurrency();
+        for (int cpu = 0; cpu < num_cpus && cpu < GGML_MAX_N_THREADS; cpu++) {
+            boolmask[cpu] = true;
+        }
+        LOG_WRN("Could not detect CPU topology, using all CPUs\n");
+        return false;
+    }
+
+    // Set the mask for detected physical cores
+    for (int core_id : physical_cores) {
+        if (core_id < GGML_MAX_N_THREADS) {
+            boolmask[core_id] = true;
+        }
+    }
+
+    LOG("Detected %zu physical cores (excluding hyperthreads): ", physical_cores.size());
+    for (size_t i = 0; i < physical_cores.size(); i++) {
+        if (i > 0) LOG(", ");
+        LOG("%d", physical_cores[i]);
+    }
+    LOG("\n");
+
+    return true;
+#endif
+}
+
+bool cpu_mask_set_physical_cores_with_hyperthreading(bool (&boolmask)[GGML_MAX_N_THREADS]) {
+#ifdef _WIN32
+    // Windows implementation would require different approach
+    LOG_WRN("--cpu-use-hyperthreading is not supported on Windows\n");
+    return false;
+#else
+    std::memset(boolmask, false, sizeof(bool) * GGML_MAX_N_THREADS);
+
+    int num_cpus = std::thread::hardware_concurrency();
+    if (num_cpus <= 0) {
+        return false;
+    }
+
+    // Use the common topology detection logic to get all CPU sibling relationships
+    std::set<int> processed_cpus;
+    std::vector<int> all_cores_and_siblings;
+
+    for (int cpu = 0; cpu < num_cpus; cpu++) {
+        // Skip if we've already processed this CPU as part of another core's siblings
+        if (processed_cpus.count(cpu) > 0) {
+            continue;
+        }
+
+        std::string thread_siblings_path = "/sys/devices/system/cpu/cpu" + std::to_string(cpu) + "/topology/thread_siblings_list";
+        std::ifstream siblings_file(thread_siblings_path);
+
+        if (!siblings_file.is_open()) {
+            // If we can't read topology for this CPU, include it anyway
+            all_cores_and_siblings.push_back(cpu);
+            processed_cpus.insert(cpu);
+            continue;
+        }
+
+        std::string siblings_str;
+        if (std::getline(siblings_file, siblings_str)) {
+            // Parse the comma-separated list of sibling threads
+            std::vector<int> siblings;
+            std::stringstream ss(siblings_str);
+            std::string cpu_str;
+
+            while (std::getline(ss, cpu_str, ',')) {
+                try {
+                    int sibling_cpu = std::stoi(cpu_str);
+                    siblings.push_back(sibling_cpu);
+                } catch (const std::exception &) {
+                    // Skip invalid entries
+                }
+            }
+
+            if (!siblings.empty()) {
+                // Include ALL siblings (both physical core and hyperthreads)
+                for (int sibling : siblings) {
+                    all_cores_and_siblings.push_back(sibling);
+                    processed_cpus.insert(sibling);
+                }
+            } else {
+                // Fallback: include this CPU if no siblings found
+                all_cores_and_siblings.push_back(cpu);
+                processed_cpus.insert(cpu);
+            }
+        } else {
+            // Fallback: include this CPU if we can't read the file
+            all_cores_and_siblings.push_back(cpu);
+            processed_cpus.insert(cpu);
+        }
+    }
+
+    if (all_cores_and_siblings.empty()) {
+        // Fallback: if we couldn't detect topology, just use all CPUs
+        for (int cpu = 0; cpu < num_cpus && cpu < GGML_MAX_N_THREADS; cpu++) {
+            boolmask[cpu] = true;
+        }
+        LOG_WRN("Could not detect CPU topology, using all CPUs\n");
+        return false;
+    }
+
+    // Set the mask for all detected cores and their hyperthread siblings
+    for (int cpu_id : all_cores_and_siblings) {
+        if (cpu_id < GGML_MAX_N_THREADS) {
+            boolmask[cpu_id] = true;
+        }
+    }
+
+    LOG("Using %zu CPU cores including hyperthreads: ", all_cores_and_siblings.size());
+    std::sort(all_cores_and_siblings.begin(), all_cores_and_siblings.end());
+    for (size_t i = 0; i < all_cores_and_siblings.size(); i++) {
+        if (i > 0) LOG(", ");
+        LOG("%d", all_cores_and_siblings[i]);
+    }
+    LOG("\n");
+
+    return true;
+#endif
+}
+
 bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THREADS]) {
     size_t dash_loc = range.find('-');
     if (dash_loc == std::string::npos) {
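(Editor's aside, not part of the commit: a short sketch of the behavioral change in `postprocess_cpu_params()`. When a valid CPU mask is present, the thread count now follows the number of set bits instead of merely warning. The 8-core figure below is an assumed example value.)

```cpp
#include "common.h"

// Illustrative only: assumes the helper finds 8 physical cores on this machine.
void example_thread_count() {
    cpu_params p;
    p.n_threads  = 32;                                            // user- or default-requested value
    p.mask_valid = cpu_mask_set_physical_cores_only(p.cpumask);   // e.g. 8 bits end up set
    postprocess_cpu_params(p);
    // Old behavior: warn that 8 < 32. New behavior: p.n_threads == 8 (the set-bit count).
}
```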

common/common.h

Lines changed: 5 additions & 0 deletions
@@ -65,6 +65,10 @@ struct cpu_params {
 
 int32_t cpu_get_num_physical_cores();
 int32_t cpu_get_num_math();
+int32_t cpu_detect_physical_cores_topology(); // Detect actual physical cores using CPU topology
+bool cpu_get_physical_cores_topology(std::vector<int> & physical_cores); // Get list of physical core IDs
+bool cpu_mask_set_physical_cores_only(bool(&boolmask)[GGML_MAX_N_THREADS]);
+bool cpu_mask_set_physical_cores_with_hyperthreading(bool(&boolmask)[GGML_MAX_N_THREADS]); // Set mask to include physical cores + hyperthread siblings
 
 //
 // Common params
@@ -513,6 +517,7 @@ std::string common_params_get_system_info(const common_params & params);
 
 bool parse_cpu_range(const std::string & range, bool(&boolmask)[GGML_MAX_N_THREADS]);
 bool parse_cpu_mask(const std::string & mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
+bool cpu_mask_set_physical_cores_only(bool(&boolmask)[GGML_MAX_N_THREADS]);
 void postprocess_cpu_params(cpu_params & cpuparams, const cpu_params * role_model = nullptr);
 bool set_process_priority(enum ggml_sched_priority prio);
 
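(Editor's aside, not part of the commit: a minimal caller-side sketch of the topology helpers declared above, falling back to the existing heuristic when sysfs topology is unavailable.)

```cpp
#include <cstdio>
#include <vector>
#include "common.h"

int main() {
    std::vector<int> cores;
    if (cpu_get_physical_cores_topology(cores)) {
        // Topology-based count: one entry per physical core, hyperthreads excluded
        printf("physical cores (topology): %zu\n", cores.size());
    } else {
        // Non-Linux or unreadable sysfs: heuristic path inside cpu_get_num_physical_cores()
        printf("physical cores (heuristic): %d\n", cpu_get_num_physical_cores());
    }
    return 0;
}
```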