@@ -121,6 +121,8 @@ int32_t cpu_get_num_physical_cores() {
121121
122122#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
123123#include < pthread.h>
124+ #include < map>
125+ #include < set>
124126
125127static void cpuid (unsigned leaf, unsigned subleaf,
126128 unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx) {
@@ -152,19 +154,116 @@ static bool is_running_on_efficiency_core(void) {
152154 return core_type == intel_atom;
153155}
154156
155- static int cpu_count_math_cpus (int n_cpu) {
156- int result = 0 ;
157- for (int cpu = 0 ; cpu < n_cpu; ++cpu) {
158- if (pin_cpu (cpu)) {
159- return -1 ;
// Snapshot of the CPU layout as filled in by detect_cpu_topology().
// Every counter defaults to zero so a default-constructed instance is
// well-defined even before detection has run.
struct cpu_topology_info {
    int total_logical_cpus   = 0; // logical CPUs that were probed
    int total_physical_cores = 0; // number of entries in core_siblings
    int performance_cores    = 0; // number of entries in performance_cpus
    int efficiency_cores     = 0; // number of entries in efficiency_cpus
    std::vector<std::vector<int>> core_siblings; // one entry per core: the logical CPU IDs that share it
    std::vector<int> performance_cpus;           // logical CPU IDs classified as performance cores
    std::vector<int> efficiency_cpus;            // logical CPU IDs classified as efficiency cores
};
167+
168+ static cpu_topology_info detect_cpu_topology () {
169+ cpu_topology_info info = {};
170+ info.total_logical_cpus = sysconf (_SC_NPROCESSORS_ONLN);
171+
172+ // Map to group CPUs by their thread siblings
173+ std::map<std::string, std::vector<int >> sibling_groups;
174+
175+ // Read topology information for each CPU
176+ for (int cpu = 0 ; cpu < info.total_logical_cpus ; ++cpu) {
177+ // Read thread siblings to identify hyperthreading groups
178+ std::ifstream siblings_file (" /sys/devices/system/cpu/cpu" + std::to_string (cpu) + " /topology/thread_siblings_list" );
179+ if (siblings_file.is_open ()) {
180+ std::string siblings_str;
181+ std::getline (siblings_file, siblings_str);
182+ sibling_groups[siblings_str].push_back (cpu);
160183 }
161- if (is_running_on_efficiency_core ()) {
162- continue ; // efficiency cores harm lockstep threading
184+
185+ // Test if this CPU is a performance or efficiency core
186+ if (pin_cpu (cpu) == 0 ) {
187+ if (is_running_on_efficiency_core ()) {
188+ info.efficiency_cpus .push_back (cpu);
189+ } else {
190+ info.performance_cpus .push_back (cpu);
191+ }
163192 }
164- ++cpu; // hyperthreading isn't useful for linear algebra
165- ++result;
166193 }
167- return result;
194+
195+ // Convert sibling groups to core_siblings vector
196+ for (const auto & group : sibling_groups) {
197+ info.core_siblings .push_back (group.second );
198+ }
199+
200+ info.total_physical_cores = info.core_siblings .size ();
201+ info.performance_cores = info.performance_cpus .size ();
202+ info.efficiency_cores = info.efficiency_cpus .size ();
203+
204+ return info;
205+ }
206+
207+ static int cpu_count_math_cpus (int n_cpu, bool use_hyperthreading = false , bool use_efficiency_cores = false ) {
208+ GGML_UNUSED (n_cpu);
209+ cpu_topology_info topo = detect_cpu_topology ();
210+
211+ std::vector<int > selected_cpus;
212+
213+ // First, select which types of cores to use
214+ std::vector<int > candidate_cpus;
215+ if (!use_efficiency_cores) {
216+ // Use only performance cores
217+ candidate_cpus = topo.performance_cpus ;
218+ } else {
219+ // Use all cores
220+ candidate_cpus.reserve (topo.total_logical_cpus );
221+ candidate_cpus.insert (candidate_cpus.end (), topo.performance_cpus .begin (), topo.performance_cpus .end ());
222+ candidate_cpus.insert (candidate_cpus.end (), topo.efficiency_cpus .begin (), topo.efficiency_cpus .end ());
223+ }
224+
225+ if (use_hyperthreading) {
226+ // Use all candidate CPUs
227+ selected_cpus = candidate_cpus;
228+ } else {
229+ // Select only one CPU per physical core
230+ std::set<int > used_cores;
231+ for (int cpu : candidate_cpus) {
232+ // Find which core group this CPU belongs to
233+ for (const auto & core_group : topo.core_siblings ) {
234+ if (std::find (core_group.begin (), core_group.end (), cpu) != core_group.end ()) {
235+ // Use a hash of the core group to identify unique cores
236+ std::string core_id;
237+ for (int sibling : core_group) {
238+ core_id += std::to_string (sibling) + " ," ;
239+ }
240+ size_t core_hash = std::hash<std::string>{}(core_id);
241+
242+ if (used_cores.find (core_hash) == used_cores.end ()) {
243+ selected_cpus.push_back (cpu);
244+ used_cores.insert (core_hash);
245+ }
246+ break ;
247+ }
248+ }
249+ }
250+ }
251+
252+ // Validate selected CPUs by attempting to pin to them
253+ int valid_count = 0 ;
254+ cpu_set_t original_affinity;
255+ pthread_getaffinity_np (pthread_self (), sizeof (original_affinity), &original_affinity);
256+
257+ for (int cpu : selected_cpus) {
258+ if (pin_cpu (cpu) == 0 ) {
259+ valid_count++;
260+ }
261+ }
262+
263+ // Restore original affinity
264+ pthread_setaffinity_np (pthread_self (), sizeof (original_affinity), &original_affinity);
265+
266+ return valid_count;
168267}
169268
170269#endif // __x86_64__ && __linux__
@@ -178,10 +277,40 @@ int32_t cpu_get_num_math() {
178277 if (n_cpu < 1 ) {
179278 return cpu_get_num_physical_cores ();
180279 }
280+
281+ if (is_hybrid_cpu ()) {
282+ cpu_set_t affinity;
283+ if (!pthread_getaffinity_np (pthread_self (), sizeof (affinity), &affinity)) {
284+ // Default behavior: use hyperthreading and efficiency cores for math
285+ // This can be overridden by environment variables or command-line options
286+ bool use_hyperthreading = std::getenv (" LLAMA_CPU_NO_HYPERTHREADING" ) == nullptr ;
287+ bool use_efficiency_cores = std::getenv (" LLAMA_CPU_NO_EFFICIENCY_CORES" ) == nullptr ;
288+
289+ int result = cpu_count_math_cpus (n_cpu, use_hyperthreading, use_efficiency_cores);
290+ pthread_setaffinity_np (pthread_self (), sizeof (affinity), &affinity);
291+ if (result > 0 ) {
292+ return result;
293+ }
294+ }
295+ }
296+ #endif
297+ return cpu_get_num_physical_cores ();
298+ }
299+
300+ /* *
301+ * Returns number of CPUs on system that are useful for math, respecting cpu_params.
302+ */
303+ int32_t cpu_get_num_math_from_params (const cpu_params & params) {
304+ #if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
305+ int n_cpu = sysconf (_SC_NPROCESSORS_ONLN);
306+ if (n_cpu < 1 ) {
307+ return cpu_get_num_physical_cores ();
308+ }
309+
181310 if (is_hybrid_cpu ()) {
182311 cpu_set_t affinity;
183312 if (!pthread_getaffinity_np (pthread_self (), sizeof (affinity), &affinity)) {
184- int result = cpu_count_math_cpus (n_cpu);
313+ int result = cpu_count_math_cpus (n_cpu, params. use_hyperthreading , params. use_efficiency_cores );
185314 pthread_setaffinity_np (pthread_self (), sizeof (affinity), &affinity);
186315 if (result > 0 ) {
187316 return result;
@@ -192,6 +321,62 @@ int32_t cpu_get_num_math() {
192321 return cpu_get_num_physical_cores ();
193322}
194323
324+ /* *
325+ * Print CPU topology information for debugging
326+ */
327+ void cpu_print_topology_info () {
328+ #if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
329+ if (is_hybrid_cpu ()) {
330+ cpu_topology_info topo = detect_cpu_topology ();
331+
332+ printf (" CPU Topology Information:\n " );
333+ printf (" Total logical CPUs: %d\n " , topo.total_logical_cpus );
334+ printf (" Total physical cores: %d\n " , topo.total_physical_cores );
335+ printf (" Performance cores: %d\n " , topo.performance_cores );
336+ printf (" Efficiency cores: %d\n " , topo.efficiency_cores );
337+
338+ printf (" Performance CPU IDs: " );
339+ for (size_t i = 0 ; i < topo.performance_cpus .size (); ++i) {
340+ if (i > 0 ) printf (" , " );
341+ printf (" %d" , topo.performance_cpus [i]);
342+ }
343+ printf (" \n " );
344+
345+ if (!topo.efficiency_cpus .empty ()) {
346+ printf (" Efficiency CPU IDs: " );
347+ for (size_t i = 0 ; i < topo.efficiency_cpus .size (); ++i) {
348+ if (i > 0 ) printf (" , " );
349+ printf (" %d" , topo.efficiency_cpus [i]);
350+ }
351+ printf (" \n " );
352+ }
353+
354+ printf (" Core sibling groups (hyperthreading):\n " );
355+ for (size_t i = 0 ; i < topo.core_siblings .size (); ++i) {
356+ printf (" Core %zu: " , i);
357+ for (size_t j = 0 ; j < topo.core_siblings [i].size (); ++j) {
358+ if (j > 0 ) printf (" , " );
359+ printf (" %d" , topo.core_siblings [i][j]);
360+ }
361+ printf (" \n " );
362+ }
363+
364+ // Show what would be selected with different options
365+ printf (" \n Thread count recommendations:\n " );
366+ printf (" Default (P-cores + hyperthreading): %d\n " , cpu_count_math_cpus (topo.total_logical_cpus , true , false ));
367+ printf (" Without hyperthreading: %d\n " , cpu_count_math_cpus (topo.total_logical_cpus , false , false ));
368+ printf (" With E-cores (+ HT): %d\n " , cpu_count_math_cpus (topo.total_logical_cpus , true , true ));
369+ printf (" With E-cores (no HT): %d\n " , cpu_count_math_cpus (topo.total_logical_cpus , false , true ));
370+ } else {
371+ printf (" CPU Topology: Non-hybrid CPU detected\n " );
372+ printf (" Physical cores: %d\n " , cpu_get_num_physical_cores ());
373+ printf (" Logical CPUs: %d\n " , (int )std::thread::hardware_concurrency ());
374+ }
375+ #else
376+ printf (" CPU topology detection not available on this platform\n " );
377+ #endif
378+ }
379+
195380// Helper for setting process priority
196381
197382#if defined(_WIN32)
@@ -258,7 +443,7 @@ void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model)
258443 if (role_model != nullptr ) {
259444 cpuparams = *role_model;
260445 } else {
261- cpuparams.n_threads = cpu_get_num_math ( );
446+ cpuparams.n_threads = cpu_get_num_math_from_params (cpuparams );
262447 }
263448 }
264449
0 commit comments