@@ -121,6 +121,8 @@ int32_t cpu_get_num_physical_cores() {
121121
122122#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
123123#include < pthread.h>
124+ #include < map>
125+ #include < set>
124126
125127static void cpuid (unsigned leaf, unsigned subleaf,
126128 unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx) {
@@ -152,19 +154,115 @@ static bool is_running_on_efficiency_core(void) {
152154 return core_type == intel_atom;
153155}
154156
155- static int cpu_count_math_cpus (int n_cpu) {
156- int result = 0 ;
157- for (int cpu = 0 ; cpu < n_cpu; ++cpu) {
158- if (pin_cpu (cpu)) {
159- return -1 ;
// Detailed CPU topology information, as filled in by detect_cpu_topology().
//
// Every counter starts at zero and every list starts empty, so a
// default-constructed instance is a well-defined "nothing detected" value
// rather than a struct with indeterminate integers.
struct cpu_topology_info {
    int total_logical_cpus   = 0; // number of logical CPUs that were probed
    int total_physical_cores = 0; // number of groups in core_siblings
    int performance_cores    = 0; // number of entries in performance_cpus (logical CPUs)
    int efficiency_cores     = 0; // number of entries in efficiency_cpus (logical CPUs)
    std::vector<std::vector<int>> core_siblings; // Groups of hyperthreaded CPUs (one group per core)
    std::vector<int> performance_cpus;           // CPU IDs that are performance cores
    std::vector<int> efficiency_cpus;            // CPU IDs that are efficiency cores
};
167+
168+ static cpu_topology_info detect_cpu_topology () {
169+ cpu_topology_info info = {};
170+ info.total_logical_cpus = sysconf (_SC_NPROCESSORS_ONLN);
171+
172+ // Map to group CPUs by their thread siblings
173+ std::map<std::string, std::vector<int >> sibling_groups;
174+
175+ // Read topology information for each CPU
176+ for (int cpu = 0 ; cpu < info.total_logical_cpus ; ++cpu) {
177+ // Read thread siblings to identify hyperthreading groups
178+ std::ifstream siblings_file (" /sys/devices/system/cpu/cpu" + std::to_string (cpu) + " /topology/thread_siblings_list" );
179+ if (siblings_file.is_open ()) {
180+ std::string siblings_str;
181+ std::getline (siblings_file, siblings_str);
182+ sibling_groups[siblings_str].push_back (cpu);
160183 }
161- if (is_running_on_efficiency_core ()) {
162- continue ; // efficiency cores harm lockstep threading
184+
185+ // Test if this CPU is a performance or efficiency core
186+ if (pin_cpu (cpu) == 0 ) {
187+ if (is_running_on_efficiency_core ()) {
188+ info.efficiency_cpus .push_back (cpu);
189+ } else {
190+ info.performance_cpus .push_back (cpu);
191+ }
163192 }
164- ++cpu; // hyperthreading isn't useful for linear algebra
165- ++result;
166193 }
167- return result;
194+
195+ // Convert sibling groups to core_siblings vector
196+ for (const auto & group : sibling_groups) {
197+ info.core_siblings .push_back (group.second );
198+ }
199+
200+ info.total_physical_cores = info.core_siblings .size ();
201+ info.performance_cores = info.performance_cpus .size ();
202+ info.efficiency_cores = info.efficiency_cpus .size ();
203+
204+ return info;
205+ }
206+
207+ static int cpu_count_math_cpus (int n_cpu, bool use_hyperthreading = false , bool use_efficiency_cores = false ) {
208+ cpu_topology_info topo = detect_cpu_topology ();
209+
210+ std::vector<int > selected_cpus;
211+
212+ // First, select which types of cores to use
213+ std::vector<int > candidate_cpus;
214+ if (!use_efficiency_cores) {
215+ // Use only performance cores
216+ candidate_cpus = topo.performance_cpus ;
217+ } else {
218+ // Use all cores
219+ candidate_cpus.reserve (topo.total_logical_cpus );
220+ candidate_cpus.insert (candidate_cpus.end (), topo.performance_cpus .begin (), topo.performance_cpus .end ());
221+ candidate_cpus.insert (candidate_cpus.end (), topo.efficiency_cpus .begin (), topo.efficiency_cpus .end ());
222+ }
223+
224+ if (use_hyperthreading) {
225+ // Use all candidate CPUs
226+ selected_cpus = candidate_cpus;
227+ } else {
228+ // Select only one CPU per physical core
229+ std::set<int > used_cores;
230+ for (int cpu : candidate_cpus) {
231+ // Find which core group this CPU belongs to
232+ for (const auto & core_group : topo.core_siblings ) {
233+ if (std::find (core_group.begin (), core_group.end (), cpu) != core_group.end ()) {
234+ // Use a hash of the core group to identify unique cores
235+ std::string core_id;
236+ for (int sibling : core_group) {
237+ core_id += std::to_string (sibling) + " ," ;
238+ }
239+ size_t core_hash = std::hash<std::string>{}(core_id);
240+
241+ if (used_cores.find (core_hash) == used_cores.end ()) {
242+ selected_cpus.push_back (cpu);
243+ used_cores.insert (core_hash);
244+ }
245+ break ;
246+ }
247+ }
248+ }
249+ }
250+
251+ // Validate selected CPUs by attempting to pin to them
252+ int valid_count = 0 ;
253+ cpu_set_t original_affinity;
254+ pthread_getaffinity_np (pthread_self (), sizeof (original_affinity), &original_affinity);
255+
256+ for (int cpu : selected_cpus) {
257+ if (pin_cpu (cpu) == 0 ) {
258+ valid_count++;
259+ }
260+ }
261+
262+ // Restore original affinity
263+ pthread_setaffinity_np (pthread_self (), sizeof (original_affinity), &original_affinity);
264+
265+ return valid_count;
168266}
169267
170268#endif // __x86_64__ && __linux__
@@ -178,10 +276,40 @@ int32_t cpu_get_num_math() {
178276 if (n_cpu < 1 ) {
179277 return cpu_get_num_physical_cores ();
180278 }
279+
280+ if (is_hybrid_cpu ()) {
281+ cpu_set_t affinity;
282+ if (!pthread_getaffinity_np (pthread_self (), sizeof (affinity), &affinity)) {
283+ // Default behavior: use hyperthreading but not efficiency cores for math
284+ // This can be overridden by environment variables or command-line options
285+ bool use_hyperthreading = std::getenv (" LLAMA_NO_HYPERTHREADING" ) == nullptr ;
286+ bool use_efficiency_cores = std::getenv (" LLAMA_USE_EFFICIENCY_CORES" ) != nullptr ;
287+
288+ int result = cpu_count_math_cpus (n_cpu, use_hyperthreading, use_efficiency_cores);
289+ pthread_setaffinity_np (pthread_self (), sizeof (affinity), &affinity);
290+ if (result > 0 ) {
291+ return result;
292+ }
293+ }
294+ }
295+ #endif
296+ return cpu_get_num_physical_cores ();
297+ }
298+
299+ /* *
300+ * Returns number of CPUs on system that are useful for math, respecting cpu_params.
301+ */
302+ int32_t cpu_get_num_math_from_params (const cpu_params & params) {
303+ #if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
304+ int n_cpu = sysconf (_SC_NPROCESSORS_ONLN);
305+ if (n_cpu < 1 ) {
306+ return cpu_get_num_physical_cores ();
307+ }
308+
181309 if (is_hybrid_cpu ()) {
182310 cpu_set_t affinity;
183311 if (!pthread_getaffinity_np (pthread_self (), sizeof (affinity), &affinity)) {
184- int result = cpu_count_math_cpus (n_cpu);
312+ int result = cpu_count_math_cpus (n_cpu, params. use_hyperthreading , params. use_efficiency_cores );
185313 pthread_setaffinity_np (pthread_self (), sizeof (affinity), &affinity);
186314 if (result > 0 ) {
187315 return result;
@@ -192,6 +320,62 @@ int32_t cpu_get_num_math() {
192320 return cpu_get_num_physical_cores ();
193321}
194322
323+ /* *
324+ * Print CPU topology information for debugging
325+ */
326+ void cpu_print_topology_info () {
327+ #if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
328+ if (is_hybrid_cpu ()) {
329+ cpu_topology_info topo = detect_cpu_topology ();
330+
331+ printf (" CPU Topology Information:\n " );
332+ printf (" Total logical CPUs: %d\n " , topo.total_logical_cpus );
333+ printf (" Total physical cores: %d\n " , topo.total_physical_cores );
334+ printf (" Performance cores: %d\n " , topo.performance_cores );
335+ printf (" Efficiency cores: %d\n " , topo.efficiency_cores );
336+
337+ printf (" Performance CPU IDs: " );
338+ for (size_t i = 0 ; i < topo.performance_cpus .size (); ++i) {
339+ if (i > 0 ) printf (" , " );
340+ printf (" %d" , topo.performance_cpus [i]);
341+ }
342+ printf (" \n " );
343+
344+ if (!topo.efficiency_cpus .empty ()) {
345+ printf (" Efficiency CPU IDs: " );
346+ for (size_t i = 0 ; i < topo.efficiency_cpus .size (); ++i) {
347+ if (i > 0 ) printf (" , " );
348+ printf (" %d" , topo.efficiency_cpus [i]);
349+ }
350+ printf (" \n " );
351+ }
352+
353+ printf (" Core sibling groups (hyperthreading):\n " );
354+ for (size_t i = 0 ; i < topo.core_siblings .size (); ++i) {
355+ printf (" Core %zu: " , i);
356+ for (size_t j = 0 ; j < topo.core_siblings [i].size (); ++j) {
357+ if (j > 0 ) printf (" , " );
358+ printf (" %d" , topo.core_siblings [i][j]);
359+ }
360+ printf (" \n " );
361+ }
362+
363+ // Show what would be selected with different options
364+ printf (" \n Thread count recommendations:\n " );
365+ printf (" Default (P-cores + hyperthreading): %d\n " , cpu_count_math_cpus (topo.total_logical_cpus , true , false ));
366+ printf (" Without hyperthreading: %d\n " , cpu_count_math_cpus (topo.total_logical_cpus , false , false ));
367+ printf (" With E-cores (+ HT): %d\n " , cpu_count_math_cpus (topo.total_logical_cpus , true , true ));
368+ printf (" With E-cores (no HT): %d\n " , cpu_count_math_cpus (topo.total_logical_cpus , false , true ));
369+ } else {
370+ printf (" CPU Topology: Non-hybrid CPU detected\n " );
371+ printf (" Physical cores: %d\n " , cpu_get_num_physical_cores ());
372+ printf (" Logical CPUs: %d\n " , (int )std::thread::hardware_concurrency ());
373+ }
374+ #else
375+ printf (" CPU topology detection not available on this platform\n " );
376+ #endif
377+ }
378+
195379// Helper for setting process priority
196380
197381#if defined(_WIN32)
@@ -258,7 +442,7 @@ void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model)
258442 if (role_model != nullptr ) {
259443 cpuparams = *role_model;
260444 } else {
261- cpuparams.n_threads = cpu_get_num_math ( );
445+ cpuparams.n_threads = cpu_get_num_math_from_params (cpuparams );
262446 }
263447 }
264448
0 commit comments