|
22 | 22 | #include <iostream> |
23 | 23 | #include <iterator> |
24 | 24 | #include <regex> |
| 25 | +#include <set> |
25 | 26 | #include <sstream> |
26 | 27 | #include <string> |
27 | 28 | #include <thread> |
@@ -116,10 +117,92 @@ int32_t cpu_get_num_physical_cores() { |
116 | 117 |
|
117 | 118 | return num_physical_cores > 0 ? num_physical_cores : default_threads; |
118 | 119 | #endif |
| 120 | + // Try to use accurate topology detection first |
| 121 | + int32_t topology_cores = cpu_detect_physical_cores_topology(); |
| 122 | + if (topology_cores > 0) { |
| 123 | + return topology_cores; |
| 124 | + } |
| 125 | + |
| 126 | + // Fallback to heuristic if topology detection failed |
119 | 127 | unsigned int n_threads = std::thread::hardware_concurrency(); |
120 | 128 | return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4; |
121 | 129 | } |
122 | 130 |
|
| 131 | +int32_t cpu_detect_physical_cores_topology() { |
| 132 | + std::vector<int> physical_cores; |
| 133 | + if (cpu_get_physical_cores_topology(physical_cores)) { |
| 134 | + return static_cast<int32_t>(physical_cores.size()); |
| 135 | + } |
| 136 | + return 0; // Indicate detection failed |
| 137 | +} |
| 138 | + |
// Parse a sysfs "cpulist" string (e.g. "0-3,8,10-11") and append the
// individual CPU ids to `cpus`. Malformed entries are skipped.
// NOTE: sysfs sibling lists may use range notation ("0-1"); parsing each
// comma-separated entry with plain std::stoi would silently truncate a
// range to its first CPU and drop the siblings.
static void parse_sysfs_cpulist(const std::string & cpulist, std::vector<int> & cpus) {
    std::stringstream ss(cpulist);
    std::string entry;
    while (std::getline(ss, entry, ',')) {
        try {
            size_t dash_pos = entry.find('-');
            int first = std::stoi(entry);
            int last  = (dash_pos == std::string::npos) ? first : std::stoi(entry.substr(dash_pos + 1));
            for (int cpu = first; cpu <= last; cpu++) {
                cpus.push_back(cpu);
            }
        } catch (const std::exception &) {
            // Skip invalid entries
        }
    }
}

// Detect physical cores (excluding hyperthread siblings) via Linux sysfs.
// On success, fills `physical_cores` with the lowest-numbered logical CPU of
// each physical core and returns true. Returns false (with an empty vector)
// on non-Linux/Android platforms or when topology could not be read.
bool cpu_get_physical_cores_topology(std::vector<int> & physical_cores) {
    physical_cores.clear();

#if defined(__linux__) && !defined(__ANDROID__)
    // Use Linux sysfs topology detection for accurate physical core detection
    int num_cpus = std::thread::hardware_concurrency();
    if (num_cpus <= 0) {
        return false;
    }

    std::set<int> processed_cpus;

    for (int cpu = 0; cpu < num_cpus; cpu++) {
        // Skip if we've already processed this CPU as part of another core's siblings
        if (processed_cpus.count(cpu) > 0) {
            continue;
        }

        std::string thread_siblings_path = "/sys/devices/system/cpu/cpu" + std::to_string(cpu) + "/topology/thread_siblings_list";
        std::ifstream siblings_file(thread_siblings_path);

        if (!siblings_file.is_open()) {
            // If we can't read topology for this CPU, skip it but don't mark as physical
            continue;
        }

        std::string siblings_str;
        if (!std::getline(siblings_file, siblings_str)) {
            continue;
        }

        std::vector<int> siblings;
        parse_sysfs_cpulist(siblings_str, siblings);
        if (siblings.empty()) {
            continue;
        }

        // Sort siblings so the lowest-numbered one is the canonical "primary"
        // logical CPU of the physical core.
        std::sort(siblings.begin(), siblings.end());

        // Only count this as a physical core if the current CPU is the
        // lowest-numbered sibling, so each core is counted exactly once.
        if (siblings[0] == cpu) {
            physical_cores.push_back(cpu);
        }

        // Mark all siblings as processed so we don't consider them again
        for (int sibling : siblings) {
            processed_cpus.insert(sibling);
        }
    }

    return !physical_cores.empty();
#else
    // Not supported on this platform
    return false;
#endif
}
| 205 | + |
123 | 206 | #if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__) |
124 | 207 | #include <pthread.h> |
125 | 208 |
|
@@ -269,12 +352,148 @@ void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model) |
269 | 352 | } |
270 | 353 | } |
271 | 354 |
|
272 | | - if (n_set && n_set < cpuparams.n_threads) { |
| 355 | + // If a CPU mask is set, use the number of set CPUs as the thread count |
| 356 | + if (cpuparams.mask_valid && n_set > 0) { |
| 357 | + cpuparams.n_threads = n_set; |
| 358 | + } else if (n_set && n_set < cpuparams.n_threads) { |
273 | 359 | // Not enough set bits, may experience performance issues. |
274 | 360 | LOG_WRN("Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set, cpuparams.n_threads); |
275 | 361 | } |
276 | 362 | } |
277 | 363 |
|
| 364 | +bool cpu_mask_set_physical_cores_only(bool (&boolmask)[GGML_MAX_N_THREADS]) { |
| 365 | +#ifdef _WIN32 |
| 366 | + // Windows implementation would require different approach |
| 367 | + LOG_WRN("Physical core detection is not supported on Windows\n"); |
| 368 | + return false; |
| 369 | +#else |
| 370 | + std::memset(boolmask, false, sizeof(bool) * GGML_MAX_N_THREADS); |
| 371 | + |
| 372 | + // Use the common topology detection logic |
| 373 | + std::vector<int> physical_cores; |
| 374 | + if (!cpu_get_physical_cores_topology(physical_cores)) { |
| 375 | + // Fallback: if we couldn't detect topology, just use all CPUs |
| 376 | + int num_cpus = std::thread::hardware_concurrency(); |
| 377 | + for (int cpu = 0; cpu < num_cpus && cpu < GGML_MAX_N_THREADS; cpu++) { |
| 378 | + boolmask[cpu] = true; |
| 379 | + } |
| 380 | + LOG_WRN("Could not detect CPU topology, using all CPUs\n"); |
| 381 | + return false; |
| 382 | + } |
| 383 | + |
| 384 | + // Set the mask for detected physical cores |
| 385 | + for (int core_id : physical_cores) { |
| 386 | + if (core_id < GGML_MAX_N_THREADS) { |
| 387 | + boolmask[core_id] = true; |
| 388 | + } |
| 389 | + } |
| 390 | + |
| 391 | + LOG("Detected %zu physical cores (excluding hyperthreads): ", physical_cores.size()); |
| 392 | + for (size_t i = 0; i < physical_cores.size(); i++) { |
| 393 | + if (i > 0) LOG(", "); |
| 394 | + LOG("%d", physical_cores[i]); |
| 395 | + } |
| 396 | + LOG("\n"); |
| 397 | + |
| 398 | + return true; |
| 399 | +#endif |
| 400 | +} |
| 401 | + |
| 402 | +bool cpu_mask_set_physical_cores_with_hyperthreading(bool (&boolmask)[GGML_MAX_N_THREADS]) { |
| 403 | +#ifdef _WIN32 |
| 404 | + // Windows implementation would require different approach |
| 405 | + LOG_WRN("--cpu-use-hyperthreading is not supported on Windows\n"); |
| 406 | + return false; |
| 407 | +#else |
| 408 | + std::memset(boolmask, false, sizeof(bool) * GGML_MAX_N_THREADS); |
| 409 | + |
| 410 | + int num_cpus = std::thread::hardware_concurrency(); |
| 411 | + if (num_cpus <= 0) { |
| 412 | + return false; |
| 413 | + } |
| 414 | + |
| 415 | + // Use the common topology detection logic to get all CPU sibling relationships |
| 416 | + std::set<int> processed_cpus; |
| 417 | + std::vector<int> all_cores_and_siblings; |
| 418 | + |
| 419 | + for (int cpu = 0; cpu < num_cpus; cpu++) { |
| 420 | + // Skip if we've already processed this CPU as part of another core's siblings |
| 421 | + if (processed_cpus.count(cpu) > 0) { |
| 422 | + continue; |
| 423 | + } |
| 424 | + |
| 425 | + std::string thread_siblings_path = "/sys/devices/system/cpu/cpu" + std::to_string(cpu) + "/topology/thread_siblings_list"; |
| 426 | + std::ifstream siblings_file(thread_siblings_path); |
| 427 | + |
| 428 | + if (!siblings_file.is_open()) { |
| 429 | + // If we can't read topology for this CPU, include it anyway |
| 430 | + all_cores_and_siblings.push_back(cpu); |
| 431 | + processed_cpus.insert(cpu); |
| 432 | + continue; |
| 433 | + } |
| 434 | + |
| 435 | + std::string siblings_str; |
| 436 | + if (std::getline(siblings_file, siblings_str)) { |
| 437 | + // Parse the comma-separated list of sibling threads |
| 438 | + std::vector<int> siblings; |
| 439 | + std::stringstream ss(siblings_str); |
| 440 | + std::string cpu_str; |
| 441 | + |
| 442 | + while (std::getline(ss, cpu_str, ',')) { |
| 443 | + try { |
| 444 | + int sibling_cpu = std::stoi(cpu_str); |
| 445 | + siblings.push_back(sibling_cpu); |
| 446 | + } catch (const std::exception &) { |
| 447 | + // Skip invalid entries |
| 448 | + } |
| 449 | + } |
| 450 | + |
| 451 | + if (!siblings.empty()) { |
| 452 | + // Include ALL siblings (both physical core and hyperthreads) |
| 453 | + for (int sibling : siblings) { |
| 454 | + all_cores_and_siblings.push_back(sibling); |
| 455 | + processed_cpus.insert(sibling); |
| 456 | + } |
| 457 | + } else { |
| 458 | + // Fallback: include this CPU if no siblings found |
| 459 | + all_cores_and_siblings.push_back(cpu); |
| 460 | + processed_cpus.insert(cpu); |
| 461 | + } |
| 462 | + } else { |
| 463 | + // Fallback: include this CPU if we can't read the file |
| 464 | + all_cores_and_siblings.push_back(cpu); |
| 465 | + processed_cpus.insert(cpu); |
| 466 | + } |
| 467 | + } |
| 468 | + |
| 469 | + if (all_cores_and_siblings.empty()) { |
| 470 | + // Fallback: if we couldn't detect topology, just use all CPUs |
| 471 | + for (int cpu = 0; cpu < num_cpus && cpu < GGML_MAX_N_THREADS; cpu++) { |
| 472 | + boolmask[cpu] = true; |
| 473 | + } |
| 474 | + LOG_WRN("Could not detect CPU topology, using all CPUs\n"); |
| 475 | + return false; |
| 476 | + } |
| 477 | + |
| 478 | + // Set the mask for all detected cores and their hyperthread siblings |
| 479 | + for (int cpu_id : all_cores_and_siblings) { |
| 480 | + if (cpu_id < GGML_MAX_N_THREADS) { |
| 481 | + boolmask[cpu_id] = true; |
| 482 | + } |
| 483 | + } |
| 484 | + |
| 485 | + LOG("Using %zu CPU cores including hyperthreads: ", all_cores_and_siblings.size()); |
| 486 | + std::sort(all_cores_and_siblings.begin(), all_cores_and_siblings.end()); |
| 487 | + for (size_t i = 0; i < all_cores_and_siblings.size(); i++) { |
| 488 | + if (i > 0) LOG(", "); |
| 489 | + LOG("%d", all_cores_and_siblings[i]); |
| 490 | + } |
| 491 | + LOG("\n"); |
| 492 | + |
| 493 | + return true; |
| 494 | +#endif |
| 495 | +} |
| 496 | + |
278 | 497 | bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THREADS]) { |
279 | 498 | size_t dash_loc = range.find('-'); |
280 | 499 | if (dash_loc == std::string::npos) { |
|
0 commit comments