Skip to content

Commit 79eac27

Browse files
author
savesanketsw
committed
cpu_pnp_strategy changes
1 parent 80d0d6b commit 79eac27

File tree

5 files changed

+166
-25
lines changed

5 files changed

+166
-25
lines changed

common/arg.cpp

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -564,6 +564,26 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
564564
params.cpuparams.priority = (enum ggml_sched_priority) prio;
565565
}
566566
));
567+
add_opt(common_arg(
568+
{ "-CPnP", "--cpu-pnp-strategy" }, "N",
569+
string_format("set CPU PnP strategy : 0-disabled, 1-efficiency (default: %d)\n", params.cpuparams.cpu_pnp_strategy),
570+
[](common_params& params, int strategy) {
571+
if (strategy < 0 || strategy > 1) {
572+
throw std::invalid_argument("invalid value");
573+
}
574+
params.cpuparams.cpu_pnp_strategy = (enum ggml_cpu_pnp_strategy)strategy;
575+
}
576+
));
577+
add_opt(common_arg(
578+
{ "-CPnPb", "--cpu-pnp-strategy-batch" }, "N",
579+
string_format("set CPU PnP strategy batch : 0-disabled, 1-efficiency (default: %d)\n", params.cpuparams.cpu_pnp_strategy),
580+
[](common_params& params, int strategy) {
581+
if (strategy < 0 || strategy > 1) {
582+
throw std::invalid_argument("invalid value");
583+
}
584+
params.cpuparams_batch.cpu_pnp_strategy = (enum ggml_cpu_pnp_strategy)strategy;
585+
}
586+
));
567587
add_opt(common_arg(
568588
{"--poll"}, "<0...100>",
569589
string_format("use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll),

common/common.cpp

Lines changed: 102 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,77 @@ using json = nlohmann::ordered_json;
9797
// CPU utils
9898
//
9999

100+
#if defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
101+
102+
// Print CPU Information
103+
// Log a human-readable summary of the detected CPU topology:
// hybrid flag, core counts and the E-/P-core affinity masks.
void print_cpu_info(const cpu_info & info) {
    const char * hybrid = info.is_hybrid ? "Yes" : "No";
    const std::string e_mask = info.e_core_affinity_mask.to_string();
    const std::string p_mask = info.p_core_affinity_mask.to_string();

    LOG_INF("CPU Information:\n");
    LOG_INF("----------------\n");
    LOG_INF("Is Hybrid Architecture: %s\n", hybrid);
    LOG_INF("Number of Logical Cores: %d\n", info.num_logical_cores);
    LOG_INF("Number of Physical Cores: %d\n", info.num_physical_cores);
    LOG_INF("Number of Performance Cores (P-Cores): %d\n", info.num_p_cores);
    LOG_INF("Number of Efficient Cores (E-Cores): %d\n", info.num_e_cores);
    LOG_INF("\nE-Core Affinity Mask:\n");
    LOG_INF("%s\n", e_mask.c_str());
    LOG_INF("\nP-Core Affinity Mask:\n");
    LOG_INF("%s\n", p_mask.c_str());
}
116+
117+
// Populate CPU Information
118+
// Populate CPU Information
//
// Queries the Windows core topology via GetLogicalProcessorInformationEx
// (RelationProcessorCore) and fills c_info with logical/physical core counts,
// P-/E-core counts and the per-class affinity masks. Cores with
// EfficiencyClass 0 are counted as efficient, any higher class as performance;
// on a non-hybrid CPU every core reports class 0, so they all land in the
// E-core bucket and is_hybrid stays false.
//
// Returns 1 on success, 0 if the topology could not be queried.
int get_cpu_info(cpu_info& c_info) {
    DWORD buffer_size = 0;

    // first call is expected to fail with ERROR_INSUFFICIENT_BUFFER and
    // report the required buffer size; any other failure is fatal
    if (!GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &buffer_size)) {
        if (GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
            return 0;
        }
    }

    std::vector<char> buffer(buffer_size);
    if (!GetLogicalProcessorInformationEx(RelationProcessorCore, reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data()), &buffer_size)) {
        return 0;
    }

    // reset ALL output fields: previously only the counters were zeroed, so a
    // reused cpu_info kept stale affinity-mask bits and a stale is_hybrid flag
    c_info = cpu_info{};

    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data());
    while (buffer_size > 0) {
        if (info->Relationship == RelationProcessorCore) {
            c_info.num_physical_cores++;
            for (WORD i = 0; i < info->Processor.GroupCount; ++i) {
                const GROUP_AFFINITY * groupAffinity = &info->Processor.GroupMask[i];
                const WORD      groupNumber = groupAffinity->Group;
                const KAFFINITY mask        = groupAffinity->Mask;
                // each Windows processor group holds up to 64 logical processors
                const int baseIndex = groupNumber * 64;
                // __popcnt64 returns unsigned __int64 — narrow explicitly
                const int n_bits    = (int) __popcnt64(mask);

                c_info.num_logical_cores += n_bits;
                if (info->Processor.EfficiencyClass < 1) {
                    c_info.e_core_affinity_mask |= (std::bitset<GGML_MAX_N_THREADS>(mask) << baseIndex);
                    c_info.num_e_cores += n_bits;
                } else {
                    c_info.p_core_affinity_mask |= (std::bitset<GGML_MAX_N_THREADS>(mask) << baseIndex);
                    c_info.num_p_cores += n_bits;
                }
            }
        }
        buffer_size -= info->Size;
        info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(reinterpret_cast<char *>(info) + info->Size);
    }

    // hybrid only when both core classes are present
    if (c_info.num_p_cores > 0 && c_info.num_e_cores > 0) {
        c_info.is_hybrid = true;
    }

    return 1;
}
166+
167+
#endif
168+
169+
170+
100171
int32_t cpu_get_num_physical_cores() {
101172
#ifdef __linux__
102173
// enumerate the set of thread siblings, num entries is num cores
@@ -131,29 +202,12 @@ int32_t cpu_get_num_physical_cores() {
131202
unsigned int n_threads_win = std::thread::hardware_concurrency();
132203
unsigned int default_threads = n_threads_win > 0 ? (n_threads_win <= 4 ? n_threads_win : n_threads_win / 2) : 4;
133204

134-
DWORD buffer_size = 0;
135-
if (!GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &buffer_size)) {
136-
if (GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
137-
return default_threads;
138-
}
139-
}
140-
141-
std::vector<char> buffer(buffer_size);
142-
if (!GetLogicalProcessorInformationEx(RelationProcessorCore, reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data()), &buffer_size)) {
205+
cpu_info info;
206+
if(!get_cpu_info(info))
143207
return default_threads;
144-
}
208+
else
209+
return info.num_physical_cores > 0 ? info.num_physical_cores : default_threads;
145210

146-
int32_t num_physical_cores = 0;
147-
PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data());
148-
while (buffer_size > 0) {
149-
if (info->Relationship == RelationProcessorCore) {
150-
num_physical_cores += info->Processor.GroupCount;
151-
}
152-
buffer_size -= info->Size;
153-
info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(reinterpret_cast<char*>(info) + info->Size);
154-
}
155-
156-
return num_physical_cores > 0 ? num_physical_cores : default_threads;
157211
#endif
158212
unsigned int n_threads = std::thread::hardware_concurrency();
159213
return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
@@ -291,12 +345,36 @@ bool set_process_priority(enum ggml_sched_priority prio) {
291345
void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model) {
292346
int32_t n_set = 0;
293347

294-
if (cpuparams.n_threads < 0) {
348+
LOG_INF("n_threads: %d, cpu pnp strategy: %d\n", cpuparams.n_threads, cpuparams.cpu_pnp_strategy);
349+
350+
if (cpuparams.n_threads < 0 || cpuparams.cpu_pnp_strategy > 0) {
295351
// Assuming everything about cpuparams is invalid
296-
if (role_model != nullptr) {
352+
if (role_model != nullptr && cpuparams.cpu_pnp_strategy == 0) {
297353
cpuparams = *role_model;
298354
} else {
299-
cpuparams.n_threads = cpu_get_num_math();
355+
if(cpuparams.n_threads < 0)
356+
cpuparams.n_threads = cpu_get_num_math();
357+
358+
#if defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
359+
360+
if(cpuparams.cpu_pnp_strategy == GGML_CPU_PNP_STRATEGY_EFFICIENCY) {
361+
cpu_info info;
362+
if(get_cpu_info(info)){
363+
print_cpu_info(info);
364+
if(info.is_hybrid){
365+
LOG_INF("hybrid platform detected: applying strategy\n");
366+
if (cpuparams.n_threads > info.num_e_cores) {
367+
LOG_INF("overriding num threads: %d to num efficient cores %d\n", cpuparams.n_threads, info.num_e_cores);
368+
cpuparams.n_threads = info.num_e_cores;
369+
}
370+
for (int32_t i = 0; i < GGML_MAX_N_THREADS; i++) {
371+
cpuparams.cpumask[i] = info.e_core_affinity_mask[i];
372+
}
373+
cpuparams.mask_valid = true;
374+
}
375+
}
376+
}
377+
#endif
300378
}
301379
}
302380

common/common.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
#include <string>
88
#include <vector>
99
#include <sstream>
10+
#include <bitset>
1011

1112
#ifdef _WIN32
1213
#define DIRECTORY_SEPARATOR '\\'
@@ -45,11 +46,22 @@ struct common_control_vector_load_info;
4546
// CPU utils
4647
//
4748

49+
struct cpu_info {
50+
bool is_hybrid = false;
51+
int num_logical_cores = 0;
52+
int num_physical_cores = 0;
53+
int num_p_cores = 0;
54+
int num_e_cores = 0;
55+
std::bitset<GGML_MAX_N_THREADS> e_core_affinity_mask;
56+
std::bitset<GGML_MAX_N_THREADS> p_core_affinity_mask;
57+
};
58+
4859
struct cpu_params {
4960
int n_threads = -1;
5061
bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
5162
bool mask_valid = false; // Default: any CPU
5263
enum ggml_sched_priority priority = GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
64+
enum ggml_cpu_pnp_strategy cpu_pnp_strategy = GGML_CPU_PNP_STRATEGY_DISABLED; // CPU power and performance strategy
5365
bool strict_cpu = false; // Use strict CPU placement
5466
uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
5567
};

ggml/include/ggml.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2169,6 +2169,11 @@ extern "C" {
21692169
GGML_SCHED_PRIO_REALTIME
21702170
};
21712171

2172+
// CPU power-and-performance (PnP) strategy
enum ggml_cpu_pnp_strategy {
    GGML_CPU_PNP_STRATEGY_DISABLED   = 0, // no strategy applied
    GGML_CPU_PNP_STRATEGY_EFFICIENCY = 1  // prefer efficient (E) cores
};
2176+
21722177
// threadpool params
21732178
// Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
21742179
struct ggml_threadpool_params {

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1318,10 +1318,11 @@ struct ggml_threadpool {
13181318
struct ggml_compute_state {
13191319
#ifndef GGML_USE_OPENMP
13201320
ggml_thread_t thrd;
1321-
bool cpumask[GGML_MAX_N_THREADS];
13221321
int last_graph;
13231322
bool pending;
13241323
#endif
1324+
bool cpumask[GGML_MAX_N_THREADS];
1325+
bool mask_valid;
13251326
struct ggml_threadpool * threadpool;
13261327
int ith;
13271328
};
@@ -14044,10 +14045,20 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
1404414045
const size_t workers_size = sizeof(struct ggml_compute_state) * tpp->n_threads;
1404514046
struct ggml_compute_state * workers = ggml_aligned_malloc(workers_size);
1404614047

14048+
// Check if cpu mask is valid
14049+
bool cpumask_valid = false;
14050+
for (int i = 0; i < GGML_MAX_N_THREADS; i++) {
14051+
if (tpp->cpumask[i]) {
14052+
cpumask_valid = true;
14053+
break;
14054+
}
14055+
}
14056+
1404714057
memset(workers, 0, workers_size);
1404814058
for (int j = 0; j < tpp->n_threads; j++) {
1404914059
workers[j].threadpool = threadpool;
1405014060
workers[j].ith = j;
14061+
workers[j].mask_valid = cpumask_valid; // set mask_valid for worker threads use affinity
1405114062
}
1405214063

1405314064
threadpool->workers = workers;
@@ -14079,6 +14090,12 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
1407914090
}
1408014091
#endif // GGML_USE_OPENMP
1408114092

14093+
int32_t cpumask_iter = 0;
14094+
for (int j = 1; j < tpp->n_threads; j++) {
14095+
ggml_thread_cpumask_next(tpp->cpumask, workers[j].cpumask, tpp->strict_cpu, &cpumask_iter);
14096+
}
14097+
ggml_thread_cpumask_next(tpp->cpumask, workers[0].cpumask, tpp->strict_cpu, &cpumask_iter);
14098+
1408214099
return threadpool;
1408314100
}
1408414101

@@ -14125,10 +14142,19 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
1412514142
atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
1412614143
}
1412714144

14145+
// If mask is valid for worker thread apply affinity
14146+
if(&threadpool->workers[omp_get_thread_num()].mask_valid)
14147+
ggml_thread_apply_affinity(&threadpool->workers[omp_get_thread_num()].cpumask);
14148+
1412814149
ggml_graph_compute_thread(&threadpool->workers[omp_get_thread_num()]);
1412914150
}
1413014151
} else {
1413114152
atomic_store_explicit(&threadpool->n_threads_cur, 1, memory_order_relaxed);
14153+
14154+
// If mask is valid for main thread apply affinity
14155+
if(&threadpool->workers[omp_get_thread_num()].mask_valid)
14156+
ggml_thread_apply_affinity(&threadpool->workers[omp_get_thread_num()].cpumask);
14157+
1413214158
ggml_graph_compute_thread(&threadpool->workers[0]);
1413314159
}
1413414160
#else

0 commit comments

Comments
 (0)