Skip to content

Commit 14ee7df

Browse files
committed
Merge branch 'pm-cpuidle'
Merge updates of the teo cpuidle governor for 6.14-rc1 that clean it up and make the handling of short idle intervals in it consistent regardless of the properties of idle states supplied by the cpuidle driver. * pm-cpuidle: cpuidle: teo: Skip sleep length computation for low latency constraints cpuidle: teo: Replace time_span_ns with a flag cpuidle: teo: Simplify handling of total events count cpuidle: teo: Skip getting the sleep length if wakeups are very frequent cpuidle: teo: Simplify counting events used for tick management cpuidle: teo: Clarify two code comments cpuidle: teo: Drop local variable prev_intercept_idx cpuidle: teo: Combine candidate state index checks against 0 cpuidle: teo: Reorder candidate state index checks cpuidle: teo: Rearrange idle state lookup code
2 parents b865a84 + 59484c3 commit 14ee7df

File tree

1 file changed

+96
-101
lines changed
  • drivers/cpuidle/governors

1 file changed

+96
-101
lines changed

drivers/cpuidle/governors/teo.c

Lines changed: 96 additions & 101 deletions
Original file line numberDiff line numberDiff line change
@@ -41,11 +41,7 @@
4141
* idle state 2, the third bin spans from the target residency of idle state 2
4242
* up to, but not including, the target residency of idle state 3 and so on.
4343
* The last bin spans from the target residency of the deepest idle state
44-
* supplied by the driver to the scheduler tick period length or to infinity if
45-
* the tick period length is less than the target residency of that state. In
46-
* the latter case, the governor also counts events with the measured idle
47-
* duration between the tick period length and the target residency of the
48-
* deepest idle state.
44+
* supplied by the driver to infinity.
4945
*
5046
* Two metrics called "hits" and "intercepts" are associated with each bin.
5147
* They are updated every time before selecting an idle state for the given CPU
@@ -60,6 +56,10 @@
6056
* into by the sleep length (these events are also referred to as "intercepts"
6157
* below).
6258
*
59+
* The governor also counts "intercepts" with the measured idle duration below
60+
* the tick period length and uses this information when deciding whether or not
61+
* to stop the scheduler tick.
62+
*
6363
* In order to select an idle state for a CPU, the governor takes the following
6464
* steps (modulo the possible latency constraint that must be taken into account
6565
* too):
@@ -105,6 +105,12 @@
105105

106106
#include "gov.h"
107107

108+
/*
109+
* Idle state exit latency threshold used for deciding whether or not to check
110+
* the time till the closest expected timer event.
111+
*/
112+
#define LATENCY_THRESHOLD_NS (RESIDENCY_THRESHOLD_NS / 2)
113+
108114
/*
109115
* The PULSE value is added to metrics when they grow and the DECAY_SHIFT value
110116
* is used for decreasing metrics on a regular basis.
@@ -124,18 +130,20 @@ struct teo_bin {
124130

125131
/**
126132
* struct teo_cpu - CPU data used by the TEO cpuidle governor.
127-
* @time_span_ns: Time between idle state selection and post-wakeup update.
128133
* @sleep_length_ns: Time till the closest timer event (at the selection time).
129134
* @state_bins: Idle state data bins for this CPU.
130135
* @total: Grand total of the "intercepts" and "hits" metrics for all bins.
131-
* @tick_hits: Number of "hits" after TICK_NSEC.
136+
* @tick_intercepts: "Intercepts" before TICK_NSEC.
137+
* @short_idles: Wakeups after short idle periods.
138+
* @artificial_wakeup: Set if the wakeup has been triggered by a safety net.
132139
*/
133140
struct teo_cpu {
134-
s64 time_span_ns;
135141
s64 sleep_length_ns;
136142
struct teo_bin state_bins[CPUIDLE_STATE_MAX];
137143
unsigned int total;
138-
unsigned int tick_hits;
144+
unsigned int tick_intercepts;
145+
unsigned int short_idles;
146+
bool artificial_wakeup;
139147
};
140148

141149
static DEFINE_PER_CPU(struct teo_cpu, teo_cpus);
@@ -152,38 +160,34 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
152160
s64 target_residency_ns;
153161
u64 measured_ns;
154162

155-
if (cpu_data->time_span_ns >= cpu_data->sleep_length_ns) {
163+
cpu_data->short_idles -= cpu_data->short_idles >> DECAY_SHIFT;
164+
165+
if (cpu_data->artificial_wakeup) {
156166
/*
157-
* One of the safety nets has triggered or the wakeup was close
158-
* enough to the closest timer event expected at the idle state
159-
* selection time to be discarded.
167+
* If one of the safety nets has triggered, assume that this
168+
* might have been a long sleep.
160169
*/
161170
measured_ns = U64_MAX;
162171
} else {
163172
u64 lat_ns = drv->states[dev->last_state_idx].exit_latency_ns;
164173

165-
/*
166-
* The computations below are to determine whether or not the
167-
* (saved) time till the next timer event and the measured idle
168-
* duration fall into the same "bin", so use last_residency_ns
169-
* for that instead of time_span_ns which includes the cpuidle
170-
* overhead.
171-
*/
172174
measured_ns = dev->last_residency_ns;
173175
/*
174176
* The delay between the wakeup and the first instruction
175177
* executed by the CPU is not likely to be worst-case every
176178
* time, so take 1/2 of the exit latency as a very rough
177179
* approximation of the average of it.
178180
*/
179-
if (measured_ns >= lat_ns)
181+
if (measured_ns >= lat_ns) {
180182
measured_ns -= lat_ns / 2;
181-
else
183+
if (measured_ns < RESIDENCY_THRESHOLD_NS)
184+
cpu_data->short_idles += PULSE;
185+
} else {
182186
measured_ns /= 2;
187+
cpu_data->short_idles += PULSE;
188+
}
183189
}
184190

185-
cpu_data->total = 0;
186-
187191
/*
188192
* Decay the "hits" and "intercepts" metrics for all of the bins and
189193
* find the bins that the sleep length and the measured idle duration
@@ -195,8 +199,6 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
195199
bin->hits -= bin->hits >> DECAY_SHIFT;
196200
bin->intercepts -= bin->intercepts >> DECAY_SHIFT;
197201

198-
cpu_data->total += bin->hits + bin->intercepts;
199-
200202
target_residency_ns = drv->states[i].target_residency_ns;
201203

202204
if (target_residency_ns <= cpu_data->sleep_length_ns) {
@@ -206,38 +208,22 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
206208
}
207209
}
208210

209-
/*
210-
* If the deepest state's target residency is below the tick length,
211-
* make a record of it to help teo_select() decide whether or not
212-
* to stop the tick. This effectively adds an extra hits-only bin
213-
* beyond the last state-related one.
214-
*/
215-
if (target_residency_ns < TICK_NSEC) {
216-
cpu_data->tick_hits -= cpu_data->tick_hits >> DECAY_SHIFT;
217-
218-
cpu_data->total += cpu_data->tick_hits;
219-
220-
if (TICK_NSEC <= cpu_data->sleep_length_ns) {
221-
idx_timer = drv->state_count;
222-
if (TICK_NSEC <= measured_ns) {
223-
cpu_data->tick_hits += PULSE;
224-
goto end;
225-
}
226-
}
227-
}
228-
211+
cpu_data->tick_intercepts -= cpu_data->tick_intercepts >> DECAY_SHIFT;
229212
/*
230213
* If the measured idle duration falls into the same bin as the sleep
231214
* length, this is a "hit", so update the "hits" metric for that bin.
232215
* Otherwise, update the "intercepts" metric for the bin fallen into by
233216
* the measured idle duration.
234217
*/
235-
if (idx_timer == idx_duration)
218+
if (idx_timer == idx_duration) {
236219
cpu_data->state_bins[idx_timer].hits += PULSE;
237-
else
220+
} else {
238221
cpu_data->state_bins[idx_duration].intercepts += PULSE;
222+
if (TICK_NSEC <= measured_ns)
223+
cpu_data->tick_intercepts += PULSE;
224+
}
239225

240-
end:
226+
cpu_data->total -= cpu_data->total >> DECAY_SHIFT;
241227
cpu_data->total += PULSE;
242228
}
243229

@@ -285,14 +271,12 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
285271
struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);
286272
s64 latency_req = cpuidle_governor_latency_req(dev->cpu);
287273
ktime_t delta_tick = TICK_NSEC / 2;
288-
unsigned int tick_intercept_sum = 0;
289274
unsigned int idx_intercept_sum = 0;
290275
unsigned int intercept_sum = 0;
291276
unsigned int idx_hit_sum = 0;
292277
unsigned int hit_sum = 0;
293278
int constraint_idx = 0;
294279
int idx0 = 0, idx = -1;
295-
int prev_intercept_idx;
296280
s64 duration_ns;
297281
int i;
298282

@@ -301,10 +285,14 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
301285
dev->last_state_idx = -1;
302286
}
303287

304-
cpu_data->time_span_ns = local_clock();
305288
/*
306-
* Set the expected sleep length to infinity in case of an early
307-
* return.
289+
* Set the sleep length to infinity in case the invocation of
290+
* tick_nohz_get_sleep_length() below is skipped, in which case it won't
291+
* be known whether or not the subsequent wakeup is caused by a timer.
292+
* It is generally fine to count the wakeup as an intercept then, except
293+
* for the cases when the CPU is mostly woken up by timers and there may
294+
* be opportunities to ask for a deeper idle state when no imminent
295+
* timers are scheduled which may be missed.
308296
*/
309297
cpu_data->sleep_length_ns = KTIME_MAX;
310298

@@ -360,17 +348,13 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
360348
goto end;
361349
}
362350

363-
tick_intercept_sum = intercept_sum +
364-
cpu_data->state_bins[drv->state_count-1].intercepts;
365-
366351
/*
367352
* If the sum of the intercepts metric for all of the idle states
368353
* shallower than the current candidate one (idx) is greater than the
369354
* sum of the intercepts and hits metrics for the candidate state and
370-
* all of the deeper states a shallower idle state is likely to be a
355+
* all of the deeper states, a shallower idle state is likely to be a
371356
* better choice.
372357
*/
373-
prev_intercept_idx = idx;
374358
if (2 * idx_intercept_sum > cpu_data->total - idx_hit_sum) {
375359
int first_suitable_idx = idx;
376360

@@ -396,41 +380,38 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
396380
* first enabled state that is deep enough.
397381
*/
398382
if (teo_state_ok(i, drv) &&
399-
!dev->states_usage[i].disable)
383+
!dev->states_usage[i].disable) {
400384
idx = i;
401-
else
402-
idx = first_suitable_idx;
403-
385+
break;
386+
}
387+
idx = first_suitable_idx;
404388
break;
405389
}
406390

407391
if (dev->states_usage[i].disable)
408392
continue;
409393

410-
if (!teo_state_ok(i, drv)) {
394+
if (teo_state_ok(i, drv)) {
411395
/*
412-
* The current state is too shallow, but if an
413-
* alternative candidate state has been found,
414-
* it may still turn out to be a better choice.
396+
* The current state is deep enough, but still
397+
* there may be a better one.
415398
*/
416-
if (first_suitable_idx != idx)
417-
continue;
418-
419-
break;
399+
first_suitable_idx = i;
400+
continue;
420401
}
421402

422-
first_suitable_idx = i;
403+
/*
404+
* The current state is too shallow, so if no suitable
405+
* states other than the initial candidate have been
406+
* found, give up (the remaining states to check are
407+
* shallower still), but otherwise the first suitable
408+
* state other than the initial candidate may turn out
409+
* to be preferable.
410+
*/
411+
if (first_suitable_idx == idx)
412+
break;
423413
}
424414
}
425-
if (!idx && prev_intercept_idx) {
426-
/*
427-
* We have to query the sleep length here otherwise we don't
428-
* know after wakeup if our guess was correct.
429-
*/
430-
duration_ns = tick_nohz_get_sleep_length(&delta_tick);
431-
cpu_data->sleep_length_ns = duration_ns;
432-
goto out_tick;
433-
}
434415

435416
/*
436417
* If there is a latency constraint, it may be necessary to select an
@@ -440,24 +421,39 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
440421
idx = constraint_idx;
441422

442423
/*
443-
* Skip the timers check if state 0 is the current candidate one,
444-
* because an immediate non-timer wakeup is expected in that case.
424+
* If either the candidate state is state 0 or its target residency is
425+
* low enough, there is basically nothing more to do, but if the sleep
426+
* length is not updated, the subsequent wakeup will be counted as an
427+
* "intercept" which may be problematic in the cases when timer wakeups
428+
* are dominant. Namely, it may effectively prevent deeper idle states
429+
* from being selected at one point even if no imminent timers are
430+
* scheduled.
431+
*
432+
* However, frequent timers in the RESIDENCY_THRESHOLD_NS range on one
433+
* CPU are unlikely (user space has a default 50 us slack value for
434+
* hrtimers and there are relatively few timers with a lower deadline
435+
* value in the kernel), and even if they did happen, the potential
436+
* benefit from using a deep idle state in that case would be
437+
* questionable anyway for latency reasons. Thus if the measured idle
438+
* duration falls into that range in the majority of cases, assume
439+
* non-timer wakeups to be dominant and skip updating the sleep length
440+
* to reduce latency.
441+
*
442+
* Also, if the latency constraint is sufficiently low, it will force
443+
* shallow idle states regardless of the wakeup type, so the sleep
444+
* length need not be known in that case.
445445
*/
446-
if (!idx)
447-
goto out_tick;
448-
449-
/*
450-
* If state 0 is a polling one, check if the target residency of
451-
* the current candidate state is low enough and skip the timers
452-
* check in that case too.
453-
*/
454-
if ((drv->states[0].flags & CPUIDLE_FLAG_POLLING) &&
455-
drv->states[idx].target_residency_ns < RESIDENCY_THRESHOLD_NS)
446+
if ((!idx || drv->states[idx].target_residency_ns < RESIDENCY_THRESHOLD_NS) &&
447+
(2 * cpu_data->short_idles >= cpu_data->total ||
448+
latency_req < LATENCY_THRESHOLD_NS))
456449
goto out_tick;
457450

458451
duration_ns = tick_nohz_get_sleep_length(&delta_tick);
459452
cpu_data->sleep_length_ns = duration_ns;
460453

454+
if (!idx)
455+
goto out_tick;
456+
461457
/*
462458
* If the closest expected timer is before the target residency of the
463459
* candidate state, a shallower one needs to be found.
@@ -474,7 +470,7 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
474470
* total wakeup events, do not stop the tick.
475471
*/
476472
if (drv->states[idx].target_residency_ns < TICK_NSEC &&
477-
tick_intercept_sum > cpu_data->total / 2 + cpu_data->total / 8)
473+
cpu_data->tick_intercepts > cpu_data->total / 2 + cpu_data->total / 8)
478474
duration_ns = TICK_NSEC / 2;
479475

480476
end:
@@ -511,17 +507,16 @@ static void teo_reflect(struct cpuidle_device *dev, int state)
511507
struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);
512508

513509
dev->last_state_idx = state;
514-
/*
515-
* If the wakeup was not "natural", but triggered by one of the safety
516-
* nets, assume that the CPU might have been idle for the entire sleep
517-
* length time.
518-
*/
519510
if (dev->poll_time_limit ||
520511
(tick_nohz_idle_got_tick() && cpu_data->sleep_length_ns > TICK_NSEC)) {
512+
/*
513+
* The wakeup was not "genuine", but triggered by one of the
514+
* safety nets.
515+
*/
521516
dev->poll_time_limit = false;
522-
cpu_data->time_span_ns = cpu_data->sleep_length_ns;
517+
cpu_data->artificial_wakeup = true;
523518
} else {
524-
cpu_data->time_span_ns = local_clock() - cpu_data->time_span_ns;
519+
cpu_data->artificial_wakeup = false;
525520
}
526521
}
527522

0 commit comments

Comments
 (0)