Skip to content

Commit c86ff8c

Browse files
Waiman-Long authored and paulmckrcu committed
clocksource: Avoid accidental unstable marking of clocksources
Since commit db3a34e ("clocksource: Retry clock read if long delays detected") and commit 2e27e79 ("clocksource: Reduce clocksource-skew threshold"), it is found that tsc clocksource fallback to hpet can sometimes happen on both Intel and AMD systems especially when they are running stressful benchmarking workloads. Of the 23 systems tested with a v5.14 kernel, 10 of them have switched to hpet clock source during the test run. The result of falling back to hpet is a drastic reduction of performance when running benchmarks. For example, the fio performance tests can drop up to 70% whereas the iperf3 performance can drop up to 80%. 4 hpet fallbacks happened during bootup. They were: [ 8.749399] clocksource: timekeeping watchdog on CPU13: hpet read-back delay of 263750ns, attempt 4, marking unstable [ 12.044610] clocksource: timekeeping watchdog on CPU19: hpet read-back delay of 186166ns, attempt 4, marking unstable [ 17.336941] clocksource: timekeeping watchdog on CPU28: hpet read-back delay of 182291ns, attempt 4, marking unstable [ 17.518565] clocksource: timekeeping watchdog on CPU34: hpet read-back delay of 252196ns, attempt 4, marking unstable Other fallbacks happen when the systems were running stressful benchmarks. For example: [ 2685.867873] clocksource: timekeeping watchdog on CPU117: hpet read-back delay of 57269ns, attempt 4, marking unstable [46215.471228] clocksource: timekeeping watchdog on CPU8: hpet read-back delay of 61460ns, attempt 4, marking unstable Commit 2e27e79 ("clocksource: Reduce clocksource-skew threshold"), changed the skew margin from 100us to 50us. I think this is too small and can easily be exceeded when running some stressful workloads on a thermally stressed system. So it is switched back to 100us. Even a maximum skew margin of 100us may be too small in for some systems when booting up especially if those systems are under thermal stress. 
To eliminate the case that the large skew is due to the system being too busy slowing down the reading of both the watchdog and the clocksource, an extra consecutive read of the watchdog clock is being done to check this. The consecutive watchdog read delay is compared against WATCHDOG_MAX_SKEW/2. If the delay exceeds the limit, we assume that the system is just too busy. A warning will be printed to the console and the clock skew check is skipped for this round. Fixes: db3a34e ("clocksource: Retry clock read if long delays detected") Fixes: 2e27e79 ("clocksource: Reduce clocksource-skew threshold") Signed-off-by: Waiman Long <[email protected]> Signed-off-by: Paul E. McKenney <[email protected]>
1 parent 8c0abfd commit c86ff8c

File tree

1 file changed

+41
-9
lines changed

1 file changed

+41
-9
lines changed

kernel/time/clocksource.c

Lines changed: 41 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ static u64 suspend_start;
107107
* This delay could be due to SMIs, NMIs, or to VCPU preemptions. Used as
108108
* a lower bound for cs->uncertainty_margin values when registering clocks.
109109
*/
110-
#define WATCHDOG_MAX_SKEW (50 * NSEC_PER_USEC)
110+
#define WATCHDOG_MAX_SKEW (100 * NSEC_PER_USEC)
111111

112112
#ifdef CONFIG_CLOCKSOURCE_WATCHDOG
113113
static void clocksource_watchdog_work(struct work_struct *work);
@@ -205,17 +205,24 @@ EXPORT_SYMBOL_GPL(max_cswd_read_retries);
205205
static int verify_n_cpus = 8;
206206
module_param(verify_n_cpus, int, 0644);
207207

208-
static bool cs_watchdog_read(struct clocksource *cs, u64 *csnow, u64 *wdnow)
208+
enum wd_read_status {
209+
WD_READ_SUCCESS,
210+
WD_READ_UNSTABLE,
211+
WD_READ_SKIP
212+
};
213+
214+
static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow, u64 *wdnow)
209215
{
210216
unsigned int nretries;
211-
u64 wd_end, wd_delta;
212-
int64_t wd_delay;
217+
u64 wd_end, wd_end2, wd_delta;
218+
int64_t wd_delay, wd_seq_delay;
213219

214220
for (nretries = 0; nretries <= max_cswd_read_retries; nretries++) {
215221
local_irq_disable();
216222
*wdnow = watchdog->read(watchdog);
217223
*csnow = cs->read(cs);
218224
wd_end = watchdog->read(watchdog);
225+
wd_end2 = watchdog->read(watchdog);
219226
local_irq_enable();
220227

221228
wd_delta = clocksource_delta(wd_end, *wdnow, watchdog->mask);
@@ -226,13 +233,34 @@ static bool cs_watchdog_read(struct clocksource *cs, u64 *csnow, u64 *wdnow)
226233
pr_warn("timekeeping watchdog on CPU%d: %s retried %d times before success\n",
227234
smp_processor_id(), watchdog->name, nretries);
228235
}
229-
return true;
236+
return WD_READ_SUCCESS;
230237
}
238+
239+
/*
240+
* Now compute delay in consecutive watchdog read to see if
241+
* there is too much external interferences that cause
242+
* significant delay in reading both clocksource and watchdog.
243+
*
244+
* If consecutive WD read-back delay > WATCHDOG_MAX_SKEW/2,
245+
* report system busy, reinit the watchdog and skip the current
246+
* watchdog test.
247+
*/
248+
wd_delta = clocksource_delta(wd_end2, wd_end, watchdog->mask);
249+
wd_seq_delay = clocksource_cyc2ns(wd_delta, watchdog->mult, watchdog->shift);
250+
if (wd_seq_delay > WATCHDOG_MAX_SKEW/2)
251+
goto skip_test;
231252
}
232253

233254
pr_warn("timekeeping watchdog on CPU%d: %s read-back delay of %lldns, attempt %d, marking unstable\n",
234255
smp_processor_id(), watchdog->name, wd_delay, nretries);
235-
return false;
256+
return WD_READ_UNSTABLE;
257+
258+
skip_test:
259+
pr_info("timekeeping watchdog on CPU%d: %s wd-wd read-back delay of %lldns\n",
260+
smp_processor_id(), watchdog->name, wd_seq_delay);
261+
pr_info("wd-%s-wd read-back delay of %lldns, clock-skew test skipped!\n",
262+
cs->name, wd_delay);
263+
return WD_READ_SKIP;
236264
}
237265

238266
static u64 csnow_mid;
@@ -356,6 +384,7 @@ static void clocksource_watchdog(struct timer_list *unused)
356384
int next_cpu, reset_pending;
357385
int64_t wd_nsec, cs_nsec;
358386
struct clocksource *cs;
387+
enum wd_read_status read_ret;
359388
u32 md;
360389

361390
spin_lock(&watchdog_lock);
@@ -373,9 +402,12 @@ static void clocksource_watchdog(struct timer_list *unused)
373402
continue;
374403
}
375404

376-
if (!cs_watchdog_read(cs, &csnow, &wdnow)) {
377-
/* Clock readout unreliable, so give it up. */
378-
__clocksource_unstable(cs);
405+
read_ret = cs_watchdog_read(cs, &csnow, &wdnow);
406+
407+
if (read_ret != WD_READ_SUCCESS) {
408+
if (read_ret == WD_READ_UNSTABLE)
409+
/* Clock readout unreliable, so give it up. */
410+
__clocksource_unstable(cs);
379411
continue;
380412
}
381413

0 commit comments

Comments
 (0)