Skip to content

Commit ab407a1

Browse files
committed
Merge tag 'clocksource.2023.02.06b' of git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu into timers/core
Pull clocksource watchdog changes from Paul McKenney:

- Improvements to clocksource-watchdog console messages.

- Loosening of the clocksource-watchdog skew criteria to match those of NTP (500 parts per million, relaxed from 400 parts per million). If it is good enough for NTP, it is good enough for the clocksource watchdog.

- Suspend clocksource-watchdog checking temporarily when high memory latencies are detected. This avoids the false-positive clock-skew events that have been seen on production systems running memory-intensive workloads.

- On systems where the TSC is deemed trustworthy, use it as the watchdog timesource, but only when specifically requested using the tsc=watchdog kernel boot parameter. This permits clock-skew events to be detected, but avoids forcing workloads to use the slow HPET and ACPI PM timers. These last two timers are slow enough to cause systems to be needlessly marked bad on the one hand, and real skew does sometimes happen on production systems running production workloads on the other. And sometimes it is the fault of the TSC, or at least of the firmware that told the kernel to program the TSC with the wrong frequency.

- Add a tsc=recalibrate kernel boot parameter to allow the kernel to diagnose cases where the TSC hardware works fine, but was told by firmware to tick at the wrong frequency. Such cases are rare, but they really have happened on production systems.

Link: https://lore.kernel.org/r/20230210193640.GA3325193@paulmck-ThinkPad-P17-Gen-1
2 parents 7b0f95f + 0051293 commit ab407a1

File tree

7 files changed

+123
-29
lines changed

7 files changed

+123
-29
lines changed

Documentation/admin-guide/kernel-parameters.txt

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6369,6 +6369,16 @@
63696369
in situations with strict latency requirements (where
63706370
interruptions from clocksource watchdog are not
63716371
acceptable).
6372+
[x86] recalibrate: force recalibration against a HW timer
6373+
(HPET or PM timer) on systems whose TSC frequency was
6374+
obtained from HW or FW using either an MSR or CPUID(0x15).
6375+
Warn if the difference is more than 500 ppm.
6376+
[x86] watchdog: Use TSC as the watchdog clocksource with
6377+
which to check other HW timers (HPET or PM timer), but
6378+
only on systems where TSC has been deemed trustworthy.
6379+
This will be suppressed by an earlier tsc=nowatchdog and
6380+
can be overridden by a later tsc=nowatchdog. A console
6381+
message will flag any such suppression or overriding.
63726382

63736383
tsc_early_khz= [X86] Skip early TSC calibration and use the given
63746384
value instead. Useful when the early TSC frequency discovery

arch/x86/include/asm/time.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
extern void hpet_time_init(void);
99
extern void time_init(void);
1010
extern bool pit_timer_init(void);
11+
extern bool tsc_clocksource_watchdog_disabled(void);
1112

1213
extern struct clock_event_device *global_clock_event;
1314

arch/x86/kernel/hpet.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1091,6 +1091,8 @@ int __init hpet_enable(void)
10911091
if (!hpet_counting())
10921092
goto out_nohpet;
10931093

1094+
if (tsc_clocksource_watchdog_disabled())
1095+
clocksource_hpet.flags |= CLOCK_SOURCE_MUST_VERIFY;
10941096
clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq);
10951097

10961098
if (id & HPET_ID_LEGSUP) {

arch/x86/kernel/tsc.c

Lines changed: 50 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,8 @@ static DEFINE_STATIC_KEY_FALSE(__use_tsc);
4848

4949
int tsc_clocksource_reliable;
5050

51+
static int __read_mostly tsc_force_recalibrate;
52+
5153
static u32 art_to_tsc_numerator;
5254
static u32 art_to_tsc_denominator;
5355
static u64 art_to_tsc_offset;
@@ -292,6 +294,7 @@ __setup("notsc", notsc_setup);
292294

293295
static int no_sched_irq_time;
294296
static int no_tsc_watchdog;
297+
static int tsc_as_watchdog;
295298

296299
static int __init tsc_setup(char *str)
297300
{
@@ -301,8 +304,22 @@ static int __init tsc_setup(char *str)
301304
no_sched_irq_time = 1;
302305
if (!strcmp(str, "unstable"))
303306
mark_tsc_unstable("boot parameter");
304-
if (!strcmp(str, "nowatchdog"))
307+
if (!strcmp(str, "nowatchdog")) {
305308
no_tsc_watchdog = 1;
309+
if (tsc_as_watchdog)
310+
pr_alert("%s: Overriding earlier tsc=watchdog with tsc=nowatchdog\n",
311+
__func__);
312+
tsc_as_watchdog = 0;
313+
}
314+
if (!strcmp(str, "recalibrate"))
315+
tsc_force_recalibrate = 1;
316+
if (!strcmp(str, "watchdog")) {
317+
if (no_tsc_watchdog)
318+
pr_alert("%s: tsc=watchdog overridden by earlier tsc=nowatchdog\n",
319+
__func__);
320+
else
321+
tsc_as_watchdog = 1;
322+
}
306323
return 1;
307324
}
308325

@@ -1186,6 +1203,12 @@ static void __init tsc_disable_clocksource_watchdog(void)
11861203
clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
11871204
}
11881205

1206+
bool tsc_clocksource_watchdog_disabled(void)
1207+
{
1208+
return !(clocksource_tsc.flags & CLOCK_SOURCE_MUST_VERIFY) &&
1209+
tsc_as_watchdog && !no_tsc_watchdog;
1210+
}
1211+
11891212
static void __init check_system_tsc_reliable(void)
11901213
{
11911214
#if defined(CONFIG_MGEODEGX1) || defined(CONFIG_MGEODE_LX) || defined(CONFIG_X86_GENERIC)
@@ -1374,6 +1397,25 @@ static void tsc_refine_calibration_work(struct work_struct *work)
13741397
else
13751398
freq = calc_pmtimer_ref(delta, ref_start, ref_stop);
13761399

1400+
/* Will hit this only if tsc_force_recalibrate has been set */
1401+
if (boot_cpu_has(X86_FEATURE_TSC_KNOWN_FREQ)) {
1402+
1403+
/* Warn if the deviation exceeds 500 ppm */
1404+
if (abs(tsc_khz - freq) > (tsc_khz >> 11)) {
1405+
pr_warn("Warning: TSC freq calibrated by CPUID/MSR differs from what is calibrated by HW timer, please check with vendor!!\n");
1406+
pr_info("Previous calibrated TSC freq:\t %lu.%03lu MHz\n",
1407+
(unsigned long)tsc_khz / 1000,
1408+
(unsigned long)tsc_khz % 1000);
1409+
}
1410+
1411+
pr_info("TSC freq recalibrated by [%s]:\t %lu.%03lu MHz\n",
1412+
hpet ? "HPET" : "PM_TIMER",
1413+
(unsigned long)freq / 1000,
1414+
(unsigned long)freq % 1000);
1415+
1416+
return;
1417+
}
1418+
13771419
/* Make sure we're within 1% */
13781420
if (abs(tsc_khz - freq) > tsc_khz/100)
13791421
goto out;
@@ -1407,8 +1449,10 @@ static int __init init_tsc_clocksource(void)
14071449
if (!boot_cpu_has(X86_FEATURE_TSC) || !tsc_khz)
14081450
return 0;
14091451

1410-
if (tsc_unstable)
1411-
goto unreg;
1452+
if (tsc_unstable) {
1453+
clocksource_unregister(&clocksource_tsc_early);
1454+
return 0;
1455+
}
14121456

14131457
if (boot_cpu_has(X86_FEATURE_NONSTOP_TSC_S3))
14141458
clocksource_tsc.flags |= CLOCK_SOURCE_SUSPEND_NONSTOP;
@@ -1421,9 +1465,10 @@ static int __init init_tsc_clocksource(void)
14211465
if (boot_cpu_has(X86_FEATURE_ART))
14221466
art_related_clocksource = &clocksource_tsc;
14231467
clocksource_register_khz(&clocksource_tsc, tsc_khz);
1424-
unreg:
14251468
clocksource_unregister(&clocksource_tsc_early);
1426-
return 0;
1469+
1470+
if (!tsc_force_recalibrate)
1471+
return 0;
14271472
}
14281473

14291474
schedule_delayed_work(&tsc_irqwork, 0);

drivers/clocksource/acpi_pm.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
#include <linux/pci.h>
2424
#include <linux/delay.h>
2525
#include <asm/io.h>
26+
#include <asm/time.h>
2627

2728
/*
2829
* The I/O port the PMTMR resides at.
@@ -210,8 +211,9 @@ static int __init init_acpi_pm_clocksource(void)
210211
return -ENODEV;
211212
}
212213

213-
return clocksource_register_hz(&clocksource_acpi_pm,
214-
PMTMR_TICKS_PER_SEC);
214+
if (tsc_clocksource_watchdog_disabled())
215+
clocksource_acpi_pm.flags |= CLOCK_SOURCE_MUST_VERIFY;
216+
return clocksource_register_hz(&clocksource_acpi_pm, PMTMR_TICKS_PER_SEC);
215217
}
216218

217219
/* We use fs_initcall because we want the PCI fixups to have run

kernel/time/Kconfig

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -200,10 +200,14 @@ config CLOCKSOURCE_WATCHDOG_MAX_SKEW_US
200200
int "Clocksource watchdog maximum allowable skew (in μs)"
201201
depends on CLOCKSOURCE_WATCHDOG
202202
range 50 1000
203-
default 100
203+
default 125
204204
help
205205
Specify the maximum amount of allowable watchdog skew in
206206
microseconds before reporting the clocksource to be unstable.
207+
The default is based on a half-second clocksource watchdog
208+
interval and NTP's maximum frequency drift of 500 parts
209+
per million. If the clocksource is good enough for NTP,
210+
it is good enough for the clocksource watchdog!
207211

208212
endmenu
209213
endif

kernel/time/clocksource.c

Lines changed: 51 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,11 @@ static char override_name[CS_NAME_LEN];
9595
static int finished_booting;
9696
static u64 suspend_start;
9797

98+
/*
99+
* Interval: 0.5sec.
100+
*/
101+
#define WATCHDOG_INTERVAL (HZ >> 1)
102+
98103
/*
99104
* Threshold: 0.0312s, when doubled: 0.0625s.
100105
* Also a default for cs->uncertainty_margin when registering clocks.
@@ -106,11 +111,14 @@ static u64 suspend_start;
106111
* clocksource surrounding a read of the clocksource being validated.
107112
* This delay could be due to SMIs, NMIs, or to VCPU preemptions. Used as
108113
* a lower bound for cs->uncertainty_margin values when registering clocks.
114+
*
115+
* The default of 500 parts per million is based on NTP's limits.
116+
* If a clocksource is good enough for NTP, it is good enough for us!
109117
*/
110118
#ifdef CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US
111119
#define MAX_SKEW_USEC CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US
112120
#else
113-
#define MAX_SKEW_USEC 100
121+
#define MAX_SKEW_USEC (125 * WATCHDOG_INTERVAL / HZ)
114122
#endif
115123

116124
#define WATCHDOG_MAX_SKEW (MAX_SKEW_USEC * NSEC_PER_USEC)
@@ -140,11 +148,6 @@ static inline void clocksource_watchdog_unlock(unsigned long *flags)
140148
static int clocksource_watchdog_kthread(void *data);
141149
static void __clocksource_change_rating(struct clocksource *cs, int rating);
142150

143-
/*
144-
* Interval: 0.5sec.
145-
*/
146-
#define WATCHDOG_INTERVAL (HZ >> 1)
147-
148151
static void clocksource_watchdog_work(struct work_struct *work)
149152
{
150153
/*
@@ -257,8 +260,8 @@ static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow,
257260
goto skip_test;
258261
}
259262

260-
pr_warn("timekeeping watchdog on CPU%d: %s read-back delay of %lldns, attempt %d, marking unstable\n",
261-
smp_processor_id(), watchdog->name, wd_delay, nretries);
263+
pr_warn("timekeeping watchdog on CPU%d: wd-%s-wd excessive read-back delay of %lldns vs. limit of %ldns, wd-wd read-back delay only %lldns, attempt %d, marking %s unstable\n",
264+
smp_processor_id(), cs->name, wd_delay, WATCHDOG_MAX_SKEW, wd_seq_delay, nretries, cs->name);
262265
return WD_READ_UNSTABLE;
263266

264267
skip_test:
@@ -384,13 +387,23 @@ void clocksource_verify_percpu(struct clocksource *cs)
384387
}
385388
EXPORT_SYMBOL_GPL(clocksource_verify_percpu);
386389

390+
static inline void clocksource_reset_watchdog(void)
391+
{
392+
struct clocksource *cs;
393+
394+
list_for_each_entry(cs, &watchdog_list, wd_list)
395+
cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
396+
}
397+
398+
387399
static void clocksource_watchdog(struct timer_list *unused)
388400
{
389401
u64 csnow, wdnow, cslast, wdlast, delta;
390402
int next_cpu, reset_pending;
391403
int64_t wd_nsec, cs_nsec;
392404
struct clocksource *cs;
393405
enum wd_read_status read_ret;
406+
unsigned long extra_wait = 0;
394407
u32 md;
395408

396409
spin_lock(&watchdog_lock);
@@ -410,13 +423,30 @@ static void clocksource_watchdog(struct timer_list *unused)
410423

411424
read_ret = cs_watchdog_read(cs, &csnow, &wdnow);
412425

413-
if (read_ret != WD_READ_SUCCESS) {
414-
if (read_ret == WD_READ_UNSTABLE)
415-
/* Clock readout unreliable, so give it up. */
416-
__clocksource_unstable(cs);
426+
if (read_ret == WD_READ_UNSTABLE) {
427+
/* Clock readout unreliable, so give it up. */
428+
__clocksource_unstable(cs);
417429
continue;
418430
}
419431

432+
/*
433+
* When WD_READ_SKIP is returned, it means the system is likely
434+
* under very heavy load, where the latency of reading
435+
* watchdog/clocksource is very big, and affect the accuracy of
436+
* watchdog check. So give system some space and suspend the
437+
* watchdog check for 5 minutes.
438+
*/
439+
if (read_ret == WD_READ_SKIP) {
440+
/*
441+
* As the watchdog timer will be suspended, and
442+
* cs->last could keep unchanged for 5 minutes, reset
443+
* the counters.
444+
*/
445+
clocksource_reset_watchdog();
446+
extra_wait = HZ * 300;
447+
break;
448+
}
449+
420450
/* Clocksource initialized ? */
421451
if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) ||
422452
atomic_read(&watchdog_reset_pending)) {
@@ -443,12 +473,20 @@ static void clocksource_watchdog(struct timer_list *unused)
443473
/* Check the deviation from the watchdog clocksource. */
444474
md = cs->uncertainty_margin + watchdog->uncertainty_margin;
445475
if (abs(cs_nsec - wd_nsec) > md) {
476+
u64 cs_wd_msec;
477+
u64 wd_msec;
478+
u32 wd_rem;
479+
446480
pr_warn("timekeeping watchdog on CPU%d: Marking clocksource '%s' as unstable because the skew is too large:\n",
447481
smp_processor_id(), cs->name);
448482
pr_warn(" '%s' wd_nsec: %lld wd_now: %llx wd_last: %llx mask: %llx\n",
449483
watchdog->name, wd_nsec, wdnow, wdlast, watchdog->mask);
450484
pr_warn(" '%s' cs_nsec: %lld cs_now: %llx cs_last: %llx mask: %llx\n",
451485
cs->name, cs_nsec, csnow, cslast, cs->mask);
486+
cs_wd_msec = div_u64_rem(cs_nsec - wd_nsec, 1000U * 1000U, &wd_rem);
487+
wd_msec = div_u64_rem(wd_nsec, 1000U * 1000U, &wd_rem);
488+
pr_warn(" Clocksource '%s' skewed %lld ns (%lld ms) over watchdog '%s' interval of %lld ns (%lld ms)\n",
489+
cs->name, cs_nsec - wd_nsec, cs_wd_msec, watchdog->name, wd_nsec, wd_msec);
452490
if (curr_clocksource == cs)
453491
pr_warn(" '%s' is current clocksource.\n", cs->name);
454492
else if (curr_clocksource)
@@ -512,7 +550,7 @@ static void clocksource_watchdog(struct timer_list *unused)
512550
* pair clocksource_stop_watchdog() clocksource_start_watchdog().
513551
*/
514552
if (!timer_pending(&watchdog_timer)) {
515-
watchdog_timer.expires += WATCHDOG_INTERVAL;
553+
watchdog_timer.expires += WATCHDOG_INTERVAL + extra_wait;
516554
add_timer_on(&watchdog_timer, next_cpu);
517555
}
518556
out:
@@ -537,14 +575,6 @@ static inline void clocksource_stop_watchdog(void)
537575
watchdog_running = 0;
538576
}
539577

540-
static inline void clocksource_reset_watchdog(void)
541-
{
542-
struct clocksource *cs;
543-
544-
list_for_each_entry(cs, &watchdog_list, wd_list)
545-
cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
546-
}
547-
548578
static void clocksource_resume_watchdog(void)
549579
{
550580
atomic_inc(&watchdog_reset_pending);

0 commit comments

Comments
 (0)