Skip to content

Commit aaf05e9

Browse files
MaxKellermannakpm00
authored andcommitted
kernel/watchdog: add /sys/kernel/{hard,soft}lockup_count
Patch series "sysfs: add counters for lockups and stalls", v2. Commits 9db89b4 ("exit: Expose "oops_count" to sysfs") and 8b05aa2 ("panic: Expose "warn_count" to sysfs") added counters for oopses and warnings to sysfs, and these two patches do the same for hard/soft lockups and RCU stalls. All of these counters are useful for monitoring tools to detect whether the machine is healthy. If the kernel has experienced a lockup or a stall, it's probably due to a kernel bug, and I'd like to detect that quickly and easily. There is currently no way to detect that, other than parsing dmesg. Or observing indirect effects: such as certain tasks not responding, but then I need to observe all tasks, and it may take a while until these effects become visible/measurable. I'd rather be able to detect the primary cause more quickly, possibly before everything falls apart. This patch (of 2): There is /proc/sys/kernel/hung_task_detect_count, /sys/kernel/warn_count and /sys/kernel/oops_count but there is no userspace-accessible counter for hard/soft lockups. Having this is useful for monitoring tools. Link: https://lkml.kernel.org/r/[email protected] Link: https://lkml.kernel.org/r/[email protected] Signed-off-by: Max Kellermann <[email protected]> Cc: Corey Minyard <[email protected]> Cc: Doug Anderson <[email protected]> Cc: Joel Granados <[email protected]> Cc: Song Liu <[email protected]> Cc: Kees Cook <[email protected]> Signed-off-by: Andrew Morton <[email protected]>
1 parent cc66e48 commit aaf05e9

File tree

3 files changed

+67
-0
lines changed

3 files changed

+67
-0
lines changed
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
What: /sys/kernel/hardlockup_count
2+
Date: May 2025
3+
KernelVersion: 6.16
4+
Contact: Linux kernel mailing list <[email protected]>
5+
Description:
6+
Shows how many times the system has detected a hard lockup since last boot.
7+
Available only if CONFIG_HARDLOCKUP_DETECTOR is enabled.
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
What: /sys/kernel/softlockup_count
2+
Date: May 2025
3+
KernelVersion: 6.16
4+
Contact: Linux kernel mailing list <[email protected]>
5+
Description:
6+
Shows how many times the system has detected a soft lockup since last boot.
7+
Available only if CONFIG_SOFTLOCKUP_DETECTOR is enabled.

kernel/watchdog.c

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,29 @@ int __read_mostly sysctl_hardlockup_all_cpu_backtrace;
6464
*/
6565
unsigned int __read_mostly hardlockup_panic =
6666
IS_ENABLED(CONFIG_BOOTPARAM_HARDLOCKUP_PANIC);
67+
68+
#ifdef CONFIG_SYSFS

/*
 * Number of hard lockups detected since boot.  Exposed read-only via
 * /sys/kernel/hardlockup_count (see hardlockup_count_show() below).
 * NOTE(review): incremented from NMI context on the detecting CPU with a
 * plain ++; concurrent detections on different CPUs could lose an update.
 * Presumably acceptable for a monitoring counter — confirm intent.
 */
static unsigned int hardlockup_count;

/*
 * sysfs "show" callback for /sys/kernel/hardlockup_count.
 * Returns the number of bytes emitted into @page.
 */
static ssize_t hardlockup_count_show(struct kobject *kobj, struct kobj_attribute *attr,
				     char *page)
{
	return sysfs_emit(page, "%u\n", hardlockup_count);
}

static struct kobj_attribute hardlockup_count_attr = __ATTR_RO(hardlockup_count);

/*
 * Register /sys/kernel/hardlockup_count at late_initcall time.
 * Propagate the registration result instead of discarding it, so a
 * failure is surfaced by the initcall machinery rather than silently
 * leaving the attribute missing.
 */
static __init int kernel_hardlockup_sysfs_init(void)
{
	return sysfs_add_file_to_group(kernel_kobj, &hardlockup_count_attr.attr, NULL);
}

late_initcall(kernel_hardlockup_sysfs_init);

#endif // CONFIG_SYSFS
89+
6790
/*
6891
* We may not want to enable hard lockup detection by default in all cases,
6992
* for example when running the kernel as a guest on a hypervisor. In these
@@ -170,6 +193,10 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs)
170193
unsigned int this_cpu = smp_processor_id();
171194
unsigned long flags;
172195

196+
#ifdef CONFIG_SYSFS
197+
++hardlockup_count;
198+
#endif
199+
173200
/* Only print hardlockups once. */
174201
if (per_cpu(watchdog_hardlockup_warned, cpu))
175202
return;
@@ -312,6 +339,28 @@ unsigned int __read_mostly softlockup_panic =
312339
static bool softlockup_initialized __read_mostly;
313340
static u64 __read_mostly sample_period;
314341

342+
#ifdef CONFIG_SYSFS

/*
 * Number of soft lockups detected since boot.  Exposed read-only via
 * /sys/kernel/softlockup_count (see softlockup_count_show() below).
 * NOTE(review): incremented with a plain ++ from the watchdog hrtimer on
 * the detecting CPU; concurrent detections on different CPUs could lose
 * an update.  Presumably acceptable for a monitoring counter — confirm.
 */
static unsigned int softlockup_count;

/*
 * sysfs "show" callback for /sys/kernel/softlockup_count.
 * Returns the number of bytes emitted into @page.
 */
static ssize_t softlockup_count_show(struct kobject *kobj, struct kobj_attribute *attr,
				     char *page)
{
	return sysfs_emit(page, "%u\n", softlockup_count);
}

static struct kobj_attribute softlockup_count_attr = __ATTR_RO(softlockup_count);

/*
 * Register /sys/kernel/softlockup_count at late_initcall time.
 * Propagate the registration result instead of discarding it, so a
 * failure is surfaced by the initcall machinery rather than silently
 * leaving the attribute missing.
 */
static __init int kernel_softlockup_sysfs_init(void)
{
	return sysfs_add_file_to_group(kernel_kobj, &softlockup_count_attr.attr, NULL);
}

late_initcall(kernel_softlockup_sysfs_init);

#endif // CONFIG_SYSFS
363+
315364
/* Timestamp taken after the last successful reschedule. */
316365
static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
317366
/* Timestamp of the last softlockup report. */
@@ -743,6 +792,10 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
743792
touch_ts = __this_cpu_read(watchdog_touch_ts);
744793
duration = is_softlockup(touch_ts, period_ts, now);
745794
if (unlikely(duration)) {
795+
#ifdef CONFIG_SYSFS
796+
++softlockup_count;
797+
#endif
798+
746799
/*
747800
* Prevent multiple soft-lockup reports if one cpu is already
748801
* engaged in dumping all cpu back traces.

0 commit comments

Comments
 (0)