From 5959f1b8c4384c2bcc4f9a6c6cd9f6d0e8f202f5 Mon Sep 17 00:00:00 2001 From: Imran Khan Date: Wed, 6 Aug 2025 02:01:19 +1000 Subject: [PATCH 1/2] runq: add anomaly checking for runqueues. Issues like missing IPIs or irq disablement for long, can create observable anomalies on scheduler side. For example, a scheduler IPI, missed/absent due to hypervisor issue, can cause migration/X threads on VMs to get stuck forever, causing softlockup. Having knowledge of these anomalies can help in locating the actual problem. For example, the following snippet shows that migration/23 thread was stuck in runq's wake_list. cpu: 23 has following tasks in its runq wake_list: task pid: 273301, comm: ora_ipc0_csadsd task pid: 281335, comm: ora_lms0_ccawsd task pid: 391691, comm: ora_ipc0_webprd task pid: 390722, comm: ora_ipc0_dppd1 task pid: 394144, comm: ora_lmd1_audprd task pid: 394450, comm: ora_lms1_rbsspr task pid: 393235, comm: ora_lmd0_etudpr task pid: 24469, comm: cvfwd task pid: 357613, comm: tnslsnr task pid: 351700, comm: ocssd.bin task pid: 394519, comm: ora_dia0_wspd1 task pid: 394307, comm: ora_lms1_wmsprd task pid: 394773, comm: ora_lms0_ccadmp task pid: 351141, comm: ocssd.bin task pid: 394690, comm: ora_lms0_wspd1 task pid: 351774, comm: ocssd.bin task pid: 351678, comm: ocssd.bin task pid: 351692, comm: ocssd.bin task pid: 351683, comm: ocssd.bin task pid: 351680, comm: ocssd.bin task pid: 351686, comm: ocssd.bin task pid: 351681, comm: ocssd.bin task pid: 351688, comm: ocssd.bin task pid: 150, comm: migration/23 This will block migration/X threads running on other CPUs. The helpers and corelens module added here can detect such issues. Signed-off-by: Imran Khan --- drgn_tools/runq.py | 313 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 313 insertions(+) diff --git a/drgn_tools/runq.py b/drgn_tools/runq.py index 4111da71..7dcf7925 100644 --- a/drgn_tools/runq.py +++ b/drgn_tools/runq.py @@ -1,19 +1,45 @@ # Copyright (c) 2023, Oracle and/or its affiliates. 
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ import argparse +from typing import Iterator +from typing import Set from drgn import container_of from drgn import Object from drgn import Program +from drgn import sizeof from drgn.helpers.common import escape_ascii_string +from drgn.helpers.linux.bitops import for_each_set_bit from drgn.helpers.linux.cpumask import for_each_online_cpu from drgn.helpers.linux.list import list_for_each_entry +from drgn.helpers.linux.llist import llist_empty +from drgn.helpers.linux.llist import llist_for_each_entry from drgn.helpers.linux.percpu import per_cpu +from drgn.helpers.linux.rbtree import rbtree_inorder_for_each_entry from drgn_tools.corelens import CorelensModule from drgn_tools.task import task_lastrun2now from drgn_tools.util import timestamp_str +EXPLAIN_WAKELIST = """ +[TASKS ON WAKE LIST] + A runq's wakelist temporarily holds tasks that are about to be woken + up on that CPU. If this list has multiple tasks, it usually means that + this CPU has missed multiple scheduler IPIs. This can imply issues + like IRQs being disabled for too long, IRQ delivery issues between hypervisor + and VM, or some other issue. +""" +EXPLAIN_IDLE_RUNQ = """ +[IDLE RUN QUEUE] + An idle runq: + * should have current pid as 0. + * should have nr_running as 0. + * ideally should not have any tasks. + this is not necessarily an error because + scheduler may just be bringing it out of idle. +""" + + # List runqueus per cpu @@ -97,6 +123,293 @@ def run_queue(prog: Program) -> None: print() +def for_each_task_on_cfs_rq(prog: Program, cpu: int) -> Iterator[Object]: + """ + Iterate through all tasks, on a CPU's CFS runq. 
+ + :param prog: drgn program + :param cpu: cpu number + :return: Iterator of ``struct task_struct *`` objects + """ + runq = per_cpu(prog["runqueues"], cpu) + cfs_rq = runq.cfs + rb_root = cfs_rq.tasks_timeline.rb_root + for se in rbtree_inorder_for_each_entry( + "struct sched_entity", rb_root, "run_node" + ): + task = container_of(se, "struct task_struct", "se") + yield task + + +def num_tasks_on_cfs_rq(prog: Program, cpu: int) -> int: + """ + Get number of tasks, on a CPU's CFS runq. + + :param cpu: cpu number + :return: number of tasks + """ + return len(list(for_each_task_on_cfs_rq(prog, cpu))) + + +def for_each_task_on_rt_rq(prog: Program, cpu: int) -> Iterator[Object]: + """ + Iterate through all tasks, on a CPU's RT runq. + + :param prog: drgn program + :param cpu: cpu number + :return: Iterator of ``struct task_struct *`` objects + """ + runq = per_cpu(prog["runqueues"], cpu) + rt_rq = runq.rt + active = rt_rq.active + queues = active.queue + for prio in for_each_set_bit(active.bitmap, sizeof(active.bitmap) * 8): + head = queues[prio] + if prio > 99: + break + + for se in list_for_each_entry( + "struct sched_rt_entity", head.address_of_(), "run_list" + ): + task = container_of(se, "struct task_struct", "rt") + yield task + + +def num_tasks_on_rt_rq(prog: Program, cpu: int) -> int: + """ + Get number of tasks, on a CPU's RT runq. + + :param cpu: cpu number + :return: number of tasks + """ + return len(list(for_each_task_on_rt_rq(prog, cpu))) + + +def for_each_task_on_dl_rq(prog: Program, cpu: int) -> Iterator[Object]: + """ + Iterate through all tasks, on a CPU's DL runq. 
+ + :param prog: drgn program + :param cpu: cpu number + :return: Iterator of ``struct task_struct *`` objects + """ + runq = per_cpu(prog["runqueues"], cpu) + dl_rq = runq.dl + rb_root = dl_rq.root.rb_root + for dl in rbtree_inorder_for_each_entry( + "struct sched_dl_entity", rb_root, "rb_node" + ): + task = container_of(dl, "struct task_struct", "dl") + yield task + + +def num_tasks_on_dl_rq(prog: Program, cpu: int) -> int: + """ + Get number of tasks, on a CPU's DL runq. + + :param prog: drgn program + :param cpu: cpu number + :return: number of tasks + """ + return len(list(for_each_task_on_dl_rq(prog, cpu))) + + +def for_each_task_on_rq(prog: Program, cpu: int) -> Iterator[Object]: + """ + Iterate through all tasks, on a CPU's runq. + + :param prog: drgn program + :param cpu: cpu number + :return: Iterator of ``struct task_struct *`` objects + """ + + yield from for_each_task_on_rt_rq(prog, cpu) + yield from for_each_task_on_cfs_rq(prog, cpu) + yield from for_each_task_on_dl_rq(prog, cpu) + + +def for_each_task_on_rq_wake_list(prog: Program, cpu: int) -> Iterator[Object]: + """ + Iterate through all tasks, on a CPU's wake_list. + + A CPU's wake_list contains tasks, that are in the process of being woken + up and have not yet landed on a CPU runq after wakeup. + Tasks should not reside here for long. + + :param prog: drgn program + :param cpu: cpu number + :return: Iterator of ``struct task_struct *`` objects + """ + + runq = per_cpu(prog["runqueues"], cpu) + for task in llist_for_each_entry( + "struct task_struct", runq.wake_list.first, "wake_entry" + ): + yield task + + +def check_idle_runq(prog: Program, cpu: int, explanations: Set[str]) -> None: + """ + Check an idle runq. 
+ + :param prog: drgn program + :param cpu: cpu number + """ + runq = per_cpu(prog["runqueues"], cpu) + if runq.curr.pid.value_() != 0: + return + + explanations.add(EXPLAIN_IDLE_RUNQ) + if runq.nr_running.value_() != 0: + print(f"Idle cpu: {cpu} has non-zero nr_running") + + nr_cfs_task = num_tasks_on_cfs_rq(prog, cpu) + nr_rt_task = num_tasks_on_rt_rq(prog, cpu) + nr_dl_task = num_tasks_on_dl_rq(prog, cpu) + + if nr_cfs_task != 0: + print(f"Idle cpu: {cpu} has {nr_cfs_task} tasks on CFS runq") + + if nr_rt_task != 0: + print(f"Idle cpu: {cpu} has {nr_rt_task} tasks on RT runq") + + if nr_dl_task != 0: + print(f"Idle cpu: {cpu} has {nr_dl_task} tasks on DL runq") + print("See IDLE RUN QUEUE below") + + +def check_runq_wakelist( + prog: Program, cpu: int, explanations: Set[str] +) -> None: + """ + Check runq's wakelist. + + :param prog: drgn program + :param cpu: cpu number + """ + runq = per_cpu(prog["runqueues"], cpu) + if llist_empty(runq.wake_list): + return + + print("\n") + explanations.add(EXPLAIN_WAKELIST) + print(f"cpu: {cpu} has following tasks in its runq wake_list:") + for task in for_each_task_on_rq_wake_list(prog, cpu): + print( + f"task pid: {task.pid.value_()}, comm: {task.comm.string_().decode()}" + ) + print("See TASKS ON WAKE LIST below") + + +def dump_rt_runq_wait_summary( + prog: Program, cpu: int, qduration_thresh_ms: int = 1000 +): + """ + Iterate through all tasks, on a CPU's runq and list tasks that have been queued + for greater than specified threshold in ms (default 1000 ms). 
+ + :param prog: drgn Program + :param cpu: cpu number + :param qduration_thresh_ms: threshold for wait duration on runq + """ + runq = per_cpu(prog["runqueues"], cpu) + for task in for_each_task_on_rt_rq(prog, cpu): + try: + if ( + task.sched_info.last_queued.value_() > 0 + and task.sched_info.last_queued.value_() + > task.sched_info.last_arrival.value_() + ): + qduration = ( + runq.clock.value_() - task.sched_info.last_queued.value_() + ) + print( + f"cpu: {cpu} pid: {task.pid.value_()} prio: {task.prio.value_()} queued for {qduration} nsecs" + ) + except AttributeError: + pass + + +def dump_cfs_runq_wait_summary( + prog: Program, cpu: int, qduration_thresh_ms: int = 1000 +): + """ + Iterate through all tasks, on a CPU's runq and list tasks that have been queued + for greater than specified threshold in ms (default 1000 ms). + + :param prog: drgn Program + :param cpu: cpu number + :param qduration_thresh_ms: threshold for wait duration on runq + """ + runq = per_cpu(prog["runqueues"], cpu) + for task in for_each_task_on_cfs_rq(prog, cpu): + try: + if ( + task.sched_info.last_queued.value_() > 0 + and task.sched_info.last_queued.value_() + > task.sched_info.last_arrival.value_() + ): + qduration = ( + runq.clock.value_() - task.sched_info.last_queued.value_() + ) + print( + f"cpu: {cpu} pid: {task.pid.value_()} prio: {task.prio.value_()} queued for {qduration} nsecs" + ) + except AttributeError: + pass + + +def dump_runq_wait_summary( + prog: Program, cpu: int, qduration_thresh_ms: int = 1000 +): + """ + Iterate through all tasks, on a CPU's runq and list tasks that have been queued + for greater than specified threshold in ms (default 1000 ms). 
+ + :param prog: drgn Program + :param cpu: cpu number + :param qduration_thresh_ms: threshold for wait duration on runq + """ + dump_cfs_runq_wait_summary(prog, cpu, qduration_thresh_ms) + dump_rt_runq_wait_summary(prog, cpu, qduration_thresh_ms) + + +def run_queue_check(prog: Program) -> None: + """ + Check and report runqueue anomalies. + + :param prog: drgn program + """ + + explanations: Set[str] = set() + for cpu in for_each_online_cpu(prog): + check_idle_runq(prog, cpu, explanations) + check_runq_wakelist(prog, cpu, explanations) + dump_runq_wait_summary(prog, cpu) + print("\n") + + if not explanations: + return + print("Note: found some possible run queue issues. Explanations below:") + for explanation in explanations: + print(explanation) + + +class RunQueueCheck(CorelensModule): + """ + Check and report runqueue anomalies. + The content of this report does not necessarily mean an + error condition or a problem. But it mentions things that + don't usually happen, so should not be ignored. + """ + + name = "runqcheck" + live_ok = False + + def run(self, prog: Program, args: argparse.Namespace) -> None: + run_queue_check(prog) + + class RunQueue(CorelensModule): """ List process that are in RT and CFS queue. From 733f566bee2e79b9333d87230ee50dfc51657641 Mon Sep 17 00:00:00 2001 From: Imran Khan Date: Wed, 6 Aug 2025 11:42:41 +1000 Subject: [PATCH 2/2] runq: add test case for new runq helpers. Signed-off-by: Imran Khan --- tests/test_runq.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/test_runq.py b/tests/test_runq.py index 7d966df5..c4e4f585 100644 --- a/tests/test_runq.py +++ b/tests/test_runq.py @@ -9,3 +9,9 @@ def test_run_queue(prog): if ProgramFlags.IS_LIVE & prog.flags: return runq.run_queue(prog) + + +def test_run_queue_check(prog): + if ProgramFlags.IS_LIVE & prog.flags: + return + runq.run_queue_check(prog)