From 5959f1b8c4384c2bcc4f9a6c6cd9f6d0e8f202f5 Mon Sep 17 00:00:00 2001 From: Imran Khan Date: Wed, 6 Aug 2025 02:01:19 +1000 Subject: [PATCH 1/2] runq: add anomaly checking for runqueues. Issues like missing IPIs or irq disablement for long, can create observable anomalies on scheduler side. For example, a scheduler IPI, missed/absent due to hypervisor issue, can cause migration/X threads on VMs to get stuck forever, causing softlockup. Having knowledge of these anomalies can help in locating the actual problem. For example, the following snippet shows that migration/23 thread was stuck in runq's wake_list. cpu: 23 has following tasks in its runq wake_list: task pid: 273301, comm: ora_ipc0_csadsd task pid: 281335, comm: ora_lms0_ccawsd task pid: 391691, comm: ora_ipc0_webprd task pid: 390722, comm: ora_ipc0_dppd1 task pid: 394144, comm: ora_lmd1_audprd task pid: 394450, comm: ora_lms1_rbsspr task pid: 393235, comm: ora_lmd0_etudpr task pid: 24469, comm: cvfwd task pid: 357613, comm: tnslsnr task pid: 351700, comm: ocssd.bin task pid: 394519, comm: ora_dia0_wspd1 task pid: 394307, comm: ora_lms1_wmsprd task pid: 394773, comm: ora_lms0_ccadmp task pid: 351141, comm: ocssd.bin task pid: 394690, comm: ora_lms0_wspd1 task pid: 351774, comm: ocssd.bin task pid: 351678, comm: ocssd.bin task pid: 351692, comm: ocssd.bin task pid: 351683, comm: ocssd.bin task pid: 351680, comm: ocssd.bin task pid: 351686, comm: ocssd.bin task pid: 351681, comm: ocssd.bin task pid: 351688, comm: ocssd.bin task pid: 150, comm: migration/23 This will block migration/X threads running on other CPUs. The helpers and corelens module added here can detect such issues. Signed-off-by: Imran Khan --- drgn_tools/runq.py | 313 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 313 insertions(+) diff --git a/drgn_tools/runq.py b/drgn_tools/runq.py index 4111da71..7dcf7925 100644 --- a/drgn_tools/runq.py +++ b/drgn_tools/runq.py @@ -1,19 +1,45 @@ # Copyright (c) 2023, Oracle and/or its affiliates. 
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ import argparse +from typing import Iterator +from typing import Set from drgn import container_of from drgn import Object from drgn import Program +from drgn import sizeof from drgn.helpers.common import escape_ascii_string +from drgn.helpers.linux.bitops import for_each_set_bit from drgn.helpers.linux.cpumask import for_each_online_cpu from drgn.helpers.linux.list import list_for_each_entry +from drgn.helpers.linux.llist import llist_empty +from drgn.helpers.linux.llist import llist_for_each_entry from drgn.helpers.linux.percpu import per_cpu +from drgn.helpers.linux.rbtree import rbtree_inorder_for_each_entry from drgn_tools.corelens import CorelensModule from drgn_tools.task import task_lastrun2now from drgn_tools.util import timestamp_str +EXPLAIN_WAKELIST = """ +[TASKS ON WAKE LIST] + A runq's wakelist temporarily holds tasks that are about to be woken + up on that CPU. If this list has multiple tasks, it usually means that + this CPU has missed multiple scheduler IPIs. This can imply issues + like IRQs being disabled for too long, IRQ delivery issues between hypervisor + and VM, or some other issue. +""" +EXPLAIN_IDLE_RUNQ = """ +[IDLE RUN QUEUE] + An idle runq: + * should have current pid as 0. + * should have nr_running as 0. + * ideally should not have any tasks. + this is not necessarily an error because + scheduler may just be bringing it out of idle. +""" + + # List runqueus per cpu @@ -97,6 +123,293 @@ def run_queue(prog: Program) -> None: print() +def for_each_task_on_cfs_rq(prog: Program, cpu: int) -> Iterator[Object]: + """ + Iterate through all tasks, on a CPU's CFS runq. 
+ + :param prog: drgn program + :param cpu: cpu number + :return: Iterator of ``struct task_struct *`` objects + """ + runq = per_cpu(prog["runqueues"], cpu) + cfs_rq = runq.cfs + rb_root = cfs_rq.tasks_timeline.rb_root + for se in rbtree_inorder_for_each_entry( + "struct sched_entity", rb_root, "run_node" + ): + task = container_of(se, "struct task_struct", "se") + yield task + + +def num_tasks_on_cfs_rq(prog: Program, cpu: int) -> int: + """ + Get number of tasks, on a CPU's CFS runq. + + :param cpu: cpu number + :return: number of tasks + """ + return len(list(for_each_task_on_cfs_rq(prog, cpu))) + + +def for_each_task_on_rt_rq(prog: Program, cpu: int) -> Iterator[Object]: + """ + Iterate through all tasks, on a CPU's RT runq. + + :param prog: drgn program + :param cpu: cpu number + :return: Iterator of ``struct task_struct *`` objects + """ + runq = per_cpu(prog["runqueues"], cpu) + rt_rq = runq.rt + active = rt_rq.active + queues = active.queue + for prio in for_each_set_bit(active.bitmap, sizeof(active.bitmap) * 8): + head = queues[prio] + if prio > 99: + break + + for se in list_for_each_entry( + "struct sched_rt_entity", head.address_of_(), "run_list" + ): + task = container_of(se, "struct task_struct", "rt") + yield task + + +def num_tasks_on_rt_rq(prog: Program, cpu: int) -> int: + """ + Get number of tasks, on a CPU's RT runq. + + :param cpu: cpu number + :return: number of tasks + """ + return len(list(for_each_task_on_rt_rq(prog, cpu))) + + +def for_each_task_on_dl_rq(prog: Program, cpu: int) -> Iterator[Object]: + """ + Iterate through all tasks, on a CPU's DL runq. 
+ + :param prog: drgn program + :param cpu: cpu number + :return: Iterator of ``struct task_struct *`` objects + """ + runq = per_cpu(prog["runqueues"], cpu) + dl_rq = runq.dl + rb_root = dl_rq.root.rb_root + for dl in rbtree_inorder_for_each_entry( + "struct sched_dl_entity", rb_root, "rb_node" + ): + task = container_of(dl, "struct task_struct", "dl") + yield task + + +def num_tasks_on_dl_rq(prog: Program, cpu: int) -> int: + """ + Get number of tasks, on a CPU's DL runq. + + :param prog: drgn program + :param cpu: cpu number + :return: number of tasks + """ + return len(list(for_each_task_on_dl_rq(prog, cpu))) + + +def for_each_task_on_rq(prog: Program, cpu: int) -> Iterator[Object]: + """ + Iterate through all tasks, on a CPU's runq. + + :param prog: drgn program + :param cpu: cpu number + :return: Iterator of ``struct task_struct *`` objects + """ + + yield from for_each_task_on_rt_rq(prog, cpu) + yield from for_each_task_on_cfs_rq(prog, cpu) + yield from for_each_task_on_dl_rq(prog, cpu) + + +def for_each_task_on_rq_wake_list(prog: Program, cpu: int) -> Iterator[Object]: + """ + Iterate through all tasks, on a CPU's wake_list. + + A CPU's wake_list contains tasks, that are in the process of being woken + up and have not yet landed on a CPU runq after wakeup. + Tasks should not reside here for long. + + :param prog: drgn program + :param cpu: cpu number + :return: Iterator of ``struct task_struct *`` objects + """ + + runq = per_cpu(prog["runqueues"], cpu) + for task in llist_for_each_entry( + "struct task_struct", runq.wake_list.first, "wake_entry" + ): + yield task + + +def check_idle_runq(prog: Program, cpu: int, explanations: Set[str]) -> None: + """ + Check an idle runq. 
+ + :param prog: drgn program + :param cpu: cpu number + """ + runq = per_cpu(prog["runqueues"], cpu) + if runq.curr.pid.value_() != 0: + return + + explanations.add(EXPLAIN_IDLE_RUNQ) + if runq.nr_running.value_() != 0: + print(f"Idle cpu: {cpu} has non-zero nr_running") + + nr_cfs_task = num_tasks_on_cfs_rq(prog, cpu) + nr_rt_task = num_tasks_on_rt_rq(prog, cpu) + nr_dl_task = num_tasks_on_dl_rq(prog, cpu) + + if nr_cfs_task != 0: + print(f"Idle cpu: {cpu} has {nr_cfs_task} tasks on CFS runq") + + if nr_rt_task != 0: + print(f"Idle cpu: {cpu} has {nr_rt_task} tasks on RT runq") + + if nr_dl_task != 0: + print(f"Idle cpu: {cpu} has {nr_dl_task} tasks on DL runq") + print("See IDLE RUN QUEUE below") + + +def check_runq_wakelist( + prog: Program, cpu: int, explanations: Set[str] +) -> None: + """ + Check runq's wakelist. + + :param prog: drgn program + :param cpu: cpu number + """ + runq = per_cpu(prog["runqueues"], cpu) + if llist_empty(runq.wake_list): + return + + print("\n") + explanations.add(EXPLAIN_WAKELIST) + print(f"cpu: {cpu} has following tasks in its runq wake_list:") + for task in for_each_task_on_rq_wake_list(prog, cpu): + print( + f"task pid: {task.pid.value_()}, comm: {task.comm.string_().decode()}" + ) + print("See TASKS ON WAKE LIST below") + + +def dump_rt_runq_wait_summary( + prog: Program, cpu: int, qduration_thresh_ms: int = 1000 +): + """ + Iterate through all tasks, on a CPU's runq and list tasks that have been queued + for greater than specified threshold in ms (default 1000 ms). 
+ + :param prog: drgn Program + :param cpu: cpu number + :param qduration_thresh_ms: threshold for wait duration on runq + """ + runq = per_cpu(prog["runqueues"], cpu) + for task in for_each_task_on_rt_rq(prog, cpu): + try: + if ( + task.sched_info.last_queued.value_() > 0 + and task.sched_info.last_queued.value_() + > task.sched_info.last_arrival.value_() + ): + qduration = ( + runq.clock.value_() - task.sched_info.last_queued.value_() + ) + print( + f"cpu: {cpu} pid: {task.pid.value_()} prio: {task.prio.value_()} queued for {qduration} nsecs" + ) + except AttributeError: + pass + + +def dump_cfs_runq_wait_summary( + prog: Program, cpu: int, qduration_thresh_ms: int = 1000 +): + """ + Iterate through all tasks, on a CPU's runq and list tasks that have been queued + for greater than specified threshold in ms (default 1000 ms). + + :param prog: drgn Program + :param cpu: cpu number + :param qduration_thresh_ms: threshold for wait duration on runq + """ + runq = per_cpu(prog["runqueues"], cpu) + for task in for_each_task_on_cfs_rq(prog, cpu): + try: + if ( + task.sched_info.last_queued.value_() > 0 + and task.sched_info.last_queued.value_() + > task.sched_info.last_arrival.value_() + ): + qduration = ( + runq.clock.value_() - task.sched_info.last_queued.value_() + ) + print( + f"cpu: {cpu} pid: {task.pid.value_()} prio: {task.prio.value_()} queued for {qduration} nsecs" + ) + except AttributeError: + pass + + +def dump_runq_wait_summary( + prog: Program, cpu: int, qduration_thresh_ms: int = 1000 +): + """ + Iterate through all tasks, on a CPU's runq and list tasks that have been queued + for greater than specified threshold in ms (default 1000 ms). 
+ + :param prog: drgn Program + :param cpu: cpu number + :param qduration_thresh_ms: threshold for wait duration on runq + """ + dump_cfs_runq_wait_summary(prog, cpu, qduration_thresh_ms) + dump_rt_runq_wait_summary(prog, cpu, qduration_thresh_ms) + + +def run_queue_check(prog: Program) -> None: + """ + Check and report runqueue anomalies. + + :param prog: drgn program + """ + + explanations: Set[str] = set() + for cpu in for_each_online_cpu(prog): + check_idle_runq(prog, cpu, explanations) + check_runq_wakelist(prog, cpu, explanations) + dump_runq_wait_summary(prog, cpu) + print("\n") + + if not explanations: + return + print("Note: found some possible run queue issues. Explanations below:") + for explanation in explanations: + print(explanation) + + +class RunQueueCheck(CorelensModule): + """ + Check and report runqueue anomalies. + The content of this report does not necessarily mean an + error condition or a problem. But it mentions things that + don't usually happen, so should not be ignored. + """ + + name = "runqcheck" + live_ok = False + + def run(self, prog: Program, args: argparse.Namespace) -> None: + run_queue_check(prog) + + class RunQueue(CorelensModule): """ List process that are in RT and CFS queue. From 733f566bee2e79b9333d87230ee50dfc51657641 Mon Sep 17 00:00:00 2001 From: Imran Khan Date: Wed, 6 Aug 2025 11:42:41 +1000 Subject: [PATCH 2/2] runq: add test case for new runq helpers. Signed-off-by: Imran Khan --- tests/test_runq.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/test_runq.py b/tests/test_runq.py index 7d966df5..c4e4f585 100644 --- a/tests/test_runq.py +++ b/tests/test_runq.py @@ -9,3 +9,9 @@ def test_run_queue(prog): if ProgramFlags.IS_LIVE & prog.flags: return runq.run_queue(prog) + + +def test_run_queue_check(prog): + if ProgramFlags.IS_LIVE & prog.flags: + return + runq.run_queue_check(prog)