Skip to content

Commit a5db85d

Browse files
committed
Add fsnotify corelens module
This module aids in understanding the state of the fsnotify subsystem: what inodes / superblocks / vfsmounts are being watched, who is waiting for events, and in the case of fanotify, who is waiting on an access response from userspace. Signed-off-by: Stephen Brennan <[email protected]>
1 parent 5d38638 commit a5db85d

File tree

3 files changed

+409
-0
lines changed

3 files changed

+409
-0
lines changed

drgn_tools/fsnotify.py

Lines changed: 388 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,388 @@
1+
# Copyright (c) 2024, Oracle and/or its affiliates.
2+
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
3+
"""
4+
Helpers for diagnosing issues with dnotify, inotify, fanotify: the "fsnotify"
5+
subsystem.
6+
"""
7+
import argparse
8+
from typing import Dict
9+
from typing import Iterator
10+
from typing import Tuple
11+
12+
from drgn import cast
13+
from drgn import container_of
14+
from drgn import NULL
15+
from drgn import Object
16+
from drgn import Program
17+
from drgn.helpers.common.format import decode_flags
18+
from drgn.helpers.common.format import escape_ascii_string
19+
from drgn.helpers.linux.fs import for_each_file
20+
from drgn.helpers.linux.list import hlist_for_each_entry
21+
from drgn.helpers.linux.list import list_count_nodes
22+
from drgn.helpers.linux.list import list_for_each_entry
23+
from drgn.helpers.linux.pid import for_each_task
24+
from drgn.helpers.linux.slab import slab_object_info
25+
from drgn.helpers.linux.wait import waitqueue_active
26+
from drgn.helpers.linux.wait import waitqueue_for_each_entry
27+
28+
from drgn_tools.bt import bt
29+
from drgn_tools.corelens import CorelensModule
30+
from drgn_tools.dentry import dentry_path_any_mount
31+
from drgn_tools.dentry import sb_first_mount_point
32+
from drgn_tools.task import is_group_leader
33+
from drgn_tools.util import type_has_member
34+
35+
# Event bits that can appear in an fsnotify mark's mask. The kernel spells
# these with an "FS_" prefix (include/linux/fsnotify_backend.h); the prefix is
# dropped here for nicer printing. Entries are listed in ascending bit order.
FSNOTIFY_FLAGS = {
    "ACCESS": 1 << 0,  # File was accessed
    "MODIFY": 1 << 1,  # File was modified
    "ATTRIB": 1 << 2,  # Metadata changed
    "CLOSE_WRITE": 1 << 3,  # Writable file was closed
    "CLOSE_NOWRITE": 1 << 4,  # Unwritable file closed
    "OPEN": 1 << 5,  # File was opened
    "MOVED_FROM": 1 << 6,  # File was moved from X
    "MOVED_TO": 1 << 7,  # File was moved to Y
    "CREATE": 1 << 8,  # Subfile was created
    "DELETE": 1 << 9,  # Subfile was deleted
    "DELETE_SELF": 1 << 10,  # Self was deleted
    "MOVE_SELF": 1 << 11,  # Self was moved
    "OPEN_EXEC": 1 << 12,  # File was opened for exec
    "UNMOUNT": 1 << 13,  # inode on umount fs
    "Q_OVERFLOW": 1 << 14,  # Event queue overflowed
    "ERROR": 1 << 15,  # Filesystem Error (fanotify)
    "OPEN_PERM": 1 << 16,  # open event in a permission hook
    "ACCESS_PERM": 1 << 17,  # access event in a permission hook
    "OPEN_EXEC_PERM": 1 << 18,  # open/exec event in a permission hook
    "EVENT_ON_CHILD": 1 << 27,
    "RENAME": 1 << 28,  # File was renamed
    "DN_MULTISHOT": 1 << 29,  # dnotify multishot
    "ISDIR": 1 << 30,  # event occurred against dir
}
62+
63+
64+
def fsnotify_group_for_each_mark(group: Object) -> Iterator[Object]:
    """
    Walk the fsnotify marks that belong to a group.

    The marks are linked through their ``g_list`` member onto the group's
    ``marks_list``.

    :param group: ``struct fsnotify_group *``
    :returns: iterator of ``struct fsnotify_mark *``
    """
    marks_head = group.marks_list.address_of_()
    return list_for_each_entry("struct fsnotify_mark", marks_head, "g_list")
73+
74+
75+
def fsnotify_mark_object(mark: Object) -> Tuple[str, Object]:
    """
    Identify the object an fsnotify mark is attached to.

    A mark can watch an inode, a vfsmount, or a superblock. This returns a
    short name for the kind of object together with a pointer to it. When the
    kind cannot be determined, ``("unknown", NULL)`` is returned, where the
    NULL pointer has type ``void *``.

    :param mark: ``struct fsnotify_mark *``
    :returns: (object type, object pointer)
    """
    connector = mark.connector
    prog = mark.prog_

    def unknown() -> Tuple[str, Object]:
        return "unknown", NULL(prog, "void *")

    if not hasattr(connector, "type"):
        # Commit d6f7b98bc8147 ("fsnotify: use type id to identify connector
        # object type") adds a type field to the connector. Before this, type
        # was expressed as bits in the flag field. The bit numbers were
        # preprocessor definitions, so they are hardcoded here.
        # NOTE(review): confirm the legacy union member name used below
        # ("vfsmount") against the headers of the oldest supported kernel.
        legacy_flags = connector.flags
        if legacy_flags & 0x1:
            return "inode", connector.inode
        if legacy_flags & 0x2:
            return "vfsmount", connector.vfsmount
        return unknown()

    # See fsnotify_conn_{inode,mount,sb} in fs/notify/fsnotify.h. The type
    # constants are looked up one at a time, and only as far as needed.
    obj_type = connector.type
    if obj_type == prog.constant("FSNOTIFY_OBJ_TYPE_INODE"):
        kind, direct_field = "inode", "inode"
        container, member = "struct inode", "i_fsnotify_marks"
    elif obj_type == prog.constant("FSNOTIFY_OBJ_TYPE_VFSMOUNT"):
        kind, direct_field = "vfsmount", "vfsmount"
        container, member = "struct mount", "mnt_fsnotify_marks"
    elif obj_type == prog.constant("FSNOTIFY_OBJ_TYPE_SB"):
        # The "sb" object type was not present when 36f10f55ff1d2 ("fsnotify:
        # let connector point to an abstract object") landed, so the
        # connector never has a direct "sb" pointer to fall back on.
        kind, direct_field = "sb", None
        container, member = "struct super_block", "s_fsnotify_marks"
    else:
        return unknown()

    # Prior to 36f10f55ff1d2 ("fsnotify: let connector point to an abstract
    # object"), there were direct pointers in the connector; prefer those
    # when the structure still has them.
    if direct_field is not None and hasattr(connector, direct_field):
        return kind, getattr(connector, direct_field)
    return kind, container_of(connector.obj, container, member)
127+
128+
129+
def hlist_first_entry_or_null(type: str, head: Object, field: str):
    """
    Return the first entry of an hlist, or a NULL pointer if it is empty.

    This is the hlist counterpart of drgn's ``list_first_entry_or_null``.

    :param type: name of the entry's structure type
    :param head: ``struct hlist_head *``
    :param field: name of the ``struct hlist_node`` member within ``type``
    :returns: ``type *`` pointing at the first entry, or a NULL ``type *``
    """
    entries = iter(hlist_for_each_entry(type, head, field))
    first = next(entries, None)
    if first is None:
        # Only build the NULL object when the list really is empty.
        return NULL(head.prog_, type + " *")
    return first
135+
136+
137+
def fsnotify_summarize_object(kind: str, obj: Object) -> str:
    """
    Build a printable description of an object marked by fsnotify.

    The description is normally a path: the watched file or directory for an
    inode, or the mount point for a vfsmount or superblock. An object can be
    reachable through several paths (hard links, multiple mounts, ...); only
    one is reported, in the hope that it is useful.

    :param kind: either inode, vfsmount, sb, or unknown
    :param obj: a corresponding drgn object (see :func:`fsnotify_mark_object()`)
    :returns: a string representation for printing to a user
    """
    if kind == "inode":
        # Arbitrarily pick the first dentry of the inode, and let the path
        # helper pick the first mount point on the way up. The goal is
        # something useful, not something exhaustive.
        # 946e51f2bf37f ("move d_rcu from overlapping d_child to overlapping d_alias")
        if type_has_member(obj.prog_, "struct dentry", "d_alias"):
            alias_member = "d_alias"
        else:
            alias_member = "d_u.d_alias"
        first_dentry = hlist_first_entry_or_null(
            "struct dentry", obj.i_dentry.address_of_(), alias_member
        )
        if not first_dentry:
            return "(ANON INODE)"
        return escape_ascii_string(dentry_path_any_mount(first_dentry))

    if kind == "vfsmount":
        fs_name = obj.mnt.mnt_sb.s_type.name.string_().decode()
        mount_path = escape_ascii_string(
            dentry_path_any_mount(obj.mnt_mountpoint)
        )
        return f"FS:{fs_name} MOUNT:{mount_path}"

    if kind == "sb":
        fs_name = obj.s_type.name.string_().decode()
        first_mount = sb_first_mount_point(obj)
        mount_path = escape_ascii_string(dentry_path_any_mount(first_mount))
        return f"SUPER:{fs_name} ({mount_path})"

    return "(not implemented)"
179+
180+
181+
def _print_waiting_task(
    task: Object, how: str, indent: int, stack_trace: bool
) -> None:
    """Print one waiting task (PID, command, wait kind), plus its stack if asked."""
    pfx = " " * indent
    print(
        f"{pfx}[PID: {task.pid.value_()} COMM: {task.comm.string_().decode()} WAIT: {how}]"
    )
    if stack_trace:
        bt(task, indent=indent + 2)


def print_waitqueue(
    wq: Object, indent: int = 2, stack_trace: bool = False
) -> None:
    """
    Print the waiters of a waitqueue

    Every entry of the wait queue is classified by its wake function:

    - ``pollwake``: the entry's private data is treated as a
      ``struct poll_wqueues *`` and its polling task is printed.
    - ``ep_poll_callback``: the entry belongs to an eventpoll object. The
      eventpoll's own ``wq`` and ``poll_wait`` queues are printed by calling
      this function recursively.
    - anything else: if the private pointer is an object from the
      ``task_struct`` slab cache, it is printed as a directly waiting task.
      Otherwise a line naming the wake function and private pointer is
      printed, so that no entry of an active queue goes unreported.

    :param wq: the ``wait_queue_head_t`` object
    :param indent: indentation for the output
    :param stack_trace: whether to print stack trace for waiters
    """
    if not waitqueue_active(wq):
        print(" <no waiters>")
        return

    prog = wq.prog_
    pfx = " " * indent
    for entry in waitqueue_for_each_entry(wq):
        # The wake function tells us how to interpret entry.private.
        try:
            func = prog.symbol(entry.func.value_()).name
        except LookupError:
            func = "UNKNOWN"

        if func == "pollwake":
            wqueues = cast("struct poll_wqueues *", entry.private)
            _print_waiting_task(
                wqueues.polling_task, "select", indent, stack_trace
            )
        elif func == "ep_poll_callback":
            epitem = container_of(entry, "struct eppoll_entry", "wait").base
            ep = epitem.ep
            print(f"{pfx}[EVENTPOLL: {ep.value_():x}]")
            found_waiter = False
            if waitqueue_active(ep.wq):
                print(f"{pfx}Waiting in epoll_wait():")
                print_waitqueue(ep.wq, indent + 2, stack_trace=stack_trace)
                found_waiter = True
            if waitqueue_active(ep.poll_wait):
                print(f"{pfx}Waiting in file->poll():")
                print_waitqueue(
                    ep.poll_wait, indent + 2, stack_trace=stack_trace
                )
                found_waiter = True
            if not found_waiter:
                print(f"{pfx}No waiters found.")
        else:
            # Only trust entry.private as a task if the slab allocator says
            # it came from the task_struct cache.
            info = slab_object_info(entry.private)
            if info and info.slab_cache.name.string_() == b"task_struct":
                task = cast("struct task_struct *", entry.private)
                _print_waiting_task(task, "direct", indent, stack_trace)
            else:
                # Report the entry rather than dropping it silently.
                print(
                    f"{pfx}[UNKNOWN WAITER: {entry.private.value_():x} FUNC: {func}]"
                )
245+
246+
247+
def fsnotify_group_report(
    group: Object, group_kind: str, verbose: int = 1
) -> None:
    """
    Print a report about an fsnotify group.

    Every mark of the group is counted by object kind. The per-mark details
    (decoded masks, reference count, path) are computed only for the marks
    that are actually printed, since a group may hold a very large number of
    inode marks.

    :param group: ``struct fsnotify_group *``
    :param group_kind: one of inotify, fanotify, or dnotify
    :param verbose: a verbosity level:
      0: summarize only
      1: output vfsmounts and super blocks, and a limited number of inodes
      2: same as above, but also include stack traces for waiters
      3: output every marked inode (this could be a very large amount)
    """
    print(f"FSNOTIFY GROUP: {group.value_():x}")
    flag_items = FSNOTIFY_FLAGS.items()
    kind_counts: Dict[str, int] = {}
    for mark in fsnotify_group_for_each_mark(group):
        kind, ptr = fsnotify_mark_object(mark)
        kind_counts[kind] = kind_counts.get(kind, 0) + 1

        # Decide whether this mark is printed before doing any further work.
        if verbose < 1:
            continue
        if verbose < 3 and kind == "inode":
            inodes_seen = kind_counts[kind]
            if inodes_seen == 10:
                print(
                    " <note: skipped printing inodes, use verbose to see all>"
                )
            if inodes_seen >= 10:
                continue

        mask = decode_flags(mark.mask, flag_items, bit_numbers=False)
        # 8e17bf975102c ("fanotify: prepare for setting event flags in ignore
        # mask")
        try:
            raw_ignore_mask = mark.ignore_mask
        except AttributeError:
            raw_ignore_mask = mark.ignored_mask
        ignore_mask = decode_flags(
            raw_ignore_mask, flag_items, bit_numbers=False
        )
        try:
            count = mark.refcnt.refs.counter.value_()
        except AttributeError:
            # 7761daa6a1599 ("fsnotify: convert fsnotify_group.refcnt from
            # atomic_t to refcount_t")
            count = mark.refcnt.counter.value_()
        summary = fsnotify_summarize_object(kind, ptr)

        print(f" MARK: {kind} {ptr.value_():x} {summary}")
        print(f" CNT:{count} MASK:{mask} IGN:{ignore_mask}")

    print(
        "OBJECT SUMMARY: "
        + ", ".join(f"{name}: {total}" for name, total in kind_counts.items())
    )

    pending_notifications = list_count_nodes(
        group.notification_list.address_of_()
    )
    print(f"{pending_notifications} notifications are pending.")
    print("Tasks waiting for notification:")
    print_waitqueue(group.notification_waitq, stack_trace=verbose >= 2)

    if group_kind == "fanotify":
        # fanotify additionally tracks permission events awaiting an answer
        # from userspace.
        resp_cnt = list_count_nodes(
            group.fanotify_data.access_list.address_of_()
        )
        print(f"{resp_cnt} pending permission responses")
        print("Tasks waiting for permission response from userspace:")
        print_waitqueue(
            group.fanotify_data.access_waitq, stack_trace=verbose >= 2
        )
    elif group_kind in ("inotify", "dnotify"):
        pass  # nothing special to report
    else:
        print(f"unknown kind {group_kind}")
321+
322+
323+
def fsnotify_show(prog: Program, verbose: int = 1) -> None:
    """
    Print a report of every fsnotify group on the system.

    fanotify and inotify groups are discovered by walking the open files of
    each thread group leader and picking out the files whose operations are
    ``fanotify_fops`` or ``inotify_fops``. A report is printed for each group
    the first time it is encountered. Finally, the system dnotify group
    (there is only one) is reported.

    :param prog: the program to inspect
    :param verbose: verbosity level (see :func:`fsnotify_group_report()`)
    """
    fanotify_ops = prog["fanotify_fops"].address_of_()
    inotify_ops = prog["inotify_fops"].address_of_()
    group_type = prog.type("struct fsnotify_group *")
    seen_groups = set()

    for task in for_each_task(prog):
        # No point in looking at threads, since file descriptions are shared.
        if not is_group_leader(task):
            continue

        for fd, file in for_each_file(task):
            if not file:
                continue
            file_ops = file.f_op
            if file_ops == fanotify_ops:
                kind = "fanotify"
            elif file_ops == inotify_ops:
                kind = "inotify"
            else:
                continue

            print(
                f"[PID {task.pid.value_()} COMM: {task.comm.string_().decode()} {kind} FD {fd}]"
            )
            group = cast(group_type, file.private_data)
            group_addr = group.value_()

            # Since file descriptors can be shared even across tasks, we need
            # to track groups we've already reported and skip re-reporting.
            # This reduces the output size and runtime. For example, crond
            # seems to share an inotify FD across tasks.
            if group_addr in seen_groups:
                print(f"FSNOTIFY GROUP {group_addr:x}: already seen")
            else:
                seen_groups.add(group_addr)
                fsnotify_group_report(group, kind, verbose=verbose)
            print()

    dnotify_group = prog["dnotify_group"]
    if dnotify_group:
        # dnotify_group can be NULL early in boot. No use crashing if that's
        # the case.
        print("[SYSTEM DNOTIFY GROUP]")
        fsnotify_group_report(dnotify_group, "dnotify", verbose=verbose)
371+
372+
373+
class Fsnotify(CorelensModule):
    """Print details about the fsnotify subsystem"""

    name = "fsnotify"

    def add_args(self, parser: argparse.ArgumentParser) -> None:
        """Register the ``--verbose`` / ``-v`` integer option (default 1)."""
        parser.add_argument(
            "--verbose",
            "-v",
            type=int,
            default=1,
            # fsnotify_group_report() distinguishes levels 0 through 3.
            help="Set verbosity: 0-3 (default 1)",
        )

    def run(self, prog: Program, args: argparse.Namespace) -> None:
        """Print the fsnotify report at the verbosity given in ``args``."""
        fsnotify_show(prog, verbose=args.verbose)

0 commit comments

Comments
 (0)