Skip to content

Commit fc29e04

Browse files
netoptimizer authored and htejun committed
cgroup/rstat: add cgroup_rstat_lock helpers and tracepoints
This commit enhances the ability to troubleshoot the global cgroup_rstat_lock by introducing wrapper helper functions for the lock along with associated tracepoints. Although global, the cgroup_rstat_lock helper APIs and tracepoints take arguments such as cgroup pointer and cpu_in_loop variable. This adjustment is made because flushing occurs per cgroup despite the lock being global. Hence, when troubleshooting, it's important to identify the relevant cgroup. The cpu_in_loop variable is necessary because the global lock may be released within the main flushing loop that traverses CPUs. In the tracepoints, the cpu_in_loop value is set to -1 when acquiring the main lock; otherwise, it denotes the CPU number processed last. The new feature in this patchset is detecting when the lock is contended. The tracepoints are implemented with production in mind. For minimum overhead attach to cgroup:cgroup_rstat_lock_contended, which only gets activated when trylock detects the lock is contended. A quick production check for issues could be done via this perf command: perf record -g -e cgroup:cgroup_rstat_lock_contended The next natural question is how long lock contenders wait to obtain the lock. This can be answered by measuring the time between cgroup:cgroup_rstat_lock_contended and cgroup:cgroup_rstat_locked when args->contended is set. Like this bpftrace script: bpftrace -e ' tracepoint:cgroup:cgroup_rstat_lock_contended {@start[tid]=nsecs} tracepoint:cgroup:cgroup_rstat_locked { if (args->contended) { @wait_ns=hist(nsecs-@start[tid]); delete(@start[tid]);}} interval:s:1 {time("%H:%M:%S "); print(@wait_ns); }' Extending with time spent holding the lock will be more expensive as this also looks at all the non-contended cases. 
Like this bpftrace script: bpftrace -e ' tracepoint:cgroup:cgroup_rstat_lock_contended {@start[tid]=nsecs} tracepoint:cgroup:cgroup_rstat_locked { @locked[tid]=nsecs; if (args->contended) { @wait_ns=hist(nsecs-@start[tid]); delete(@start[tid]);}} tracepoint:cgroup:cgroup_rstat_unlock { @locked_ns=hist(nsecs-@locked[tid]); delete(@locked[tid]);} interval:s:1 {time("%H:%M:%S "); print(@wait_ns);print(@locked_ns); }' Signed-off-by: Jesper Dangaard Brouer <[email protected]> Signed-off-by: Tejun Heo <[email protected]>
1 parent 15b8b9a commit fc29e04

File tree

3 files changed

+88
-9
lines changed

3 files changed

+88
-9
lines changed

include/linux/cgroup.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -690,7 +690,7 @@ static inline void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen)
690690
void cgroup_rstat_updated(struct cgroup *cgrp, int cpu);
691691
void cgroup_rstat_flush(struct cgroup *cgrp);
692692
void cgroup_rstat_flush_hold(struct cgroup *cgrp);
693-
void cgroup_rstat_flush_release(void);
693+
void cgroup_rstat_flush_release(struct cgroup *cgrp);
694694

695695
/*
696696
* Basic resource stats.

include/trace/events/cgroup.h

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -204,6 +204,54 @@ DEFINE_EVENT(cgroup_event, cgroup_notify_frozen,
204204
TP_ARGS(cgrp, path, val)
205205
);
206206

207+
DECLARE_EVENT_CLASS(cgroup_rstat,
208+
209+
TP_PROTO(struct cgroup *cgrp, int cpu_in_loop, bool contended),
210+
211+
TP_ARGS(cgrp, cpu_in_loop, contended),
212+
213+
TP_STRUCT__entry(
214+
__field( int, root )
215+
__field( int, level )
216+
__field( u64, id )
217+
__field( int, cpu_in_loop )
218+
__field( bool, contended )
219+
),
220+
221+
TP_fast_assign(
222+
__entry->root = cgrp->root->hierarchy_id;
223+
__entry->id = cgroup_id(cgrp);
224+
__entry->level = cgrp->level;
225+
__entry->cpu_in_loop = cpu_in_loop;
226+
__entry->contended = contended;
227+
),
228+
229+
TP_printk("root=%d id=%llu level=%d cpu_in_loop=%d lock contended:%d",
230+
__entry->root, __entry->id, __entry->level,
231+
__entry->cpu_in_loop, __entry->contended)
232+
);
233+
234+
DEFINE_EVENT(cgroup_rstat, cgroup_rstat_lock_contended,
235+
236+
TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
237+
238+
TP_ARGS(cgrp, cpu, contended)
239+
);
240+
241+
DEFINE_EVENT(cgroup_rstat, cgroup_rstat_locked,
242+
243+
TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
244+
245+
TP_ARGS(cgrp, cpu, contended)
246+
);
247+
248+
DEFINE_EVENT(cgroup_rstat, cgroup_rstat_unlock,
249+
250+
TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
251+
252+
TP_ARGS(cgrp, cpu, contended)
253+
);
254+
207255
#endif /* _TRACE_CGROUP_H */
208256

209257
/* This part must be outside protection */

kernel/cgroup/rstat.c

Lines changed: 39 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
#include <linux/btf.h>
88
#include <linux/btf_ids.h>
99

10+
#include <trace/events/cgroup.h>
11+
1012
static DEFINE_SPINLOCK(cgroup_rstat_lock);
1113
static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);
1214

@@ -222,6 +224,35 @@ __weak noinline void bpf_rstat_flush(struct cgroup *cgrp,
222224

223225
__bpf_hook_end();
224226

227+
/*
228+
* Helper functions for locking cgroup_rstat_lock.
229+
*
230+
* This makes it easier to diagnose locking issues and contention in
231+
* production environments. The parameter @cpu_in_loop indicate lock
232+
* was released and re-taken when collection data from the CPUs. The
233+
* value -1 is used when obtaining the main lock else this is the CPU
234+
* number processed last.
235+
*/
236+
static inline void __cgroup_rstat_lock(struct cgroup *cgrp, int cpu_in_loop)
237+
__acquires(&cgroup_rstat_lock)
238+
{
239+
bool contended;
240+
241+
contended = !spin_trylock_irq(&cgroup_rstat_lock);
242+
if (contended) {
243+
trace_cgroup_rstat_lock_contended(cgrp, cpu_in_loop, contended);
244+
spin_lock_irq(&cgroup_rstat_lock);
245+
}
246+
trace_cgroup_rstat_locked(cgrp, cpu_in_loop, contended);
247+
}
248+
249+
static inline void __cgroup_rstat_unlock(struct cgroup *cgrp, int cpu_in_loop)
250+
__releases(&cgroup_rstat_lock)
251+
{
252+
trace_cgroup_rstat_unlock(cgrp, cpu_in_loop, false);
253+
spin_unlock_irq(&cgroup_rstat_lock);
254+
}
255+
225256
/* see cgroup_rstat_flush() */
226257
static void cgroup_rstat_flush_locked(struct cgroup *cgrp)
227258
__releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock)
@@ -248,10 +279,10 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp)
248279

249280
/* play nice and yield if necessary */
250281
if (need_resched() || spin_needbreak(&cgroup_rstat_lock)) {
251-
spin_unlock_irq(&cgroup_rstat_lock);
282+
__cgroup_rstat_unlock(cgrp, cpu);
252283
if (!cond_resched())
253284
cpu_relax();
254-
spin_lock_irq(&cgroup_rstat_lock);
285+
__cgroup_rstat_lock(cgrp, cpu);
255286
}
256287
}
257288
}
@@ -273,9 +304,9 @@ __bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp)
273304
{
274305
might_sleep();
275306

276-
spin_lock_irq(&cgroup_rstat_lock);
307+
__cgroup_rstat_lock(cgrp, -1);
277308
cgroup_rstat_flush_locked(cgrp);
278-
spin_unlock_irq(&cgroup_rstat_lock);
309+
__cgroup_rstat_unlock(cgrp, -1);
279310
}
280311

281312
/**
@@ -291,17 +322,17 @@ void cgroup_rstat_flush_hold(struct cgroup *cgrp)
291322
__acquires(&cgroup_rstat_lock)
292323
{
293324
might_sleep();
294-
spin_lock_irq(&cgroup_rstat_lock);
325+
__cgroup_rstat_lock(cgrp, -1);
295326
cgroup_rstat_flush_locked(cgrp);
296327
}
297328

298329
/**
299330
* cgroup_rstat_flush_release - release cgroup_rstat_flush_hold()
300331
*/
301-
void cgroup_rstat_flush_release(void)
332+
void cgroup_rstat_flush_release(struct cgroup *cgrp)
302333
__releases(&cgroup_rstat_lock)
303334
{
304-
spin_unlock_irq(&cgroup_rstat_lock);
335+
__cgroup_rstat_unlock(cgrp, -1);
305336
}
306337

307338
int cgroup_rstat_init(struct cgroup *cgrp)
@@ -533,7 +564,7 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq)
533564
#ifdef CONFIG_SCHED_CORE
534565
forceidle_time = cgrp->bstat.forceidle_sum;
535566
#endif
536-
cgroup_rstat_flush_release();
567+
cgroup_rstat_flush_release(cgrp);
537568
} else {
538569
root_cgroup_cputime(&bstat);
539570
usage = bstat.cputime.sum_exec_runtime;

0 commit comments

Comments
 (0)