1
1
// SPDX-License-Identifier: GPL-2.0
2
2
/*
3
- * delaytop.c - task delay monitoring tool.
3
+ * delaytop.c - system-wide delay monitoring tool.
4
4
*
5
5
* This tool provides real-time monitoring and statistics of
6
6
* system, container, and task-level delays, including CPU,
7
- * memory, IO, and IRQ and delay accounting. It supports both
8
- * interactive (top-like), and can output delay information
9
- * for the whole system, specific containers (cgroups), or
10
- * individual tasks (PIDs).
7
+ * memory, IO, and IRQ. It supports an interactive (top-like) mode
8
+ * and can output delay information for the whole system, specific
9
+ * containers (cgroups), or individual tasks (PIDs).
11
10
*
12
11
* Key features:
13
12
* - Collects per-task delay accounting statistics via taskstats.
13
+ * - Collects system-wide PSI information.
14
14
* - Supports sorting, filtering.
15
15
* - Supports an interactive mode (screen refresh).
16
16
*
32
32
#include <time.h>
33
33
#include <dirent.h>
34
34
#include <ctype.h>
35
+ #include <stdbool.h>
35
36
#include <sys/types.h>
36
37
#include <sys/stat.h>
37
38
#include <sys/socket.h>
41
42
#include <linux/genetlink.h>
42
43
#include <linux/taskstats.h>
43
44
#include <linux/cgroupstats.h>
44
- #include <ncurses.h>
45
45
46
46
#define PSI_CPU_SOME "/proc/pressure/cpu"
47
47
#define PSI_CPU_FULL "/proc/pressure/cpu"
62
62
#define MAX_MSG_SIZE 1024
63
63
#define MAX_TASKS 1000
64
64
/*
 * SET_TASK_STAT - copy stats.<field> into tasks[task_count].<field>.
 *
 * Relies on objects named `tasks` and `stats` being in scope at the
 * expansion site. The expansion is parenthesized so it behaves as a
 * single expression wherever it is used.
 */
#define SET_TASK_STAT(task_count, field) \
	(tasks[(task_count)].field = stats.field)

/*
 * BOOL_FPRINT - fprintf() wrapper that evaluates to a truth value.
 *
 * The first variadic argument is the format string. Evaluates to nonzero
 * when fprintf() returns a non-negative value and to 0 otherwise, so
 * results can be accumulated with `ok &= BOOL_FPRINT(...)`.
 *
 * Written as a plain expression: it declares no temporary, so an argument
 * that happens to be called `ret` at the call site is passed through
 * unchanged, and it needs no statement-expression or `##__VA_ARGS__`
 * compiler extension.
 */
#define BOOL_FPRINT(stream, ...) \
	(fprintf((stream), __VA_ARGS__) >= 0)

/*
 * One PSI output line: a left-aligned label padded to 12 columns, three
 * percentages printed with one decimal, and an unsigned long long total
 * followed by the literal "(ms)".
 */
#define PSI_LINE_FORMAT "%-12s %6.1f%%/%6.1f%%/%6.1f%%/%8llu(ms)\n"
65
71
66
72
/* Program settings structure */
67
73
struct config {
@@ -262,6 +268,7 @@ static int create_nl_socket(void)
262
268
local .nl_family = AF_NETLINK ;
263
269
264
270
if (bind (fd , (struct sockaddr * ) & local , sizeof (local )) < 0 ) {
271
+ fprintf (stderr , "Failed to bind socket when create nl_socket\n" );
265
272
close (fd );
266
273
return -1 ;
267
274
}
@@ -332,13 +339,17 @@ static int get_family_id(int sd)
332
339
rc = send_cmd (sd , GENL_ID_CTRL , getpid (), CTRL_CMD_GETFAMILY ,
333
340
CTRL_ATTR_FAMILY_NAME , (void * )name ,
334
341
strlen (TASKSTATS_GENL_NAME )+ 1 );
335
- if (rc < 0 )
342
+ if (rc < 0 ) {
343
+ fprintf (stderr , "Failed to send cmd for family id\n" );
336
344
return 0 ;
345
+ }
337
346
338
347
rep_len = recv (sd , & ans , sizeof (ans ), 0 );
339
348
if (ans .n .nlmsg_type == NLMSG_ERROR ||
340
- (rep_len < 0 ) || !NLMSG_OK ((& ans .n ), rep_len ))
349
+ (rep_len < 0 ) || !NLMSG_OK ((& ans .n ), rep_len )) {
350
+ fprintf (stderr , "Failed to receive response for family id\n" );
341
351
return 0 ;
352
+ }
342
353
343
354
na = (struct nlattr * ) GENLMSG_DATA (& ans );
344
355
na = (struct nlattr * ) ((char * ) na + NLA_ALIGN (na -> nla_len ));
@@ -433,26 +444,30 @@ static void read_psi_stats(void)
433
444
/*
 * read_comm - read a task's command name from /proc/<pid>/comm.
 * @pid:      task ID whose comm file is read
 * @comm_buf: destination buffer; on success holds the first line of the
 *            comm file with its trailing newline removed
 * @buf_size: size of @comm_buf in bytes
 *
 * Returns 0 on success. Returns -1 when @comm_buf is NULL, @buf_size is 0,
 * the comm file cannot be opened, or nothing can be read from it. Whenever
 * the buffer is usable, a failure leaves it holding an empty string so the
 * caller never sees stale or indeterminate contents.
 */
static int read_comm(int pid, char *comm_buf, size_t buf_size)
{
	char path[64];
	int ret = -1;
	FILE *fp;

	if (!comm_buf || buf_size == 0)
		return ret;

	/* Start from a defined state for every failure path below */
	comm_buf[0] = '\0';

	snprintf(path, sizeof(path), "/proc/%d/comm", pid);
	fp = fopen(path, "r");
	if (!fp) {
		fprintf(stderr, "Failed to open comm file /proc/%d/comm\n", pid);
		return ret;
	}

	if (fgets(comm_buf, (int)buf_size, fp)) {
		/* Cut the line at the newline, if one was read */
		comm_buf[strcspn(comm_buf, "\n")] = '\0';
		ret = 0;
	} else {
		/* A read error may leave partial data behind; discard it */
		comm_buf[0] = '\0';
	}

	fclose(fp);

	return ret;
}
454
469
455
- static int fetch_and_fill_task_info (int pid , const char * comm )
470
+ static void fetch_and_fill_task_info (int pid , const char * comm )
456
471
{
457
472
struct {
458
473
struct nlmsghdr n ;
@@ -466,13 +481,21 @@ static int fetch_and_fill_task_info(int pid, const char *comm)
466
481
int nl_len ;
467
482
int rc ;
468
483
484
+ /* Send request for task stats */
469
485
if (send_cmd (nl_sd , family_id , getpid (), TASKSTATS_CMD_GET ,
470
486
TASKSTATS_CMD_ATTR_PID , & pid , sizeof (pid )) < 0 ) {
471
- return -1 ;
487
+ fprintf (stderr , "Failed to send request for task stats\n" );
488
+ return ;
472
489
}
490
+
491
+ /* Receive response */
473
492
rc = recv (nl_sd , & resp , sizeof (resp ), 0 );
474
- if (rc < 0 || resp .n .nlmsg_type == NLMSG_ERROR )
475
- return -1 ;
493
+ if (rc < 0 || resp .n .nlmsg_type == NLMSG_ERROR ) {
494
+ fprintf (stderr , "Failed to receive response for task stats\n" );
495
+ return ;
496
+ }
497
+
498
+ /* Parse response */
476
499
nl_len = GENLMSG_PAYLOAD (& resp .n );
477
500
na = (struct nlattr * ) GENLMSG_DATA (& resp );
478
501
while (nl_len > 0 ) {
@@ -515,7 +538,7 @@ static int fetch_and_fill_task_info(int pid, const char *comm)
515
538
nl_len -= NLA_ALIGN (na -> nla_len );
516
539
na = NLA_NEXT (na );
517
540
}
518
- return 0 ;
541
+ return ;
519
542
}
520
543
521
544
static void get_task_delays (void )
@@ -654,54 +677,82 @@ static void display_results(void)
654
677
{
655
678
time_t now = time (NULL );
656
679
struct tm * tm_now = localtime (& now );
680
+ FILE * out = stdout ;
657
681
char timestamp [32 ];
682
+ bool suc = true;
658
683
int i , count ;
659
- FILE * out = stdout ;
660
684
661
- fprintf (out , "\033[H\033[J" );
685
+ /* Clear terminal screen */
686
+ suc &= BOOL_FPRINT (out , "\033[H\033[J" );
687
+
662
688
/* PSI output (one-line, no cat style) */
663
- fprintf (out , "System Pressure Information: " );
664
- fprintf (out , "(avg10/avg60/avg300/total)\n" );
665
- fprintf (out , "CPU:" );
666
- fprintf (out , " full: %6.1f%%/%6.1f%%/%6.1f%%/%-10llu" , psi .cpu_full_avg10 ,
667
- psi .cpu_full_avg60 , psi .cpu_full_avg300 , psi .cpu_full_total );
668
- fprintf (out , " some: %6.1f%%/%6.1f%%/%6.1f%%/%-10llu\n" , psi .cpu_some_avg10 ,
669
- psi .cpu_some_avg60 , psi .cpu_some_avg300 , psi .cpu_some_total );
670
-
671
- fprintf (out , "Memory:" );
672
- fprintf (out , " full: %6.1f%%/%6.1f%%/%6.1f%%/%-10llu" , psi .memory_full_avg10 ,
673
- psi .memory_full_avg60 , psi .memory_full_avg300 , psi .memory_full_total );
674
- fprintf (out , " some: %6.1f%%/%6.1f%%/%6.1f%%/%-10llu\n" , psi .memory_some_avg10 ,
675
- psi .memory_some_avg60 , psi .memory_some_avg300 , psi .memory_some_total );
676
-
677
- fprintf (out , "IO:" );
678
- fprintf (out , " full: %6.1f%%/%6.1f%%/%6.1f%%/%-10llu" , psi .io_full_avg10 ,
679
- psi .io_full_avg60 , psi .io_full_avg300 , psi .io_full_total );
680
- fprintf (out , " some: %6.1f%%/%6.1f%%/%6.1f%%/%-10llu\n" , psi .io_some_avg10 ,
681
- psi .io_some_avg60 , psi .io_some_avg300 , psi .io_some_total );
682
- fprintf (out , "IRQ:" );
683
- fprintf (out , " full: %6.1f%%/%6.1f%%/%6.1f%%/%-10llu\n\n" , psi .irq_full_avg10 ,
684
- psi .irq_full_avg60 , psi .irq_full_avg300 , psi .irq_full_total );
689
+ suc &= BOOL_FPRINT (out , "System Pressure Information: (avg10/avg60/avg300/total)\n" );
690
+ suc &= BOOL_FPRINT (out , PSI_LINE_FORMAT ,
691
+ "CPU some:" ,
692
+ psi .cpu_some_avg10 ,
693
+ psi .cpu_some_avg60 ,
694
+ psi .cpu_some_avg300 ,
695
+ psi .cpu_some_total / 1000 );
696
+ suc &= BOOL_FPRINT (out , PSI_LINE_FORMAT ,
697
+ "CPU full:" ,
698
+ psi .cpu_full_avg10 ,
699
+ psi .cpu_full_avg60 ,
700
+ psi .cpu_full_avg300 ,
701
+ psi .cpu_full_total / 1000 );
702
+ suc &= BOOL_FPRINT (out , PSI_LINE_FORMAT ,
703
+ "Memory full:" ,
704
+ psi .memory_full_avg10 ,
705
+ psi .memory_full_avg60 ,
706
+ psi .memory_full_avg300 ,
707
+ psi .memory_full_total / 1000 );
708
+ suc &= BOOL_FPRINT (out , PSI_LINE_FORMAT ,
709
+ "Memory some:" ,
710
+ psi .memory_some_avg10 ,
711
+ psi .memory_some_avg60 ,
712
+ psi .memory_some_avg300 ,
713
+ psi .memory_some_total / 1000 );
714
+ suc &= BOOL_FPRINT (out , PSI_LINE_FORMAT ,
715
+ "IO full:" ,
716
+ psi .io_full_avg10 ,
717
+ psi .io_full_avg60 ,
718
+ psi .io_full_avg300 ,
719
+ psi .io_full_total / 1000 );
720
+ suc &= BOOL_FPRINT (out , PSI_LINE_FORMAT ,
721
+ "IO some:" ,
722
+ psi .io_some_avg10 ,
723
+ psi .io_some_avg60 ,
724
+ psi .io_some_avg300 ,
725
+ psi .io_some_total / 1000 );
726
+ suc &= BOOL_FPRINT (out , PSI_LINE_FORMAT ,
727
+ "IRQ full:" ,
728
+ psi .irq_full_avg10 ,
729
+ psi .irq_full_avg60 ,
730
+ psi .irq_full_avg300 ,
731
+ psi .irq_full_total / 1000 );
732
+
685
733
if (cfg .container_path ) {
686
- fprintf (out , "Container Information (%s):\n" , cfg .container_path );
687
- fprintf (out , "Processes: running=%d, sleeping=%d, " ,
734
+ suc &= BOOL_FPRINT (out , "Container Information (%s):\n" , cfg .container_path );
735
+ suc &= BOOL_FPRINT (out , "Processes: running=%d, sleeping=%d, " ,
688
736
container_stats .nr_running , container_stats .nr_sleeping );
689
- fprintf (out , "stopped=%d, uninterruptible=%d, io_wait=%d\n\n" ,
737
+ suc &= BOOL_FPRINT (out , "stopped=%d, uninterruptible=%d, io_wait=%d\n\n" ,
690
738
container_stats .nr_stopped , container_stats .nr_uninterruptible ,
691
739
container_stats .nr_io_wait );
692
740
}
693
- fprintf (out , "Top %d processes (sorted by CPU delay):\n \n" ,
741
+ suc &= BOOL_FPRINT (out , "Top %d processes (sorted by CPU delay):\n" ,
694
742
cfg .max_processes );
695
- fprintf (out , " PID TGID COMMAND CPU(ms) IO(ms) " );
696
- fprintf (out , "SWAP(ms) RCL(ms) THR(ms) CMP(ms) WP(ms) IRQ(ms)\n" );
697
- fprintf (out , "-----------------------------------------------" );
698
- fprintf (out , "----------------------------------------------\n" );
743
+ suc &= BOOL_FPRINT (out , "%5s %5s %-17s" , "PID" , "TGID" , "COMMAND" );
744
+ suc &= BOOL_FPRINT (out , "%7s %7s %7s %7s %7s %7s %7s %7s\n" ,
745
+ "CPU(ms)" , "IO(ms)" , "SWAP(ms)" , "RCL(ms)" ,
746
+ "THR(ms)" , "CMP(ms)" , "WP(ms)" , "IRQ(ms)" );
747
+
748
+ suc &= BOOL_FPRINT (out , "-----------------------------------------------" );
749
+ suc &= BOOL_FPRINT (out , "----------------------------------------------\n" );
699
750
count = task_count < cfg .max_processes ? task_count : cfg .max_processes ;
700
751
701
752
for (i = 0 ; i < count ; i ++ ) {
702
- fprintf (out , "%5d %5d %-15s " ,
753
+ suc &= BOOL_FPRINT (out , "%5d %5d %-15s" ,
703
754
tasks [i ].pid , tasks [i ].tgid , tasks [i ].command );
704
- fprintf (out , "%7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f\n" ,
755
+ suc &= BOOL_FPRINT (out , "%7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f\n" ,
705
756
average_ms (tasks [i ].cpu_delay_total , tasks [i ].cpu_count ),
706
757
average_ms (tasks [i ].blkio_delay_total , tasks [i ].blkio_count ),
707
758
average_ms (tasks [i ].swapin_delay_total , tasks [i ].swapin_count ),
@@ -712,7 +763,10 @@ static void display_results(void)
712
763
average_ms (tasks [i ].irq_delay_total , tasks [i ].irq_count ));
713
764
}
714
765
715
- fprintf (out , "\n" );
766
+ suc &= BOOL_FPRINT (out , "\n" );
767
+
768
+ if (!suc )
769
+ perror ("Error writing to output" );
716
770
}
717
771
718
772
/* Main function */
0 commit comments