Skip to content

Commit 39d6d08

Browse files
beaubelgraverostedt
authored andcommitted
tracing/user_events: Use bits vs bytes for enabled status page data
User processes may require many events and when they do the cache performance of a byte index status check is less ideal than a bit index. The previous event limit per-page was 4096, the new limit is 32,768. This change adds a bitwise index to the user_reg struct. Programs check that the bit at status_bit has a bit set within the status page(s). Link: https://lkml.kernel.org/r/[email protected] Link: https://lore.kernel.org/all/[email protected]/ Suggested-by: Mathieu Desnoyers <[email protected]> Signed-off-by: Beau Belgrave <[email protected]> Signed-off-by: Steven Rostedt (Google) <[email protected]>
1 parent d401b72 commit 39d6d08

File tree

5 files changed

+135
-38
lines changed

5 files changed

+135
-38
lines changed

include/linux/user_events.h

Lines changed: 3 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -20,15 +20,6 @@
2020
#define USER_EVENTS_SYSTEM "user_events"
2121
#define USER_EVENTS_PREFIX "u:"
2222

23-
/* Bits 0-6 are for known probe types, Bit 7 is for unknown probes */
24-
#define EVENT_BIT_FTRACE 0
25-
#define EVENT_BIT_PERF 1
26-
#define EVENT_BIT_OTHER 7
27-
28-
#define EVENT_STATUS_FTRACE (1 << EVENT_BIT_FTRACE)
29-
#define EVENT_STATUS_PERF (1 << EVENT_BIT_PERF)
30-
#define EVENT_STATUS_OTHER (1 << EVENT_BIT_OTHER)
31-
3223
/* Create dynamic location entry within a 32-bit value */
3324
#define DYN_LOC(offset, size) ((size) << 16 | (offset))
3425

@@ -45,12 +36,12 @@ struct user_reg {
4536
/* Input: Pointer to string with event name, description and flags */
4637
__u64 name_args;
4738

48-
/* Output: Byte index of the event within the status page */
49-
__u32 status_index;
39+
/* Output: Bitwise index of the event within the status page */
40+
__u32 status_bit;
5041

5142
/* Output: Index of the event to use when writing data */
5243
__u32 write_index;
53-
};
44+
} __attribute__((__packed__));
5445

5546
#define DIAG_IOC_MAGIC '*'
5647

kernel/trace/trace_events_user.c

Lines changed: 67 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -40,17 +40,44 @@
4040
*/
4141
#define MAX_PAGE_ORDER 0
4242
#define MAX_PAGES (1 << MAX_PAGE_ORDER)
43-
#define MAX_EVENTS (MAX_PAGES * PAGE_SIZE)
43+
#define MAX_BYTES (MAX_PAGES * PAGE_SIZE)
44+
#define MAX_EVENTS (MAX_BYTES * 8)
4445

4546
/* Limit how long of an event name plus args within the subsystem. */
4647
#define MAX_EVENT_DESC 512
4748
#define EVENT_NAME(user_event) ((user_event)->tracepoint.name)
4849
#define MAX_FIELD_ARRAY_SIZE 1024
4950

51+
/*
52+
* The MAP_STATUS_* macros are used for taking a index and determining the
53+
* appropriate byte and the bit in the byte to set/reset for an event.
54+
*
55+
* The lower 3 bits of the index decide which bit to set.
56+
* The remaining upper bits of the index decide which byte to use for the bit.
57+
*
58+
* This is used when an event has a probe attached/removed to reflect live
59+
* status of the event wanting tracing or not to user-programs via shared
60+
* memory maps.
61+
*/
62+
#define MAP_STATUS_BYTE(index) ((index) >> 3)
63+
#define MAP_STATUS_MASK(index) BIT((index) & 7)
64+
65+
/*
66+
* Internal bits (kernel side only) to keep track of connected probes:
67+
* These are used when status is requested in text form about an event. These
68+
* bits are compared against an internal byte on the event to determine which
69+
* probes to print out to the user.
70+
*
71+
* These do not reflect the mapped bytes between the user and kernel space.
72+
*/
73+
#define EVENT_STATUS_FTRACE BIT(0)
74+
#define EVENT_STATUS_PERF BIT(1)
75+
#define EVENT_STATUS_OTHER BIT(7)
76+
5077
static char *register_page_data;
5178

5279
static DEFINE_MUTEX(reg_mutex);
53-
static DEFINE_HASHTABLE(register_table, 4);
80+
static DEFINE_HASHTABLE(register_table, 8);
5481
static DECLARE_BITMAP(page_bitmap, MAX_EVENTS);
5582

5683
/*
@@ -72,6 +99,7 @@ struct user_event {
7299
int index;
73100
int flags;
74101
int min_size;
102+
char status;
75103
};
76104

77105
/*
@@ -106,6 +134,22 @@ static u32 user_event_key(char *name)
106134
return jhash(name, strlen(name), 0);
107135
}
108136

137+
static __always_inline
138+
void user_event_register_set(struct user_event *user)
139+
{
140+
int i = user->index;
141+
142+
register_page_data[MAP_STATUS_BYTE(i)] |= MAP_STATUS_MASK(i);
143+
}
144+
145+
static __always_inline
146+
void user_event_register_clear(struct user_event *user)
147+
{
148+
int i = user->index;
149+
150+
register_page_data[MAP_STATUS_BYTE(i)] &= ~MAP_STATUS_MASK(i);
151+
}
152+
109153
static __always_inline __must_check
110154
bool user_event_last_ref(struct user_event *user)
111155
{
@@ -648,7 +692,7 @@ static int destroy_user_event(struct user_event *user)
648692

649693
dyn_event_remove(&user->devent);
650694

651-
register_page_data[user->index] = 0;
695+
user_event_register_clear(user);
652696
clear_bit(user->index, page_bitmap);
653697
hash_del(&user->node);
654698

@@ -827,7 +871,12 @@ static void update_reg_page_for(struct user_event *user)
827871
rcu_read_unlock_sched();
828872
}
829873

830-
register_page_data[user->index] = status;
874+
if (status)
875+
user_event_register_set(user);
876+
else
877+
user_event_register_clear(user);
878+
879+
user->status = status;
831880
}
832881

833882
/*
@@ -1332,7 +1381,17 @@ static long user_reg_get(struct user_reg __user *ureg, struct user_reg *kreg)
13321381
if (size > PAGE_SIZE)
13331382
return -E2BIG;
13341383

1335-
return copy_struct_from_user(kreg, sizeof(*kreg), ureg, size);
1384+
if (size < offsetofend(struct user_reg, write_index))
1385+
return -EINVAL;
1386+
1387+
ret = copy_struct_from_user(kreg, sizeof(*kreg), ureg, size);
1388+
1389+
if (ret)
1390+
return ret;
1391+
1392+
kreg->size = size;
1393+
1394+
return 0;
13361395
}
13371396

13381397
/*
@@ -1376,7 +1435,7 @@ static long user_events_ioctl_reg(struct file *file, unsigned long uarg)
13761435
return ret;
13771436

13781437
put_user((u32)ret, &ureg->write_index);
1379-
put_user(user->index, &ureg->status_index);
1438+
put_user(user->index, &ureg->status_bit);
13801439

13811440
return 0;
13821441
}
@@ -1485,7 +1544,7 @@ static int user_status_mmap(struct file *file, struct vm_area_struct *vma)
14851544
{
14861545
unsigned long size = vma->vm_end - vma->vm_start;
14871546

1488-
if (size != MAX_EVENTS)
1547+
if (size != MAX_BYTES)
14891548
return -EINVAL;
14901549

14911550
return remap_pfn_range(vma, vma->vm_start,
@@ -1520,7 +1579,7 @@ static int user_seq_show(struct seq_file *m, void *p)
15201579
mutex_lock(&reg_mutex);
15211580

15221581
hash_for_each(register_table, i, user, node) {
1523-
status = register_page_data[user->index];
1582+
status = user->status;
15241583
flags = user->flags;
15251584

15261585
seq_printf(m, "%d:%s", user->index, EVENT_NAME(user));

samples/user_events/example.c

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -12,13 +12,21 @@
1212
#include <fcntl.h>
1313
#include <stdio.h>
1414
#include <unistd.h>
15+
#include <asm/bitsperlong.h>
16+
#include <endian.h>
1517
#include <linux/user_events.h>
1618

19+
#if __BITS_PER_LONG == 64
20+
#define endian_swap(x) htole64(x)
21+
#else
22+
#define endian_swap(x) htole32(x)
23+
#endif
24+
1725
/* Assumes debugfs is mounted */
1826
const char *data_file = "/sys/kernel/debug/tracing/user_events_data";
1927
const char *status_file = "/sys/kernel/debug/tracing/user_events_status";
2028

21-
static int event_status(char **status)
29+
static int event_status(long **status)
2230
{
2331
int fd = open(status_file, O_RDONLY);
2432

@@ -33,7 +41,8 @@ static int event_status(char **status)
3341
return 0;
3442
}
3543

36-
static int event_reg(int fd, const char *command, int *status, int *write)
44+
static int event_reg(int fd, const char *command, long *index, long *mask,
45+
int *write)
3746
{
3847
struct user_reg reg = {0};
3948

@@ -43,16 +52,18 @@ static int event_reg(int fd, const char *command, int *status, int *write)
4352
if (ioctl(fd, DIAG_IOCSREG, &reg) == -1)
4453
return -1;
4554

46-
*status = reg.status_index;
55+
*index = reg.status_bit / __BITS_PER_LONG;
56+
*mask = endian_swap(1L << (reg.status_bit % __BITS_PER_LONG));
4757
*write = reg.write_index;
4858

4959
return 0;
5060
}
5161

5262
int main(int argc, char **argv)
5363
{
54-
int data_fd, status, write;
55-
char *status_page;
64+
int data_fd, write;
65+
long index, mask;
66+
long *status_page;
5667
struct iovec io[2];
5768
__u32 count = 0;
5869

@@ -61,7 +72,7 @@ int main(int argc, char **argv)
6172

6273
data_fd = open(data_file, O_RDWR);
6374

64-
if (event_reg(data_fd, "test u32 count", &status, &write) == -1)
75+
if (event_reg(data_fd, "test u32 count", &index, &mask, &write) == -1)
6576
return errno;
6677

6778
/* Setup iovec */
@@ -75,7 +86,7 @@ int main(int argc, char **argv)
7586
getchar();
7687

7788
/* Check if anyone is listening */
78-
if (status_page[status]) {
89+
if (status_page[index] & mask) {
7990
/* Yep, trace out our data */
8091
writev(data_fd, (const struct iovec *)io, 2);
8192

tools/testing/selftests/user_events/ftrace_test.c

Lines changed: 39 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,11 @@ const char *enable_file = "/sys/kernel/debug/tracing/events/user_events/__test_e
2222
const char *trace_file = "/sys/kernel/debug/tracing/trace";
2323
const char *fmt_file = "/sys/kernel/debug/tracing/events/user_events/__test_event/format";
2424

25+
static inline int status_check(char *status_page, int status_bit)
26+
{
27+
return status_page[status_bit >> 3] & (1 << (status_bit & 7));
28+
}
29+
2530
static int trace_bytes(void)
2631
{
2732
int fd = open(trace_file, O_RDONLY);
@@ -197,12 +202,12 @@ TEST_F(user, register_events) {
197202
/* Register should work */
198203
ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, &reg));
199204
ASSERT_EQ(0, reg.write_index);
200-
ASSERT_NE(0, reg.status_index);
205+
ASSERT_NE(0, reg.status_bit);
201206

202207
/* Multiple registers should result in same index */
203208
ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, &reg));
204209
ASSERT_EQ(0, reg.write_index);
205-
ASSERT_NE(0, reg.status_index);
210+
ASSERT_NE(0, reg.status_bit);
206211

207212
/* Ensure disabled */
208213
self->enable_fd = open(enable_file, O_RDWR);
@@ -212,15 +217,15 @@ TEST_F(user, register_events) {
212217
/* MMAP should work and be zero'd */
213218
ASSERT_NE(MAP_FAILED, status_page);
214219
ASSERT_NE(NULL, status_page);
215-
ASSERT_EQ(0, status_page[reg.status_index]);
220+
ASSERT_EQ(0, status_check(status_page, reg.status_bit));
216221

217222
/* Enable event and ensure bits updated in status */
218223
ASSERT_NE(-1, write(self->enable_fd, "1", sizeof("1")))
219-
ASSERT_EQ(EVENT_STATUS_FTRACE, status_page[reg.status_index]);
224+
ASSERT_NE(0, status_check(status_page, reg.status_bit));
220225

221226
/* Disable event and ensure bits updated in status */
222227
ASSERT_NE(-1, write(self->enable_fd, "0", sizeof("0")))
223-
ASSERT_EQ(0, status_page[reg.status_index]);
228+
ASSERT_EQ(0, status_check(status_page, reg.status_bit));
224229

225230
/* File still open should return -EBUSY for delete */
226231
ASSERT_EQ(-1, ioctl(self->data_fd, DIAG_IOCSDEL, "__test_event"));
@@ -240,6 +245,8 @@ TEST_F(user, write_events) {
240245
struct iovec io[3];
241246
__u32 field1, field2;
242247
int before = 0, after = 0;
248+
int page_size = sysconf(_SC_PAGESIZE);
249+
char *status_page;
243250

244251
reg.size = sizeof(reg);
245252
reg.name_args = (__u64)"__test_event u32 field1; u32 field2";
@@ -254,10 +261,18 @@ TEST_F(user, write_events) {
254261
io[2].iov_base = &field2;
255262
io[2].iov_len = sizeof(field2);
256263

264+
status_page = mmap(NULL, page_size, PROT_READ, MAP_SHARED,
265+
self->status_fd, 0);
266+
257267
/* Register should work */
258268
ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, &reg));
259269
ASSERT_EQ(0, reg.write_index);
260-
ASSERT_NE(0, reg.status_index);
270+
ASSERT_NE(0, reg.status_bit);
271+
272+
/* MMAP should work and be zero'd */
273+
ASSERT_NE(MAP_FAILED, status_page);
274+
ASSERT_NE(NULL, status_page);
275+
ASSERT_EQ(0, status_check(status_page, reg.status_bit));
261276

262277
/* Write should fail on invalid slot with ENOENT */
263278
io[0].iov_base = &field2;
@@ -271,6 +286,9 @@ TEST_F(user, write_events) {
271286
self->enable_fd = open(enable_file, O_RDWR);
272287
ASSERT_NE(-1, write(self->enable_fd, "1", sizeof("1")))
273288

289+
/* Event should now be enabled */
290+
ASSERT_NE(0, status_check(status_page, reg.status_bit));
291+
274292
/* Write should make it out to ftrace buffers */
275293
before = trace_bytes();
276294
ASSERT_NE(-1, writev(self->data_fd, (const struct iovec *)io, 3));
@@ -298,7 +316,7 @@ TEST_F(user, write_fault) {
298316
/* Register should work */
299317
ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, &reg));
300318
ASSERT_EQ(0, reg.write_index);
301-
ASSERT_NE(0, reg.status_index);
319+
ASSERT_NE(0, reg.status_bit);
302320

303321
/* Write should work normally */
304322
ASSERT_NE(-1, writev(self->data_fd, (const struct iovec *)io, 2));
@@ -315,14 +333,24 @@ TEST_F(user, write_validator) {
315333
int loc, bytes;
316334
char data[8];
317335
int before = 0, after = 0;
336+
int page_size = sysconf(_SC_PAGESIZE);
337+
char *status_page;
338+
339+
status_page = mmap(NULL, page_size, PROT_READ, MAP_SHARED,
340+
self->status_fd, 0);
318341

319342
reg.size = sizeof(reg);
320343
reg.name_args = (__u64)"__test_event __rel_loc char[] data";
321344

322345
/* Register should work */
323346
ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, &reg));
324347
ASSERT_EQ(0, reg.write_index);
325-
ASSERT_NE(0, reg.status_index);
348+
ASSERT_NE(0, reg.status_bit);
349+
350+
/* MMAP should work and be zero'd */
351+
ASSERT_NE(MAP_FAILED, status_page);
352+
ASSERT_NE(NULL, status_page);
353+
ASSERT_EQ(0, status_check(status_page, reg.status_bit));
326354

327355
io[0].iov_base = &reg.write_index;
328356
io[0].iov_len = sizeof(reg.write_index);
@@ -340,6 +368,9 @@ TEST_F(user, write_validator) {
340368
self->enable_fd = open(enable_file, O_RDWR);
341369
ASSERT_NE(-1, write(self->enable_fd, "1", sizeof("1")))
342370

371+
/* Event should now be enabled */
372+
ASSERT_NE(0, status_check(status_page, reg.status_bit));
373+
343374
/* Full in-bounds write should work */
344375
before = trace_bytes();
345376
loc = DYN_LOC(0, bytes);

0 commit comments

Comments
 (0)