
Commit 7aa7d50

Merge patch series "riscv: Allow userspace to directly access perf counters"
Alexandre Ghiti <[email protected]> says:

riscv used to allow direct access to cycle/time/instret counters, bypassing
the perf framework; this patchset intends to allow the user to mmap any
counter when accessed through perf.

**Important**: The default mode is now user access through perf only, not the
legacy mode, so some applications will break. However, we introduce a sysctl
perf_user_access like arm64 does, which allows switching back to the legacy
mode described above.

This version needs openSBI v1.3 *and* a kernel fix that went upstream lately
(https://lore.kernel.org/lkml/[email protected]/T/).

* b4-shazam-merge:
  perf: tests: Adapt mmap-basic.c for riscv
  tools: lib: perf: Implement riscv mmap support
  Documentation: admin-guide: Add riscv sysctl_perf_user_access
  drivers: perf: Implement perf event mmap support in the SBI backend
  drivers: perf: Implement perf event mmap support in the legacy backend
  riscv: Prepare for user-space perf event mmap support
  drivers: perf: Rename riscv pmu sbi driver
  riscv: Make legacy counter enum match the HW numbering
  include: riscv: Fix wrong include guard in riscv_pmu.h
  perf: Fix wrong comment about default event_idx

Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Palmer Dabbelt <[email protected]>
2 parents 9f944d2 + 26ba042 commit 7aa7d50

8 files changed: 431 additions, 20 deletions


Documentation/admin-guide/sysctl/kernel.rst

Lines changed: 23 additions & 4 deletions
@@ -941,16 +941,35 @@ enabled, otherwise writing to this file will return ``-EBUSY``.
 The default value is 8.
 
 
-perf_user_access (arm64 only)
-=================================
+perf_user_access (arm64 and riscv only)
+=======================================
+
+Controls user space access for reading perf event counters.
 
-Controls user space access for reading perf event counters. When set to 1,
-user space can read performance monitor counter registers directly.
+arm64
+=====
 
 The default value is 0 (access disabled).
 
+When set to 1, user space can read performance monitor counter registers
+directly.
+
 See Documentation/arch/arm64/perf.rst for more information.
 
+riscv
+=====
+
+When set to 0, user space access is disabled.
+
+The default value is 1, user space can read performance monitor counter
+registers through perf, any direct access without perf intervention will trigger
+an illegal instruction.
+
+When set to 2, which enables legacy mode (user space has direct access to cycle
+and insret CSRs only). Note that this legacy value is deprecated and will be
+removed once all user space applications are fixed.
+
+Note that the time CSR is always directly accessible to all modes.
 
 pid_max
 =======
drivers/perf/riscv_pmu.c

Lines changed: 113 additions & 0 deletions
@@ -14,9 +14,81 @@
 #include <linux/perf/riscv_pmu.h>
 #include <linux/printk.h>
 #include <linux/smp.h>
+#include <linux/sched_clock.h>
 
 #include <asm/sbi.h>
 
+static bool riscv_perf_user_access(struct perf_event *event)
+{
+        return ((event->attr.type == PERF_TYPE_HARDWARE) ||
+                (event->attr.type == PERF_TYPE_HW_CACHE) ||
+                (event->attr.type == PERF_TYPE_RAW)) &&
+               !!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT);
+}
+
+void arch_perf_update_userpage(struct perf_event *event,
+                               struct perf_event_mmap_page *userpg, u64 now)
+{
+        struct clock_read_data *rd;
+        unsigned int seq;
+        u64 ns;
+
+        userpg->cap_user_time = 0;
+        userpg->cap_user_time_zero = 0;
+        userpg->cap_user_time_short = 0;
+        userpg->cap_user_rdpmc = riscv_perf_user_access(event);
+
+#ifdef CONFIG_RISCV_PMU
+        /*
+         * The counters are 64-bit but the priv spec doesn't mandate all the
+         * bits to be implemented: that's why, counter width can vary based on
+         * the cpu vendor.
+         */
+        if (userpg->cap_user_rdpmc)
+                userpg->pmc_width = to_riscv_pmu(event->pmu)->ctr_get_width(event->hw.idx) + 1;
+#endif
+
+        do {
+                rd = sched_clock_read_begin(&seq);
+
+                userpg->time_mult = rd->mult;
+                userpg->time_shift = rd->shift;
+                userpg->time_zero = rd->epoch_ns;
+                userpg->time_cycles = rd->epoch_cyc;
+                userpg->time_mask = rd->sched_clock_mask;
+
+                /*
+                 * Subtract the cycle base, such that software that
+                 * doesn't know about cap_user_time_short still 'works'
+                 * assuming no wraps.
+                 */
+                ns = mul_u64_u32_shr(rd->epoch_cyc, rd->mult, rd->shift);
+                userpg->time_zero -= ns;
+
+        } while (sched_clock_read_retry(seq));
+
+        userpg->time_offset = userpg->time_zero - now;
+
+        /*
+         * time_shift is not expected to be greater than 31 due to
+         * the original published conversion algorithm shifting a
+         * 32-bit value (now specifies a 64-bit value) - refer
+         * perf_event_mmap_page documentation in perf_event.h.
+         */
+        if (userpg->time_shift == 32) {
+                userpg->time_shift = 31;
+                userpg->time_mult >>= 1;
+        }
+
+        /*
+         * Internal timekeeping for enabled/running/stopped times
+         * is always computed with the sched_clock.
+         */
+        userpg->cap_user_time = 1;
+        userpg->cap_user_time_zero = 1;
+        userpg->cap_user_time_short = 1;
+}
+
 static unsigned long csr_read_num(int csr_num)
 {
 #define switchcase_csr_read(__csr_num, __val) {\
@@ -171,6 +243,8 @@ int riscv_pmu_event_set_period(struct perf_event *event)
 
         local64_set(&hwc->prev_count, (u64)-left);
 
+        perf_event_update_userpage(event);
+
         return overflow;
 }
 
@@ -267,6 +341,9 @@ static int riscv_pmu_event_init(struct perf_event *event)
         hwc->idx = -1;
         hwc->event_base = mapped_event;
 
+        if (rvpmu->event_init)
+                rvpmu->event_init(event);
+
         if (!is_sampling_event(event)) {
                 /*
                  * For non-sampling runs, limit the sample_period to half
@@ -283,6 +360,39 @@ static int riscv_pmu_event_init(struct perf_event *event)
         return 0;
 }
 
+static int riscv_pmu_event_idx(struct perf_event *event)
+{
+        struct riscv_pmu *rvpmu = to_riscv_pmu(event->pmu);
+
+        if (!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT))
+                return 0;
+
+        if (rvpmu->csr_index)
+                return rvpmu->csr_index(event) + 1;
+
+        return 0;
+}
+
+static void riscv_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm)
+{
+        struct riscv_pmu *rvpmu = to_riscv_pmu(event->pmu);
+
+        if (rvpmu->event_mapped) {
+                rvpmu->event_mapped(event, mm);
+                perf_event_update_userpage(event);
+        }
+}
+
+static void riscv_pmu_event_unmapped(struct perf_event *event, struct mm_struct *mm)
+{
+        struct riscv_pmu *rvpmu = to_riscv_pmu(event->pmu);
+
+        if (rvpmu->event_unmapped) {
+                rvpmu->event_unmapped(event, mm);
+                perf_event_update_userpage(event);
+        }
+}
+
 struct riscv_pmu *riscv_pmu_alloc(void)
 {
         struct riscv_pmu *pmu;
@@ -307,6 +417,9 @@ struct riscv_pmu *riscv_pmu_alloc(void)
         }
         pmu->pmu = (struct pmu) {
                 .event_init = riscv_pmu_event_init,
+                .event_mapped = riscv_pmu_event_mapped,
+                .event_unmapped = riscv_pmu_event_unmapped,
+                .event_idx = riscv_pmu_event_idx,
                 .add = riscv_pmu_add,
                 .del = riscv_pmu_del,
                 .start = riscv_pmu_start,
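For reference, a sketch of the lock-free sequence user space performs against
the page filled in by arch_perf_update_userpage() and riscv_pmu_event_idx()
above; it follows the pattern documented in include/uapi/linux/perf_event.h.
read_hw_counter() is a hypothetical helper, since selecting the counter CSR
(pc->index - 1 on riscv) requires an instruction with an immediate CSR number.

#include <stdint.h>
#include <linux/perf_event.h>

/* Hypothetical: read the hardware counter the kernel selected for us. */
extern uint64_t read_hw_counter(uint32_t csr_idx);

static uint64_t read_event_count(struct perf_event_mmap_page *pc)
{
        uint64_t count;
        uint32_t seq, idx, width;

        do {
                seq = pc->lock;
                __sync_synchronize();   /* pairs with the kernel's seqcount */

                count = pc->offset;
                idx = pc->index;
                width = pc->pmc_width;

                if (pc->cap_user_rdpmc && idx) {
                        /* idx - 1 selects the counter (csr_index() + 1 above). */
                        uint64_t raw = read_hw_counter(idx - 1);
                        /* Sign-extend counters narrower than 64 bits. */
                        int64_t pmc = (int64_t)(raw << (64 - width)) >> (64 - width);

                        count += pmc;
                }

                __sync_synchronize();
        } while (pc->lock != seq);

        return count;
}

If cap_user_rdpmc is 0 (perf_user_access set to 0, or the event not flagged
for user reads), the fallback is a regular read() on the event fd.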

drivers/perf/riscv_pmu_legacy.c

Lines changed: 27 additions & 1 deletion
@@ -13,7 +13,7 @@
 #include <linux/platform_device.h>
 
 #define RISCV_PMU_LEGACY_CYCLE    0
-#define RISCV_PMU_LEGACY_INSTRET  1
+#define RISCV_PMU_LEGACY_INSTRET  2
 
 static bool pmu_init_done;
 
@@ -71,6 +71,29 @@ static void pmu_legacy_ctr_start(struct perf_event *event, u64 ival)
         local64_set(&hwc->prev_count, initial_val);
 }
 
+static uint8_t pmu_legacy_csr_index(struct perf_event *event)
+{
+        return event->hw.idx;
+}
+
+static void pmu_legacy_event_mapped(struct perf_event *event, struct mm_struct *mm)
+{
+        if (event->attr.config != PERF_COUNT_HW_CPU_CYCLES &&
+            event->attr.config != PERF_COUNT_HW_INSTRUCTIONS)
+                return;
+
+        event->hw.flags |= PERF_EVENT_FLAG_USER_READ_CNT;
+}
+
+static void pmu_legacy_event_unmapped(struct perf_event *event, struct mm_struct *mm)
+{
+        if (event->attr.config != PERF_COUNT_HW_CPU_CYCLES &&
+            event->attr.config != PERF_COUNT_HW_INSTRUCTIONS)
+                return;
+
+        event->hw.flags &= ~PERF_EVENT_FLAG_USER_READ_CNT;
+}
+
 /*
  * This is just a simple implementation to allow legacy implementations
  * compatible with new RISC-V PMU driver framework.
@@ -91,6 +114,9 @@ static void pmu_legacy_init(struct riscv_pmu *pmu)
         pmu->ctr_get_width = NULL;
         pmu->ctr_clear_idx = NULL;
         pmu->ctr_read = pmu_legacy_read_ctr;
+        pmu->event_mapped = pmu_legacy_event_mapped;
+        pmu->event_unmapped = pmu_legacy_event_unmapped;
+        pmu->csr_index = pmu_legacy_csr_index;
 
         perf_pmu_register(&pmu->pmu, "cpu", PERF_TYPE_RAW);
 }