
Commit 7e340f4

Merge patch series "Svvptc extension to remove preventive sfence.vma"
Alexandre Ghiti <[email protected]> says:

In RISC-V, after a new mapping is established, a sfence.vma needs to be
emitted for different reasons:

- if the uarch caches invalid entries, we need to invalidate them,
  otherwise we would trap on an invalid entry,
- if the uarch does not cache invalid entries, a reordered access could
  fail to see the new mapping and then trap (sfence.vma acts as a fence).

We can actually avoid emitting those (mostly) useless and costly
sfence.vma by handling the traps instead:

- for new kernel mappings: only vmalloc mappings need to be taken care
  of; other new mappings are rare and already emit the required
  sfence.vma if needed. That must be achieved very early in the
  exception path, as explained in patch 3, and this also fixes our
  fragile way of dealing with vmalloc faults.
- for new user mappings: Svvptc makes update_mmu_cache() a no-op, but
  we can take some gratuitous page faults (which are very unlikely
  though).

Patches 1 and 2 introduce Svvptc extension probing.

On our uarch that does not cache invalid entries, with a 6.5 kernel,
the gains are measurable:

* Kernel boot:             6%
* ltp - mmapstress01:      8%
* lmbench - lat_pagefault: 20%
* lmbench - lat_mmap:      5%

Here are the corresponding numbers of sfence.vma emitted:

* Ubuntu boot to login:
  Before: ~630k sfence.vma
  After:  ~200k sfence.vma
* ltp - mmapstress01:
  Before: ~45k
  After:  ~6.3k
* lmbench - lat_pagefault:
  Before: ~665k
  After:  832 (!)
* lmbench - lat_mmap:
  Before: ~546k
  After:  718 (!)

Thanks to Ved and Matt Evans for triggering the discussion that led to
this patchset!

* b4-shazam-merge:
  riscv: Stop emitting preventive sfence.vma for new userspace mappings with Svvptc
  riscv: Stop emitting preventive sfence.vma for new vmalloc mappings
  dt-bindings: riscv: Add Svvptc ISA extension description
  riscv: Add ISA extension parsing for Svvptc

Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Palmer Dabbelt <[email protected]>
2 parents 1845d38 + 7a21b2e commit 7e340f4
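The mechanism the series relies on can be summarised with a small sketch. The user-space C below is an analogue written for illustration only (the helper names, the single 64-bit word and the plain C11 atomics are assumptions, not kernel code): the writer side marks every CPU when a new vmalloc mapping is created, and a faulting CPU later tests and clears its own bit before retrying the access, which is what flush_cache_vmap() and the new_vmalloc_check macro in the diffs below implement.

	/*
	 * Illustrative user-space analogue (hypothetical names, not kernel
	 * code) of the per-CPU "new vmalloc mapping" tracking used here.
	 */
	#include <stdatomic.h>
	#include <stdint.h>

	#define MAX_CPUS 64				/* one 64-bit word for simplicity */

	static _Atomic uint64_t new_vmalloc_bits;	/* one bit per CPU */

	/* Writer side: a new vmalloc mapping was created, mark every CPU. */
	static void mark_new_vmalloc_mapping(void)
	{
		atomic_store(&new_vmalloc_bits, ~0ULL);
	}

	/*
	 * Trap side: called by a CPU that faulted on a kernel virtual address.
	 * Returns 1 if a new vmalloc mapping may explain the fault, in which
	 * case the caller only has to retry the access (plus an sfence.vma on
	 * a uarch that caches invalid entries).
	 */
	static int cpu_saw_new_vmalloc(unsigned int cpu)
	{
		uint64_t mask = 1ULL << (cpu % MAX_CPUS);

		if (!(atomic_load(&new_vmalloc_bits) & mask))
			return 0;

		/* Clear only our own bit, like the amoxor.d in new_vmalloc_check. */
		atomic_fetch_xor(&new_vmalloc_bits, mask);
		return 1;
	}

The test-then-clear order mirrors the kernel code: the atomic clear only runs once the bit is known to be set, so a concurrent "mark every CPU" from the writer side can at worst cause one extra, harmless retry.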

File tree

10 files changed: +152 -1 lines changed


Documentation/devicetree/bindings/riscv/extensions.yaml

Lines changed: 7 additions & 0 deletions
@@ -171,6 +171,13 @@ properties:
             memory types as ratified in the 20191213 version of the privileged
             ISA specification.
 
+        - const: svvptc
+          description:
+            The standard Svvptc supervisor-level extension for
+            address-translation cache behaviour with respect to invalid entries
+            as ratified at commit 4a69197e5617 ("Update to ratified state") of
+            riscv-svvptc.
+
         - const: zacas
           description: |
             The Zacas extension for Atomic Compare-and-Swap (CAS) instructions

arch/riscv/include/asm/cacheflush.h

Lines changed: 17 additions & 1 deletion
@@ -46,7 +46,23 @@ do { \
 } while (0)
 
 #ifdef CONFIG_64BIT
-#define flush_cache_vmap(start, end)		flush_tlb_kernel_range(start, end)
+extern u64 new_vmalloc[NR_CPUS / sizeof(u64) + 1];
+extern char _end[];
+#define flush_cache_vmap flush_cache_vmap
+static inline void flush_cache_vmap(unsigned long start, unsigned long end)
+{
+	if (is_vmalloc_or_module_addr((void *)start)) {
+		int i;
+
+		/*
+		 * We don't care if concurrently a cpu resets this value since
+		 * the only place this can happen is in handle_exception() where
+		 * an sfence.vma is emitted.
+		 */
+		for (i = 0; i < ARRAY_SIZE(new_vmalloc); ++i)
+			new_vmalloc[i] = -1ULL;
+	}
+}
 #define flush_cache_vmap_early(start, end)	local_flush_tlb_kernel_range(start, end)
 #endif
 

arch/riscv/include/asm/hwcap.h

Lines changed: 1 addition & 0 deletions
@@ -92,6 +92,7 @@
 #define RISCV_ISA_EXT_ZCF		83
 #define RISCV_ISA_EXT_ZCMOP		84
 #define RISCV_ISA_EXT_ZAWRS		85
+#define RISCV_ISA_EXT_SVVPTC		86
 
 #define RISCV_ISA_EXT_XLINUXENVCFG	127

arch/riscv/include/asm/pgtable.h

Lines changed: 10 additions & 0 deletions
@@ -497,6 +497,9 @@ static inline void update_mmu_cache_range(struct vm_fault *vmf,
 		struct vm_area_struct *vma, unsigned long address,
 		pte_t *ptep, unsigned int nr)
 {
+	asm goto(ALTERNATIVE("nop", "j %l[svvptc]", 0, RISCV_ISA_EXT_SVVPTC, 1)
+		 : : : : svvptc);
+
 	/*
 	 * The kernel assumes that TLBs don't cache invalid entries, but
 	 * in RISC-V, SFENCE.VMA specifies an ordering constraint, not a
@@ -506,6 +509,13 @@ static inline void update_mmu_cache_range(struct vm_fault *vmf,
 	 */
 	while (nr--)
 		local_flush_tlb_page(address + nr * PAGE_SIZE);
+
+svvptc:;
+	/*
+	 * Svvptc guarantees that the new valid pte will be visible within
+	 * a bounded timeframe, so when the uarch does not cache invalid
+	 * entries, we don't have to do anything.
+	 */
 }
 #define update_mmu_cache(vma, addr, ptep) \
 	update_mmu_cache_range(NULL, vma, addr, ptep, 1)
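The asm goto + ALTERNATIVE construct above is resolved at boot: when Svvptc is present the nop is patched into a jump to the svvptc label, so the local_flush_tlb_page() loop is never executed. A rough user-space analogue of that control flow, using a runtime flag instead of boot-time code patching (the flag and helpers below are assumptions for illustration, not kernel interfaces), might look like this:

	#include <stdbool.h>
	#include <stdio.h>

	#define PAGE_SIZE 4096UL

	/* Stand-in for the boot-time decision that ALTERNATIVE() patches in. */
	static bool has_svvptc;

	/* Placeholder for the kernel's local_flush_tlb_page(). */
	static void local_flush_tlb_page_sketch(unsigned long address)
	{
		printf("sfence.vma %#lx\n", address);
	}

	static void update_mmu_cache_range_sketch(unsigned long address, unsigned int nr)
	{
		if (has_svvptc)
			return;	/* Svvptc: the new valid PTE becomes visible in bounded time. */

		/* Otherwise assume the uarch may cache invalid entries: flush each page. */
		while (nr--)
			local_flush_tlb_page_sketch(address + nr * PAGE_SIZE);
	}

The point of using asm goto with ALTERNATIVE rather than a runtime flag is that the check costs a single patched instruction on this hot path instead of a load and branch on every minor fault.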

arch/riscv/include/asm/thread_info.h

Lines changed: 7 additions & 0 deletions
@@ -61,6 +61,13 @@ struct thread_info {
 	void *scs_base;
 	void *scs_sp;
 #endif
+#ifdef CONFIG_64BIT
+	/*
+	 * Used in handle_exception() to save a0, a1 and a2 before knowing if we
+	 * can access the kernel stack.
+	 */
+	unsigned long a0, a1, a2;
+#endif
 };
 
 #ifdef CONFIG_SHADOW_CALL_STACK

arch/riscv/kernel/asm-offsets.c

Lines changed: 7 additions & 0 deletions
@@ -36,13 +36,20 @@ void asm_offsets(void)
 	OFFSET(TASK_THREAD_S9, task_struct, thread.s[9]);
 	OFFSET(TASK_THREAD_S10, task_struct, thread.s[10]);
 	OFFSET(TASK_THREAD_S11, task_struct, thread.s[11]);
+
+	OFFSET(TASK_TI_CPU, task_struct, thread_info.cpu);
 	OFFSET(TASK_TI_FLAGS, task_struct, thread_info.flags);
 	OFFSET(TASK_TI_PREEMPT_COUNT, task_struct, thread_info.preempt_count);
 	OFFSET(TASK_TI_KERNEL_SP, task_struct, thread_info.kernel_sp);
 	OFFSET(TASK_TI_USER_SP, task_struct, thread_info.user_sp);
 #ifdef CONFIG_SHADOW_CALL_STACK
 	OFFSET(TASK_TI_SCS_SP, task_struct, thread_info.scs_sp);
 #endif
+#ifdef CONFIG_64BIT
+	OFFSET(TASK_TI_A0, task_struct, thread_info.a0);
+	OFFSET(TASK_TI_A1, task_struct, thread_info.a1);
+	OFFSET(TASK_TI_A2, task_struct, thread_info.a2);
+#endif
 
 	OFFSET(TASK_TI_CPU_NUM, task_struct, thread_info.cpu);
 	OFFSET(TASK_THREAD_F0, task_struct, thread.fstate.f[0]);

arch/riscv/kernel/cpufeature.c

Lines changed: 1 addition & 0 deletions
@@ -381,6 +381,7 @@ const struct riscv_isa_ext_data riscv_isa_ext[] = {
 	__RISCV_ISA_EXT_DATA(svinval, RISCV_ISA_EXT_SVINVAL),
 	__RISCV_ISA_EXT_DATA(svnapot, RISCV_ISA_EXT_SVNAPOT),
 	__RISCV_ISA_EXT_DATA(svpbmt, RISCV_ISA_EXT_SVPBMT),
+	__RISCV_ISA_EXT_DATA(svvptc, RISCV_ISA_EXT_SVVPTC),
 };
 
 const size_t riscv_isa_ext_count = ARRAY_SIZE(riscv_isa_ext);

arch/riscv/kernel/entry.S

Lines changed: 87 additions & 0 deletions
@@ -19,6 +19,79 @@
 
 .section .irqentry.text, "ax"
 
+.macro new_vmalloc_check
+	REG_S	a0, TASK_TI_A0(tp)
+	csrr	a0, CSR_CAUSE
+	/* Exclude IRQs */
+	blt	a0, zero, _new_vmalloc_restore_context_a0
+
+	REG_S	a1, TASK_TI_A1(tp)
+	/* Only check new_vmalloc if we are in page/protection fault */
+	li	a1, EXC_LOAD_PAGE_FAULT
+	beq	a0, a1, _new_vmalloc_kernel_address
+	li	a1, EXC_STORE_PAGE_FAULT
+	beq	a0, a1, _new_vmalloc_kernel_address
+	li	a1, EXC_INST_PAGE_FAULT
+	bne	a0, a1, _new_vmalloc_restore_context_a1
+
+_new_vmalloc_kernel_address:
+	/* Is it a kernel address? */
+	csrr	a0, CSR_TVAL
+	bge	a0, zero, _new_vmalloc_restore_context_a1
+
+	/* Check if a new vmalloc mapping appeared that could explain the trap */
+	REG_S	a2, TASK_TI_A2(tp)
+	/*
+	 * Computes:
+	 * a0 = &new_vmalloc[BIT_WORD(cpu)]
+	 * a1 = BIT_MASK(cpu)
+	 */
+	REG_L	a2, TASK_TI_CPU(tp)
+	/*
+	 * Compute the new_vmalloc element position:
+	 * (cpu / 64) * 8 = (cpu >> 6) << 3
+	 */
+	srli	a1, a2, 6
+	slli	a1, a1, 3
+	la	a0, new_vmalloc
+	add	a0, a0, a1
+	/*
+	 * Compute the bit position in the new_vmalloc element:
+	 * bit_pos = cpu % 64 = cpu - (cpu / 64) * 64 = cpu - (cpu >> 6) << 6
+	 *	   = cpu - ((cpu >> 6) << 3) << 3
+	 */
+	slli	a1, a1, 3
+	sub	a1, a2, a1
+	/* Compute the "get mask": 1 << bit_pos */
+	li	a2, 1
+	sll	a1, a2, a1
+
+	/* Check the value of new_vmalloc for this cpu */
+	REG_L	a2, 0(a0)
+	and	a2, a2, a1
+	beq	a2, zero, _new_vmalloc_restore_context
+
+	/* Atomically reset the current cpu bit in new_vmalloc */
+	amoxor.d	a0, a1, (a0)
+
+	/* Only emit a sfence.vma if the uarch caches invalid entries */
+	ALTERNATIVE("sfence.vma", "nop", 0, RISCV_ISA_EXT_SVVPTC, 1)
+
+	REG_L	a0, TASK_TI_A0(tp)
+	REG_L	a1, TASK_TI_A1(tp)
+	REG_L	a2, TASK_TI_A2(tp)
+	csrw	CSR_SCRATCH, x0
+	sret
+
+_new_vmalloc_restore_context:
+	REG_L	a2, TASK_TI_A2(tp)
+_new_vmalloc_restore_context_a1:
+	REG_L	a1, TASK_TI_A1(tp)
+_new_vmalloc_restore_context_a0:
+	REG_L	a0, TASK_TI_A0(tp)
+.endm
+
+
 SYM_CODE_START(handle_exception)
 	/*
 	 * If coming from userspace, preserve the user thread pointer and load
@@ -30,6 +103,20 @@ SYM_CODE_START(handle_exception)
 
 .Lrestore_kernel_tpsp:
 	csrr tp, CSR_SCRATCH
+
+#ifdef CONFIG_64BIT
+	/*
+	 * The RISC-V kernel does not eagerly emit a sfence.vma after each
+	 * new vmalloc mapping, which may result in exceptions:
+	 * - if the uarch caches invalid entries, the new mapping would not be
+	 *   observed by the page table walker and an invalidation is needed.
+	 * - if the uarch does not cache invalid entries, a reordered access
+	 *   could "miss" the new mapping and traps: in that case, we only need
+	 *   to retry the access, no sfence.vma is required.
+	 */
+	new_vmalloc_check
+#endif
+
 	REG_S sp, TASK_TI_KERNEL_SP(tp)
 
 #ifdef CONFIG_VMAP_STACK
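The index arithmetic in new_vmalloc_check is easier to check in C. The sketch below (hypothetical helper, illustration only) computes the same word byte offset and bit mask as the srli/slli/sub/sll sequence above; for cpu = 70 it yields byte offset 8 (the second u64 of new_vmalloc) and mask 1 << 6.

	#include <stdint.h>
	#include <stdio.h>

	/*
	 * C equivalent of the index/mask computation in new_vmalloc_check
	 * (hypothetical helper, for illustration only):
	 *   word byte offset = (cpu / 64) * 8 = (cpu >> 6) << 3
	 *   bit position     = cpu % 64       = cpu - ((cpu >> 6) << 6)
	 */
	static void new_vmalloc_index(unsigned long cpu,
				      unsigned long *word_byte_offset,
				      uint64_t *bit_mask)
	{
		unsigned long byte_off = (cpu >> 6) << 3;	/* srli a1,a2,6 ; slli a1,a1,3 */
		unsigned long bit_pos  = cpu - (byte_off << 3);	/* slli a1,a1,3 ; sub a1,a2,a1 */

		*word_byte_offset = byte_off;
		*bit_mask = 1ULL << bit_pos;			/* li a2,1 ; sll a1,a2,a1 */
	}

	int main(void)
	{
		unsigned long off;
		uint64_t mask;

		new_vmalloc_index(70, &off, &mask);	/* expect off = 8, mask = 1 << 6 */
		printf("byte offset %lu, mask %#llx\n", off, (unsigned long long)mask);
		return 0;
	}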

arch/riscv/mm/init.c

Lines changed: 2 additions & 0 deletions
@@ -37,6 +37,8 @@
 
 #include "../kernel/head.h"
 
+u64 new_vmalloc[NR_CPUS / sizeof(u64) + 1];
+
 struct kernel_mapping kernel_map __ro_after_init;
 EXPORT_SYMBOL(kernel_map);
 #ifdef CONFIG_XIP_KERNEL

arch/riscv/mm/pgtable.c

Lines changed: 13 additions & 0 deletions
@@ -9,13 +9,26 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
 			  unsigned long address, pte_t *ptep,
 			  pte_t entry, int dirty)
 {
+	asm goto(ALTERNATIVE("nop", "j %l[svvptc]", 0, RISCV_ISA_EXT_SVVPTC, 1)
+		 : : : : svvptc);
+
 	if (!pte_same(ptep_get(ptep), entry))
 		__set_pte_at(vma->vm_mm, ptep, entry);
 	/*
 	 * update_mmu_cache will unconditionally execute, handling both
 	 * the case that the PTE changed and the spurious fault case.
 	 */
 	return true;
+
+svvptc:
+	if (!pte_same(ptep_get(ptep), entry)) {
+		__set_pte_at(vma->vm_mm, ptep, entry);
+		/* Here only not svadu is impacted */
+		flush_tlb_page(vma, address);
+		return true;
+	}
+
+	return false;
 }
 
 int ptep_test_and_clear_young(struct vm_area_struct *vma,
