
Commit 503638e

Alexandre Ghiti authored and palmer-dabbelt committed
riscv: Stop emitting preventive sfence.vma for new vmalloc mappings
In 6.5, we removed the vmalloc fault path because that can't work (see [1] [2]). Then, in order to make sure that new page table entries were seen by the page table walker, we had to preventively emit a sfence.vma on all harts [3], but this solution is very costly since it relies on IPI. And even then, we could end up in a loop of vmalloc faults if a vmalloc allocation is done in the IPI path (for example if it is traced, see [4]), which could result in a kernel stack overflow.

Those preventive sfence.vma needed to be emitted because:

- if the uarch caches invalid entries, the new mapping may not be observed by the page table walker and an invalidation may be needed.
- if the uarch does not cache invalid entries, a reordered access could "miss" the new mapping and trap: in that case, we would actually only need to retry the access; no sfence.vma is required.

So this patch removes those preventive sfence.vma and actually handles the possible (and unlikely) exceptions. And since the kernel stack mappings lie in the vmalloc area, this handling must be done very early when the trap is taken, at the very beginning of handle_exception: this also rules out vmalloc allocations in the fault path.

Link: https://lore.kernel.org/linux-riscv/[email protected]/ [1]
Link: https://lore.kernel.org/linux-riscv/[email protected] [2]
Link: https://lore.kernel.org/linux-riscv/[email protected]/ [3]
Link: https://lore.kernel.org/lkml/[email protected]/ [4]
Signed-off-by: Alexandre Ghiti <[email protected]>
Reviewed-by: Yunhui Cui <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Palmer Dabbelt <[email protected]>
1 parent d25599b commit 503638e
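
At a glance, the patch trades the IPI-broadcast sfence.vma for a per-hart "new kernel mapping" bitmap: writers mark every hart, and each hart clears its own bit the first time it traps afterwards. Below is a rough C-level sketch of that protocol; only new_vmalloc comes from the patch, the helper names are illustrative, and the real consumer is the assembly added to handle_exception in entry.S.

/* Sketch only: one bit per hart, packed 64 bits per u64 word. */
extern u64 new_vmalloc[NR_CPUS / sizeof(u64) + 1];

/* Publisher side: conceptually what the new flush_cache_vmap() does. */
static inline void publish_new_vmalloc_mapping(void)
{
        int i;

        for (i = 0; i < ARRAY_SIZE(new_vmalloc); ++i)
                new_vmalloc[i] = -1ULL;         /* every hart must re-check */
}

/* Consumer side: the question handle_exception() asks very early in the trap. */
static inline bool this_hart_should_fence(unsigned int cpu)
{
        return READ_ONCE(new_vmalloc[cpu / 64]) & (1ULL << (cpu % 64));
}

If the bit is set, the hart clears it atomically, emits a local sfence.vma when the uarch needs it, and simply retries the faulting access; otherwise the trap proceeds down the normal exception path.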

File tree

5 files changed: +120 −1 lines changed


arch/riscv/include/asm/cacheflush.h

Lines changed: 17 additions & 1 deletion

@@ -46,7 +46,23 @@ do { \
 } while (0)

 #ifdef CONFIG_64BIT
-#define flush_cache_vmap(start, end)        flush_tlb_kernel_range(start, end)
+extern u64 new_vmalloc[NR_CPUS / sizeof(u64) + 1];
+extern char _end[];
+#define flush_cache_vmap flush_cache_vmap
+static inline void flush_cache_vmap(unsigned long start, unsigned long end)
+{
+        if (is_vmalloc_or_module_addr((void *)start)) {
+                int i;
+
+                /*
+                 * We don't care if concurrently a cpu resets this value since
+                 * the only place this can happen is in handle_exception() where
+                 * an sfence.vma is emitted.
+                 */
+                for (i = 0; i < ARRAY_SIZE(new_vmalloc); ++i)
+                        new_vmalloc[i] = -1ULL;
+        }
+}
 #define flush_cache_vmap_early(start, end)  local_flush_tlb_kernel_range(start, end)
 #endif

arch/riscv/include/asm/thread_info.h

Lines changed: 7 additions & 0 deletions

@@ -60,6 +60,13 @@ struct thread_info {
         void            *scs_base;
         void            *scs_sp;
 #endif
+#ifdef CONFIG_64BIT
+        /*
+         * Used in handle_exception() to save a0, a1 and a2 before knowing if we
+         * can access the kernel stack.
+         */
+        unsigned long   a0, a1, a2;
+#endif
 };

 #ifdef CONFIG_SHADOW_CALL_STACK

arch/riscv/kernel/asm-offsets.c

Lines changed: 7 additions & 0 deletions

@@ -36,13 +36,20 @@ void asm_offsets(void)
         OFFSET(TASK_THREAD_S9, task_struct, thread.s[9]);
         OFFSET(TASK_THREAD_S10, task_struct, thread.s[10]);
         OFFSET(TASK_THREAD_S11, task_struct, thread.s[11]);
+
+        OFFSET(TASK_TI_CPU, task_struct, thread_info.cpu);
         OFFSET(TASK_TI_FLAGS, task_struct, thread_info.flags);
         OFFSET(TASK_TI_PREEMPT_COUNT, task_struct, thread_info.preempt_count);
         OFFSET(TASK_TI_KERNEL_SP, task_struct, thread_info.kernel_sp);
         OFFSET(TASK_TI_USER_SP, task_struct, thread_info.user_sp);
 #ifdef CONFIG_SHADOW_CALL_STACK
         OFFSET(TASK_TI_SCS_SP, task_struct, thread_info.scs_sp);
 #endif
+#ifdef CONFIG_64BIT
+        OFFSET(TASK_TI_A0, task_struct, thread_info.a0);
+        OFFSET(TASK_TI_A1, task_struct, thread_info.a1);
+        OFFSET(TASK_TI_A2, task_struct, thread_info.a2);
+#endif

         OFFSET(TASK_TI_CPU_NUM, task_struct, thread_info.cpu);
         OFFSET(TASK_THREAD_F0, task_struct, thread.fstate.f[0]);
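
For context (not part of the patch): asm-offsets.c is compiled only so that each OFFSET() entry becomes a plain numeric constant in the generated asm-offsets header, which is what lets entry.S address the new thread_info fields as TASK_TI_A0(tp) and friends. A sketch of what the generated constants conceptually look like, with placeholder values:

/* Sketch of the generated header; the real values come from offsetof() at build time. */
#define TASK_TI_CPU  32   /* offsetof(struct task_struct, thread_info.cpu) */
#define TASK_TI_A0   40   /* offsetof(struct task_struct, thread_info.a0)  */
#define TASK_TI_A1   48   /* offsetof(struct task_struct, thread_info.a1)  */
#define TASK_TI_A2   56   /* offsetof(struct task_struct, thread_info.a2)  */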

arch/riscv/kernel/entry.S

Lines changed: 87 additions & 0 deletions

@@ -19,6 +19,79 @@

         .section .irqentry.text, "ax"

+.macro new_vmalloc_check
+        REG_S   a0, TASK_TI_A0(tp)
+        csrr    a0, CSR_CAUSE
+        /* Exclude IRQs */
+        blt     a0, zero, _new_vmalloc_restore_context_a0
+
+        REG_S   a1, TASK_TI_A1(tp)
+        /* Only check new_vmalloc if we are in page/protection fault */
+        li      a1, EXC_LOAD_PAGE_FAULT
+        beq     a0, a1, _new_vmalloc_kernel_address
+        li      a1, EXC_STORE_PAGE_FAULT
+        beq     a0, a1, _new_vmalloc_kernel_address
+        li      a1, EXC_INST_PAGE_FAULT
+        bne     a0, a1, _new_vmalloc_restore_context_a1
+
+_new_vmalloc_kernel_address:
+        /* Is it a kernel address? */
+        csrr    a0, CSR_TVAL
+        bge     a0, zero, _new_vmalloc_restore_context_a1
+
+        /* Check if a new vmalloc mapping appeared that could explain the trap */
+        REG_S   a2, TASK_TI_A2(tp)
+        /*
+         * Computes:
+         * a0 = &new_vmalloc[BIT_WORD(cpu)]
+         * a1 = BIT_MASK(cpu)
+         */
+        REG_L   a2, TASK_TI_CPU(tp)
+        /*
+         * Compute the new_vmalloc element position:
+         * (cpu / 64) * 8 = (cpu >> 6) << 3
+         */
+        srli    a1, a2, 6
+        slli    a1, a1, 3
+        la      a0, new_vmalloc
+        add     a0, a0, a1
+        /*
+         * Compute the bit position in the new_vmalloc element:
+         * bit_pos = cpu % 64 = cpu - (cpu / 64) * 64 = cpu - (cpu >> 6) << 6
+         *         = cpu - ((cpu >> 6) << 3) << 3
+         */
+        slli    a1, a1, 3
+        sub     a1, a2, a1
+        /* Compute the "get mask": 1 << bit_pos */
+        li      a2, 1
+        sll     a1, a2, a1
+
+        /* Check the value of new_vmalloc for this cpu */
+        REG_L   a2, 0(a0)
+        and     a2, a2, a1
+        beq     a2, zero, _new_vmalloc_restore_context
+
+        /* Atomically reset the current cpu bit in new_vmalloc */
+        amoxor.d        a0, a1, (a0)
+
+        /* Only emit a sfence.vma if the uarch caches invalid entries */
+        ALTERNATIVE("sfence.vma", "nop", 0, RISCV_ISA_EXT_SVVPTC, 1)
+
+        REG_L   a0, TASK_TI_A0(tp)
+        REG_L   a1, TASK_TI_A1(tp)
+        REG_L   a2, TASK_TI_A2(tp)
+        csrw    CSR_SCRATCH, x0
+        sret
+
+_new_vmalloc_restore_context:
+        REG_L   a2, TASK_TI_A2(tp)
+_new_vmalloc_restore_context_a1:
+        REG_L   a1, TASK_TI_A1(tp)
+_new_vmalloc_restore_context_a0:
+        REG_L   a0, TASK_TI_A0(tp)
+.endm
+
+
 SYM_CODE_START(handle_exception)
         /*
          * If coming from userspace, preserve the user thread pointer and load
@@ -30,6 +103,20 @@ SYM_CODE_START(handle_exception)

 .Lrestore_kernel_tpsp:
         csrr tp, CSR_SCRATCH
+
+#ifdef CONFIG_64BIT
+        /*
+         * The RISC-V kernel does not eagerly emit a sfence.vma after each
+         * new vmalloc mapping, which may result in exceptions:
+         * - if the uarch caches invalid entries, the new mapping would not be
+         *   observed by the page table walker and an invalidation is needed.
+         * - if the uarch does not cache invalid entries, a reordered access
+         *   could "miss" the new mapping and traps: in that case, we only need
+         *   to retry the access, no sfence.vma is required.
+         */
+        new_vmalloc_check
+#endif
+
         REG_S sp, TASK_TI_KERNEL_SP(tp)

 #ifdef CONFIG_VMAP_STACK
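
To make the register juggling above easier to follow, here is an illustrative C rendering of what new_vmalloc_check computes; the function shape and the use of a compiler builtin are assumptions made for readability (the real check must run before the kernel stack is known to be mapped, which is why it is hand-written assembly using the three thread_info scratch slots).

/* Illustrative only: C equivalent of the new_vmalloc_check macro above. */
static bool new_vmalloc_check_c(unsigned long cause, unsigned long tval, unsigned int cpu)
{
        u64 *word = &new_vmalloc[cpu / 64];     /* a0: srli/slli/la/add, i.e. BIT_WORD(cpu) */
        u64 mask = 1ULL << (cpu % 64);          /* a1: slli/sub/li/sll,  i.e. BIT_MASK(cpu) */

        /* Exclude IRQs (the interrupt bit is the MSB of scause). */
        if ((long)cause < 0)
                return false;

        /* Only page/protection faults are of interest. */
        if (cause != EXC_LOAD_PAGE_FAULT && cause != EXC_STORE_PAGE_FAULT &&
            cause != EXC_INST_PAGE_FAULT)
                return false;

        /* Only kernel addresses (sign bit set on RV64) can be new vmalloc mappings. */
        if ((long)tval >= 0)
                return false;

        /* Was a new mapping published since this hart last fenced? */
        if (!(*word & mask))
                return false;

        /* amoxor.d: atomically clear this hart's bit (it is known to be set). */
        __atomic_fetch_xor(word, mask, __ATOMIC_RELAXED);

        /*
         * sfence.vma, patched to a nop when Svvptc guarantees invalid entries
         * are not cached; then the trap returns and the access is retried.
         */
        return true;
}

In the assembly, returning true corresponds to restoring a0-a2 from thread_info and executing sret, while returning false corresponds to the _new_vmalloc_restore_context* labels falling through to the normal exception path.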

arch/riscv/mm/init.c

Lines changed: 2 additions & 0 deletions

@@ -36,6 +36,8 @@

 #include "../kernel/head.h"

+u64 new_vmalloc[NR_CPUS / sizeof(u64) + 1];
+
 struct kernel_mapping kernel_map __ro_after_init;
 EXPORT_SYMBOL(kernel_map);
 #ifdef CONFIG_XIP_KERNEL
