Skip to content

Commit 5aa2323

Browse files
mrutland-arm authored and gregkh committed
arm64: rework EL0 MRS emulation
commit f5962ad upstream.

On CPUs without FEAT_IDST, ID register emulation is slower than it needs to be, as all threads contend for the same lock to perform the emulation. This patch reworks the emulation to avoid this unnecessary contention.

On CPUs with FEAT_IDST (which is mandatory from ARMv8.4 onwards), EL0 accesses to ID registers result in a SYS trap, and emulation of these is handled with a sys64_hook. These hooks are statically allocated, and no locking is required to iterate through the hooks and perform the emulation, allowing emulation to occur in parallel with no contention.

On CPUs without FEAT_IDST, EL0 accesses to ID registers result in an UNDEFINED exception, and emulation of these accesses is handled with an undef_hook. When an EL0 MRS instruction is trapped to EL1, the kernel finds the relevant handler by iterating through all of the undef_hooks, requiring undef_lock to be held during this lookup.

This locking is only required to safely traverse the list of undef_hooks (as it can be concurrently modified), and the actual emulation of the MRS does not require any mutual exclusion. This locking is an unfortunate bottleneck, especially given that MRS emulation is enabled unconditionally and is never disabled.

This patch reworks the non-FEAT_IDST MRS emulation logic so that it can be invoked directly from do_el0_undef(). This removes the bottleneck, allowing MRS traps to be handled entirely in parallel, and is a stepping stone to making all of the undef_hooks lock-free.

I've tested this in a 64-vCPU VM on a 64-CPU ThunderX2 host, with a benchmark which spawns a number of threads which each try to read ID_AA64ISAR0_EL1 1000000 times.
This is vastly more contention than will ever be seen in realistic usage, but clearly demonstrates the removal of the bottleneck:

  | Threads || Time (seconds)                       |
  |         || Before           || After            |
  |         || Real   | System  || Real   | System  |
  |---------++--------+---------++--------+---------|
  | 1       || 0.29   | 0.20    || 0.24   | 0.12    |
  | 2       || 0.35   | 0.51    || 0.23   | 0.27    |
  | 4       || 1.08   | 3.87    || 0.24   | 0.56    |
  | 8       || 4.31   | 33.60   || 0.24   | 1.11    |
  | 16      || 9.47   | 149.39  || 0.23   | 2.15    |
  | 32      || 19.07  | 605.27  || 0.24   | 4.38    |
  | 64      || 65.40  | 3609.09 || 0.33   | 11.27   |

Aside from the speedup, there should be no functional change as a result of this patch.

Signed-off-by: Mark Rutland <[email protected]>
Cc: Catalin Marinas <[email protected]>
Cc: James Morse <[email protected]>
Cc: Joey Gouly <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Will Deacon <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Will Deacon <[email protected]>
Signed-off-by: Jinjie Ruan <[email protected]>
Signed-off-by: Greg Kroah-Hartman <[email protected]>
1 parent 15e9649 commit 5aa2323

File tree

3 files changed

+10
-19
lines changed

3 files changed

+10
-19
lines changed

arch/arm64/include/asm/cpufeature.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -759,7 +759,8 @@ static inline bool system_supports_tlb_range(void)
759759
cpus_have_const_cap(ARM64_HAS_TLB_RANGE);
760760
}
761761

762-
extern int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt);
762+
int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt);
763+
bool try_emulate_mrs(struct pt_regs *regs, u32 isn);
763764

764765
static inline u32 id_aa64mmfr0_parange_to_phys_shift(int parange)
765766
{

arch/arm64/kernel/cpufeature.c

Lines changed: 5 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -2852,35 +2852,22 @@ int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt)
28522852
return rc;
28532853
}
28542854

2855-
static int emulate_mrs(struct pt_regs *regs, u32 insn)
2855+
bool try_emulate_mrs(struct pt_regs *regs, u32 insn)
28562856
{
28572857
u32 sys_reg, rt;
28582858

2859+
if (compat_user_mode(regs) || !aarch64_insn_is_mrs(insn))
2860+
return false;
2861+
28592862
/*
28602863
* sys_reg values are defined as used in mrs/msr instruction.
28612864
* shift the imm value to get the encoding.
28622865
*/
28632866
sys_reg = (u32)aarch64_insn_decode_immediate(AARCH64_INSN_IMM_16, insn) << 5;
28642867
rt = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RT, insn);
2865-
return do_emulate_mrs(regs, sys_reg, rt);
2868+
return do_emulate_mrs(regs, sys_reg, rt) == 0;
28662869
}
28672870

2868-
static struct undef_hook mrs_hook = {
2869-
.instr_mask = 0xfff00000,
2870-
.instr_val = 0xd5300000,
2871-
.pstate_mask = PSR_AA32_MODE_MASK,
2872-
.pstate_val = PSR_MODE_EL0t,
2873-
.fn = emulate_mrs,
2874-
};
2875-
2876-
static int __init enable_mrs_emulation(void)
2877-
{
2878-
register_undef_hook(&mrs_hook);
2879-
return 0;
2880-
}
2881-
2882-
core_initcall(enable_mrs_emulation);
2883-
28842871
ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr,
28852872
char *buf)
28862873
{

arch/arm64/kernel/traps.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -408,6 +408,9 @@ void do_el0_undef(struct pt_regs *regs, unsigned long esr)
408408
if (user_insn_read(regs, &insn))
409409
goto out_err;
410410

411+
if (try_emulate_mrs(regs, insn))
412+
return;
413+
411414
if (call_undef_hook(regs, insn) == 0)
412415
return;
413416

0 commit comments

Comments
 (0)