Commit c4885bb

liupingfan authored and ctmarinas committed
arm64/mm: save memory access in check_and_switch_context() fast switch path
On arm64, smp_processor_id() reads a per-cpu `cpu_number` variable,
using the per-cpu offset stored in the tpidr_el1 system register. In
some cases we generate a per-cpu address with a sequence like:

	cpu_ptr = &per_cpu(ptr, smp_processor_id());

which potentially incurs a cache miss for both `cpu_number` and the
in-memory `__per_cpu_offset` array. This can be written more optimally
as:

	cpu_ptr = this_cpu_ptr(ptr);

which only needs the offset from tpidr_el1, and does not need to load
from memory.

The following two test cases show a small performance improvement
measured on a 46-CPU Qualcomm machine with a 5.8.0-rc4 kernel.

Test 1: (about 0.3% improvement)
	#cat b.sh
	make clean && make all -j138
	#perf stat --repeat 10 --null --sync sh b.sh

	- before this patch
	Performance counter stats for 'sh b.sh' (10 runs):
	298.62 +- 1.86 seconds time elapsed  ( +- 0.62% )

	- after this patch
	Performance counter stats for 'sh b.sh' (10 runs):
	297.734 +- 0.954 seconds time elapsed  ( +- 0.32% )

Test 2: (about 1.69% improvement)
	'perf stat -r 10 perf bench sched messaging'
	Then sum the total time of 'sched/messaging' manually.

	- before this patch
	total 0.707 sec for 10 runs

	- after this patch
	total 0.695 sec for 10 runs

Signed-off-by: Pingfan Liu <[email protected]>
Acked-by: Mark Rutland <[email protected]>
Cc: Will Deacon <[email protected]>
Cc: Steve Capper <[email protected]>
Cc: Mark Rutland <[email protected]>
Cc: Vladimir Murzin <[email protected]>
Cc: Jean-Philippe Brucker <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Catalin Marinas <[email protected]>
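[Editor's note] As a rough illustration of why this_cpu_ptr() is cheaper, the standalone C sketch below models the two lookup paths described in the commit message: the per_cpu() path loads the CPU number and then an in-memory offset table, while the this_cpu_ptr() path adds an offset already held in a register (tpidr_el1 on arm64). The names percpu_area, percpu_offset, and fake_tpidr_el1 are illustrative stand-ins, not the kernel's real symbols.

#include <stdint.h>
#include <stdio.h>

#define NR_CPUS 4

/* One 64-bit slot per CPU, standing in for a per-cpu variable
 * such as active_asids. */
static uint64_t percpu_area[NR_CPUS];

/* In-memory offset table: the per_cpu() path first loads the CPU
 * number (smp_processor_id()) and then loads this array entry, so
 * it can miss the cache twice. */
static const uintptr_t percpu_offset[NR_CPUS] = {
	0 * sizeof(uint64_t),
	1 * sizeof(uint64_t),
	2 * sizeof(uint64_t),
	3 * sizeof(uint64_t),
};

/* Stand-in for tpidr_el1: the current CPU's offset kept in a
 * system register, so reading it touches no memory. */
static uintptr_t fake_tpidr_el1 = 1 * sizeof(uint64_t); /* "CPU 1" */

/* Model of &per_cpu(var, cpu): base plus an offset loaded from memory. */
static uint64_t *per_cpu_path(unsigned int cpu)
{
	return (uint64_t *)((char *)percpu_area + percpu_offset[cpu]);
}

/* Model of this_cpu_ptr(&var): base plus the register-held offset. */
static uint64_t *this_cpu_path(void)
{
	return (uint64_t *)((char *)percpu_area + fake_tpidr_el1);
}

int main(void)
{
	*this_cpu_path() = 42;
	/* Both paths resolve to the same per-cpu slot; the second
	 * skips the cpu_number and offset-table loads. */
	printf("per_cpu path:  %llu\n", (unsigned long long)*per_cpu_path(1));
	printf("this_cpu path: %llu\n", (unsigned long long)*this_cpu_path());
	return 0;
}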
1 parent ea0eada commit c4885bb

File tree

2 files changed: 8 additions & 8 deletions

arch/arm64/include/asm/mmu_context.h

Lines changed: 2 additions & 4 deletions
@@ -175,7 +175,7 @@ static inline void cpu_replace_ttbr1(pgd_t *pgdp)
  * take CPU migration into account.
  */
 #define destroy_context(mm)		do { } while(0)
-void check_and_switch_context(struct mm_struct *mm, unsigned int cpu);
+void check_and_switch_context(struct mm_struct *mm);
 
 #define init_new_context(tsk,mm)	({ atomic64_set(&(mm)->context.id, 0); 0; })
 
@@ -214,8 +214,6 @@ enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
 
 static inline void __switch_mm(struct mm_struct *next)
 {
-	unsigned int cpu = smp_processor_id();
-
 	/*
 	 * init_mm.pgd does not contain any user mappings and it is always
 	 * active for kernel addresses in TTBR1. Just set the reserved TTBR0.
@@ -225,7 +223,7 @@ static inline void __switch_mm(struct mm_struct *next)
 		return;
 	}
 
-	check_and_switch_context(next, cpu);
+	check_and_switch_context(next);
 }
 
 static inline void

arch/arm64/mm/context.c

Lines changed: 6 additions & 4 deletions
@@ -198,9 +198,10 @@ static u64 new_context(struct mm_struct *mm)
 	return idx2asid(asid) | generation;
 }
 
-void check_and_switch_context(struct mm_struct *mm, unsigned int cpu)
+void check_and_switch_context(struct mm_struct *mm)
 {
 	unsigned long flags;
+	unsigned int cpu;
 	u64 asid, old_active_asid;
 
 	if (system_supports_cnp())
@@ -222,9 +223,9 @@ void check_and_switch_context(struct mm_struct *mm, unsigned int cpu)
 	 * relaxed xchg in flush_context will treat us as reserved
 	 * because atomic RmWs are totally ordered for a given location.
 	 */
-	old_active_asid = atomic64_read(&per_cpu(active_asids, cpu));
+	old_active_asid = atomic64_read(this_cpu_ptr(&active_asids));
 	if (old_active_asid && asid_gen_match(asid) &&
-	    atomic64_cmpxchg_relaxed(&per_cpu(active_asids, cpu),
+	    atomic64_cmpxchg_relaxed(this_cpu_ptr(&active_asids),
 				     old_active_asid, asid))
 		goto switch_mm_fastpath;
 
@@ -236,10 +237,11 @@ void check_and_switch_context(struct mm_struct *mm, unsigned int cpu)
 		atomic64_set(&mm->context.id, asid);
 	}
 
+	cpu = smp_processor_id();
 	if (cpumask_test_and_clear_cpu(cpu, &tlb_flush_pending))
 		local_flush_tlb_all();
 
-	atomic64_set(&per_cpu(active_asids, cpu), asid);
+	atomic64_set(this_cpu_ptr(&active_asids), asid);
 	raw_spin_unlock_irqrestore(&cpu_asid_lock, flags);
 
 switch_mm_fastpath:
