Commit 209954c

rikvanriel authored and Ingo Molnar committed
x86/mm/tlb: Update mm_cpumask lazily
On busy multi-threaded workloads, there can be significant contention
on the mm_cpumask at context switch time.

Reduce that contention by updating mm_cpumask lazily: set the CPU bit
at context switch time (if not already set), and clear the CPU bit at
the first TLB flush sent to a CPU where the process isn't running.

When a flurry of TLB flushes for a process happens, only the first
one will be sent to CPUs where the process isn't running. The others
will be sent to CPUs where the process is currently running.

On an AMD Milan system with 36 cores, there is a noticeable
difference:

$ hackbench --groups 20 --loops 10000

Before: ~4.5s +/- 0.1s
After:  ~4.2s +/- 0.1s

Signed-off-by: Rik van Riel <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
Cc: Dave Hansen <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Mel Gorman <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
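The scheme the changelog describes can be modeled in plain C. The sketch below is illustrative only, not kernel code: struct mm, switch_in(), flush_ipi(), flush_all(), and loaded_mm[] are hypothetical stand-ins for mm_struct, switch_mm_irqs_off(), flush_tlb_func(), the IPI sender, and the per-CPU loaded mm, with a single unsigned long standing in for the cpumask:

	#include <stdatomic.h>
	#include <stdio.h>

	#define NR_CPUS 8

	/* Hypothetical stand-in: one atomic word models mm_cpumask. */
	struct mm { atomic_ulong cpumask; };

	static _Atomic(struct mm *) loaded_mm[NR_CPUS];

	/* Context switch: set the bit only if it isn't already set,
	 * and deliberately leave the previous mm's bit alone. */
	static void switch_in(struct mm *next, int cpu)
	{
		atomic_store(&loaded_mm[cpu], next);
		if (!(atomic_load(&next->cpumask) & (1UL << cpu)))
			atomic_fetch_or(&next->cpumask, 1UL << cpu);
	}

	/* Flush arriving on @cpu: if @mm isn't loaded there, the bit
	 * is stale; clear it lazily and skip the flush. */
	static void flush_ipi(struct mm *mm, int cpu)
	{
		if (atomic_load(&loaded_mm[cpu]) != mm) {
			atomic_fetch_and(&mm->cpumask, ~(1UL << cpu));
			return;
		}
		/* ... flush this CPU's TLB entries for @mm here ... */
	}

	/* Sender: only CPUs whose bit is set receive the flush. */
	static void flush_all(struct mm *mm)
	{
		unsigned long snap = atomic_load(&mm->cpumask);

		for (int cpu = 0; cpu < NR_CPUS; cpu++)
			if (snap & (1UL << cpu))
				flush_ipi(mm, cpu);
	}

	int main(void)
	{
		struct mm a = { 0 }, b = { 0 };

		switch_in(&a, 3);  /* CPU 3 runs a; bit 3 set in a's mask */
		switch_in(&b, 3);  /* CPU 3 moves on; a's bit 3 stays set */
		flush_all(&a);     /* first flush finds the stale bit, clears it */
		flush_all(&a);     /* later flushes no longer target CPU 3 */
		printf("a's bit 3 after flushes: %lu\n",
		       (atomic_load(&a.cpumask) >> 3) & 1); /* prints 0 */
		return 0;
	}

The read before atomic_fetch_or() is the contention saver: in the common case the bit is already set, so the shared cache line is only read, never dirtied, and the second flush_all() skips CPU 3 entirely.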
1 parent 7e33001 commit 209954c

File tree

2 files changed: +16 -13 lines changed

arch/x86/kernel/alternative.c
7 additions, 3 deletions

@@ -1825,11 +1825,18 @@ static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm)
 	return temp_state;
 }
 
+__ro_after_init struct mm_struct *poking_mm;
+__ro_after_init unsigned long poking_addr;
+
 static inline void unuse_temporary_mm(temp_mm_state_t prev_state)
 {
 	lockdep_assert_irqs_disabled();
+
 	switch_mm_irqs_off(NULL, prev_state.mm, current);
 
+	/* Clear the cpumask, to indicate no TLB flushing is needed anywhere */
+	cpumask_clear_cpu(raw_smp_processor_id(), mm_cpumask(poking_mm));
+
 	/*
 	 * Restore the breakpoints if they were disabled before the temporary mm
 	 * was loaded.
@@ -1838,9 +1845,6 @@ static inline void unuse_temporary_mm(temp_mm_state_t prev_state)
 		hw_breakpoint_restore();
 }
 
-__ro_after_init struct mm_struct *poking_mm;
-__ro_after_init unsigned long poking_addr;
-
 static void text_poke_memcpy(void *dst, const void *src, size_t len)
 {
 	memcpy(dst, src, len);
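A note on this hunk: the poking_mm and poking_addr declarations move above unuse_temporary_mm() so that poking_mm is in scope there, and the function now clears this CPU's bit itself, because with lazy updates switching away no longer removes the CPU from the previous mm's mask. In terms of the user-space sketch after the changelog (unuse_temporary, poking, and prev are hypothetical names reusing that sketch's types):

	/* Hypothetical continuation of the sketch above: after
	 * switching back to prev, explicitly drop this CPU from the
	 * temporary mm's mask, so no later flush_all(&poking) sends
	 * work to this CPU. */
	static void unuse_temporary(struct mm *poking, struct mm *prev, int cpu)
	{
		switch_in(prev, cpu);
		atomic_fetch_and(&poking->cpumask, ~(1UL << cpu));
	}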

arch/x86/mm/tlb.c
9 additions, 10 deletions

@@ -606,18 +606,15 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
 		cond_mitigation(tsk);
 
 		/*
-		 * Stop remote flushes for the previous mm.
-		 * Skip kernel threads; we never send init_mm TLB flushing IPIs,
-		 * but the bitmap manipulation can cause cache line contention.
+		 * Leave this CPU in prev's mm_cpumask. Atomic writes to
+		 * mm_cpumask can be expensive under contention. The CPU
+		 * will be removed lazily at TLB flush time.
 		 */
-		if (prev != &init_mm) {
-			VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu,
-					mm_cpumask(prev)));
-			cpumask_clear_cpu(cpu, mm_cpumask(prev));
-		}
+		VM_WARN_ON_ONCE(prev != &init_mm && !cpumask_test_cpu(cpu,
+					mm_cpumask(prev)));
 
 		/* Start receiving IPIs and then read tlb_gen (and LAM below) */
-		if (next != &init_mm)
+		if (next != &init_mm && !cpumask_test_cpu(cpu, mm_cpumask(next)))
 			cpumask_set_cpu(cpu, mm_cpumask(next));
 		next_tlb_gen = atomic64_read(&next->context.tlb_gen);
 
@@ -761,8 +758,10 @@ static void flush_tlb_func(void *info)
 		count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
 
 		/* Can only happen on remote CPUs */
-		if (f->mm && f->mm != loaded_mm)
+		if (f->mm && f->mm != loaded_mm) {
+			cpumask_clear_cpu(raw_smp_processor_id(), mm_cpumask(f->mm));
 			return;
+		}
 	}
 
 	if (unlikely(loaded_mm == &init_mm))
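The new comment in switch_mm_irqs_off() is the heart of the change: an atomic RMW on a heavily shared cache line can be far more expensive than a read that usually hits in cache. A self-contained user-space sketch of that effect (illustrative only; the thread count, iteration count, and all names here are made up, and timing is left to time(1)):

	/* Build: cc -O2 -pthread lazy_bit.c
	 * Run:   time ./a.out 0   (eager: atomic RMW every iteration)
	 *        time ./a.out 1   (lazy: read first, RMW only if needed) */
	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdio.h>
	#include <stdlib.h>

	#define NTHREADS 8
	#define ITERS    (1L << 24)

	static atomic_ulong word;	/* one shared cache line, like mm_cpumask */
	static int lazy;

	static void *worker(void *arg)
	{
		unsigned long bit = 1UL << ((long)arg & 63);

		for (long i = 0; i < ITERS; i++) {
			if (lazy && (atomic_load(&word) & bit))
				continue;	/* already set: plain read, line stays shared */
			atomic_fetch_or(&word, bit);	/* RMW: line bounces between CPUs */
		}
		return NULL;
	}

	int main(int argc, char **argv)
	{
		pthread_t tid[NTHREADS];

		lazy = argc > 1 && atoi(argv[1]);
		for (long t = 0; t < NTHREADS; t++)
			pthread_create(&tid[t], NULL, worker, (void *)t);
		for (int t = 0; t < NTHREADS; t++)
			pthread_join(tid[t], NULL);
		printf("%s: word=%#lx\n", lazy ? "lazy" : "eager",
		       atomic_load(&word));
		return 0;
	}

The same trick appears twice in the patch above: cpumask_test_cpu() before cpumask_set_cpu() at switch-in, and deferring cpumask_clear_cpu() to flush time instead of doing it on every context switch.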
