From b1137eb23924f93d0d39633c48373aa491c835f4 Mon Sep 17 00:00:00 2001
From: Rik van Riel
Date: Thu, 14 Nov 2024 10:26:16 -0500
Subject: [PATCH 1/3] x86/mm/tlb: Update mm_cpumask lazily

mainline inclusion
from mainline-v6.14-rc1
category: performance

On busy multi-threaded workloads, there can be significant contention
on the mm_cpumask at context switch time.

Reduce that contention by updating mm_cpumask lazily: set the CPU bit
at context switch time (if not already set), and clear the CPU bit at
the first TLB flush sent to a CPU where the process isn't running.

When a flurry of TLB flushes for a process happens, only the first one
will be sent to CPUs where the process isn't running. The others will
be sent to CPUs where the process is currently running.

On an AMD Milan system with 36 cores, there is a noticeable difference:

  $ hackbench --groups 20 --loops 10000

  Before: ~4.5s +/- 0.1s
  After:  ~4.2s +/- 0.1s

Signed-off-by: Rik van Riel
Signed-off-by: Ingo Molnar
Cc: Dave Hansen
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Mel Gorman
Link: https://lore.kernel.org/r/20241114152723.1294686-2-riel@surriel.com
(cherry picked from commit 209954cbc7d0ce1a190fc725d20ce303d74d2680)
Signed-off-by: Wentao Guan
---
 arch/x86/kernel/alternative.c | 10 +++++++---
 arch/x86/mm/tlb.c             | 19 +++++++++----------
 2 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 6ab96bc764cfa..29170ebaf30aa 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -2053,11 +2053,18 @@ static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm)
 	return temp_state;
 }
 
+__ro_after_init struct mm_struct *poking_mm;
+__ro_after_init unsigned long poking_addr;
+
 static inline void unuse_temporary_mm(temp_mm_state_t prev_state)
 {
 	lockdep_assert_irqs_disabled();
+
 	switch_mm_irqs_off(NULL, prev_state.mm, current);
 
+	/* Clear the cpumask, to indicate no TLB flushing is needed anywhere */
+	cpumask_clear_cpu(raw_smp_processor_id(), mm_cpumask(poking_mm));
+
 	/*
 	 * Restore the breakpoints if they were disabled before the temporary mm
 	 * was loaded.
@@ -2066,9 +2073,6 @@ static inline void unuse_temporary_mm(temp_mm_state_t prev_state)
 		hw_breakpoint_restore();
 }
 
-__ro_after_init struct mm_struct *poking_mm;
-__ro_after_init unsigned long poking_addr;
-
 static void text_poke_memcpy(void *dst, const void *src, size_t len)
 {
 	memcpy(dst, src, len);
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 8629d90fdcd92..e8323f54026e7 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -607,18 +607,15 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
 		cond_mitigation(tsk);
 
 		/*
-		 * Stop remote flushes for the previous mm.
-		 * Skip kernel threads; we never send init_mm TLB flushing IPIs,
-		 * but the bitmap manipulation can cause cache line contention.
+		 * Leave this CPU in prev's mm_cpumask. Atomic writes to
+		 * mm_cpumask can be expensive under contention. The CPU
+		 * will be removed lazily at TLB flush time.
 		 */
-		if (prev != &init_mm) {
-			VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu,
-					mm_cpumask(prev)));
-			cpumask_clear_cpu(cpu, mm_cpumask(prev));
-		}
+		VM_WARN_ON_ONCE(prev != &init_mm && !cpumask_test_cpu(cpu,
+				mm_cpumask(prev)));
 
 		/* Start receiving IPIs and then read tlb_gen (and LAM below) */
-		if (next != &init_mm)
+		if (next != &init_mm && !cpumask_test_cpu(cpu, mm_cpumask(next)))
 			cpumask_set_cpu(cpu, mm_cpumask(next));
 		next_tlb_gen = atomic64_read(&next->context.tlb_gen);
 
@@ -766,8 +763,10 @@ static void flush_tlb_func(void *info)
 		count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
 
 		/* Can only happen on remote CPUs */
-		if (f->mm && f->mm != loaded_mm)
+		if (f->mm && f->mm != loaded_mm) {
+			cpumask_clear_cpu(raw_smp_processor_id(), mm_cpumask(f->mm));
 			return;
+		}
 	}
 
 	if (unlikely(loaded_mm == &init_mm))
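To make the lazy-update scheme above concrete, here is a minimal
standalone userspace sketch of the pattern this patch applies. It is
not kernel code: struct mm, loaded_mm, NR_CPUS and the function names
are stand-ins modeled after the kernel's structures, with mm_cpumask
reduced to a single atomic word.

/*
 * Userspace model of the lazy mm_cpumask scheme (hypothetical names).
 * Build: cc -std=c11 -o lazy lazy.c
 */
#include <stdatomic.h>
#include <stdio.h>

#define NR_CPUS 4

struct mm {
	atomic_ulong cpumask;	/* one bit per CPU that may cache this mm's TLB entries */
};

static struct mm *loaded_mm[NR_CPUS];	/* mm currently loaded on each CPU */

/* Context switch: set our bit only if it isn't already set, so the hot
 * path does a shared read instead of an atomic RMW on a contended line. */
static void switch_to_mm(int cpu, struct mm *next)
{
	unsigned long bit = 1UL << cpu;

	if (!(atomic_load(&next->cpumask) & bit))
		atomic_fetch_or(&next->cpumask, bit);
	loaded_mm[cpu] = next;
}

/* Flush IPI handler: a CPU that no longer runs this mm clears its own
 * bit, so the next flush for the same mm skips it entirely. */
static void flush_tlb_func(int cpu, struct mm *mm)
{
	if (loaded_mm[cpu] != mm) {
		atomic_fetch_and(&mm->cpumask, ~(1UL << cpu));
		return;	/* stale CPU: nothing to invalidate */
	}
	/* ...local TLB invalidation would go here... */
}

int main(void)
{
	struct mm a = { 0 }, b = { 0 };

	switch_to_mm(0, &a);
	switch_to_mm(1, &a);
	switch_to_mm(1, &b);	/* CPU 1 moves on; a's stale bit stays set */
	flush_tlb_func(1, &a);	/* first flush lazily clears the stale bit */
	printf("a.cpumask = %#lx\n", atomic_load(&a.cpumask));	/* 0x1 */
	return 0;
}

The design trade-off mirrors the commit message: context switches stop
paying for an atomic clear on a hot cache line, and the cost moves to
the first (and only the first) flush IPI that hits a stale CPU.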
From aee5250588874bcc4e56a5f8a288ffeee690147f Mon Sep 17 00:00:00 2001
From: Rik van Riel
Date: Thu, 14 Nov 2024 10:26:17 -0500
Subject: [PATCH 2/3] x86/mm/tlb: Add tracepoint for TLB flush IPI to stale CPU

mainline inclusion
from mainline-v6.14-rc1
category: performance

Add a tracepoint when we send a TLB flush IPI to a CPU that used
to be in the mm_cpumask, but isn't any more.

Suggested-by: Dave Hansen
Signed-off-by: Rik van Riel
Signed-off-by: Ingo Molnar
Link: https://lore.kernel.org/r/20241114152723.1294686-3-riel@surriel.com
(cherry picked from commit 2815a56e4b7252a836969f5674ee356ea1ce482c)
Signed-off-by: Wentao Guan
---
 arch/x86/mm/tlb.c        | 1 +
 include/linux/mm_types.h | 1 +
 2 files changed, 2 insertions(+)

diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index e8323f54026e7..b2aee2d56ec96 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -765,6 +765,7 @@ static void flush_tlb_func(void *info)
 		/* Can only happen on remote CPUs */
 		if (f->mm && f->mm != loaded_mm) {
 			cpumask_clear_cpu(raw_smp_processor_id(), mm_cpumask(f->mm));
+			trace_tlb_flush(TLB_REMOTE_WRONG_CPU, 0);
 			return;
 		}
 	}
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 6894de506b364..70da217e687af 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1365,6 +1365,7 @@ enum tlb_flush_reason {
 	TLB_LOCAL_SHOOTDOWN,
 	TLB_LOCAL_MM_SHOOTDOWN,
 	TLB_REMOTE_SEND_IPI,
+	TLB_REMOTE_WRONG_CPU,
 	NR_TLB_FLUSH_REASONS,
 };
 
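The new reason is reported through the existing tlb_flush tracepoint.
As a rough illustration of how a trace consumer might decode the reason
value, here is a small standalone C sketch: the enum mirrors
include/linux/mm_types.h after this patch, but the string table and
main() are hypothetical, not the kernel's own symbolic-print mapping.

#include <stdio.h>

enum tlb_flush_reason {
	TLB_FLUSH_ON_TASK_SWITCH,
	TLB_REMOTE_SHOOTDOWN,
	TLB_LOCAL_SHOOTDOWN,
	TLB_LOCAL_MM_SHOOTDOWN,
	TLB_REMOTE_SEND_IPI,
	TLB_REMOTE_WRONG_CPU,	/* new: flush IPI reached a stale CPU */
	NR_TLB_FLUSH_REASONS,
};

/* Hypothetical decoder table for a userspace trace consumer. */
static const char * const reason_str[NR_TLB_FLUSH_REASONS] = {
	[TLB_FLUSH_ON_TASK_SWITCH] = "flush on task switch",
	[TLB_REMOTE_SHOOTDOWN]     = "remote shootdown",
	[TLB_LOCAL_SHOOTDOWN]      = "local shootdown",
	[TLB_LOCAL_MM_SHOOTDOWN]   = "local mm shootdown",
	[TLB_REMOTE_SEND_IPI]      = "remote ipi send",
	[TLB_REMOTE_WRONG_CPU]     = "remote wrong cpu",
};

int main(void)
{
	printf("reason %d -> %s\n", TLB_REMOTE_WRONG_CPU,
	       reason_str[TLB_REMOTE_WRONG_CPU]);
	return 0;
}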
Peter Anvin" Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Link: https://lore.kernel.org/r/20250414135629.GA17910@noisy.programming.kicks-ass.net (cherry picked from commit 52ebfe7412ce4b3af54fe962af58efe9b25cd9a9) Signed-off-by: Wentao Guan Conflicts: arch/x86/mm/tlb.c --- arch/x86/mm/tlb.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index b2aee2d56ec96..b6c593147661a 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -606,14 +606,6 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next, */ cond_mitigation(tsk); - /* - * Leave this CPU in prev's mm_cpumask. Atomic writes to - * mm_cpumask can be expensive under contention. The CPU - * will be removed lazily at TLB flush time. - */ - VM_WARN_ON_ONCE(prev != &init_mm && !cpumask_test_cpu(cpu, - mm_cpumask(prev))); - /* Start receiving IPIs and then read tlb_gen (and LAM below) */ if (next != &init_mm && !cpumask_test_cpu(cpu, mm_cpumask(next))) cpumask_set_cpu(cpu, mm_cpumask(next));