Commit 4ce94ea

anadav authored and Ingo Molnar committed

x86/mm/tlb: Flush remote and local TLBs concurrently

To improve TLB shootdown performance, flush the remote and local TLBs concurrently. Introduce flush_tlb_multi() that does so. Introduce paravirtual versions of flush_tlb_multi() for KVM, Xen and hyper-v (the Xen and hyper-v versions are only compile-tested).

While the updated smp infrastructure is capable of running a function on a single local core, it is not optimized for this case. The multiple function calls and the indirect branch introduce some overhead, and might make local TLB flushes slower than they were before the recent changes.

Before calling the SMP infrastructure, check whether only a local TLB flush is needed, to restore the lost performance in this common case. This requires checking mm_cpumask() one more time, but unless this mask is updated very frequently, it should not impact performance negatively.

Signed-off-by: Nadav Amit <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
Reviewed-by: Michael Kelley <[email protected]> # Hyper-v parts
Reviewed-by: Juergen Gross <[email protected]> # Xen and paravirt parts
Reviewed-by: Dave Hansen <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
1 parent 6035152 commit 4ce94ea

10 files changed (+57 lines, -41 lines)
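
The core of the change is the check it places in front of the SMP layer. The following is a condensed sketch of that fast path, taken from the flush_tlb_mm_range() hunk in arch/x86/mm/tlb.c below (flush_tlb_info setup and the get_cpu()/put_cpu() pairing are elided):

    /* Condensed sketch; see the full hunk in arch/x86/mm/tlb.c below. */
    if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) {
        /* Another CPU also runs this mm: flush local + remote TLBs concurrently. */
        flush_tlb_multi(mm_cpumask(mm), info);
    } else if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
        /* Only this CPU is affected: flush locally, skipping the SMP layer. */
        local_irq_disable();
        flush_tlb_func(info);
        local_irq_enable();
    }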

arch/x86/hyperv/mmu.c

Lines changed: 5 additions & 5 deletions
@@ -52,16 +52,16 @@ static inline int fill_gva_list(u64 gva_list[], int offset,
     return gva_n - offset;
 }
 
-static void hyperv_flush_tlb_others(const struct cpumask *cpus,
-                                    const struct flush_tlb_info *info)
+static void hyperv_flush_tlb_multi(const struct cpumask *cpus,
+                                   const struct flush_tlb_info *info)
 {
     int cpu, vcpu, gva_n, max_gvas;
     struct hv_tlb_flush **flush_pcpu;
     struct hv_tlb_flush *flush;
     u64 status = U64_MAX;
     unsigned long flags;
 
-    trace_hyperv_mmu_flush_tlb_others(cpus, info);
+    trace_hyperv_mmu_flush_tlb_multi(cpus, info);
 
     if (!hv_hypercall_pg)
         goto do_native;
@@ -164,7 +164,7 @@ static void hyperv_flush_tlb_others(const struct cpumask *cpus,
     if (!(status & HV_HYPERCALL_RESULT_MASK))
         return;
 do_native:
-    native_flush_tlb_others(cpus, info);
+    native_flush_tlb_multi(cpus, info);
 }
 
 static u64 hyperv_flush_tlb_others_ex(const struct cpumask *cpus,
@@ -239,6 +239,6 @@ void hyperv_setup_mmu_ops(void)
         return;
 
     pr_info("Using hypercall for remote TLB flush\n");
-    pv_ops.mmu.flush_tlb_others = hyperv_flush_tlb_others;
+    pv_ops.mmu.flush_tlb_multi = hyperv_flush_tlb_multi;
     pv_ops.mmu.tlb_remove_table = tlb_remove_table;
 }

arch/x86/include/asm/paravirt.h

Lines changed: 3 additions & 3 deletions
@@ -50,7 +50,7 @@ static inline void slow_down_io(void)
 void native_flush_tlb_local(void);
 void native_flush_tlb_global(void);
 void native_flush_tlb_one_user(unsigned long addr);
-void native_flush_tlb_others(const struct cpumask *cpumask,
+void native_flush_tlb_multi(const struct cpumask *cpumask,
                              const struct flush_tlb_info *info);
 
 static inline void __flush_tlb_local(void)
@@ -68,10 +68,10 @@ static inline void __flush_tlb_one_user(unsigned long addr)
     PVOP_VCALL1(mmu.flush_tlb_one_user, addr);
 }
 
-static inline void __flush_tlb_others(const struct cpumask *cpumask,
+static inline void __flush_tlb_multi(const struct cpumask *cpumask,
                                       const struct flush_tlb_info *info)
 {
-    PVOP_VCALL2(mmu.flush_tlb_others, cpumask, info);
+    PVOP_VCALL2(mmu.flush_tlb_multi, cpumask, info);
 }
 
 static inline void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table)

arch/x86/include/asm/paravirt_types.h

Lines changed: 2 additions & 2 deletions
@@ -188,8 +188,8 @@ struct pv_mmu_ops {
     void (*flush_tlb_user)(void);
     void (*flush_tlb_kernel)(void);
     void (*flush_tlb_one_user)(unsigned long addr);
-    void (*flush_tlb_others)(const struct cpumask *cpus,
-                             const struct flush_tlb_info *info);
+    void (*flush_tlb_multi)(const struct cpumask *cpus,
+                            const struct flush_tlb_info *info);
 
     void (*tlb_remove_table)(struct mmu_gather *tlb, void *table);

arch/x86/include/asm/tlbflush.h

Lines changed: 2 additions & 2 deletions
@@ -175,7 +175,7 @@ extern void initialize_tlbstate_and_flush(void);
  *  - flush_tlb_page(vma, vmaddr) flushes one page
  *  - flush_tlb_range(vma, start, end) flushes a range of pages
  *  - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
- *  - flush_tlb_others(cpumask, info) flushes TLBs on other cpus
+ *  - flush_tlb_multi(cpumask, info) flushes TLBs on multiple cpus
  *
  * ..but the i386 has somewhat limited tlb flushing capabilities,
  * and page-granular flushes are available only on i486 and up.
@@ -209,7 +209,7 @@ struct flush_tlb_info {
 void flush_tlb_local(void);
 void flush_tlb_one_user(unsigned long addr);
 void flush_tlb_one_kernel(unsigned long addr);
-void flush_tlb_others(const struct cpumask *cpumask,
+void flush_tlb_multi(const struct cpumask *cpumask,
                       const struct flush_tlb_info *info);
 
 #ifdef CONFIG_PARAVIRT

arch/x86/include/asm/trace/hyperv.h

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@
 
 #if IS_ENABLED(CONFIG_HYPERV)
 
-TRACE_EVENT(hyperv_mmu_flush_tlb_others,
+TRACE_EVENT(hyperv_mmu_flush_tlb_multi,
     TP_PROTO(const struct cpumask *cpus,
              const struct flush_tlb_info *info),
     TP_ARGS(cpus, info),

arch/x86/kernel/kvm.c

Lines changed: 8 additions & 3 deletions
@@ -613,7 +613,7 @@ static int kvm_cpu_down_prepare(unsigned int cpu)
 }
 #endif
 
-static void kvm_flush_tlb_others(const struct cpumask *cpumask,
+static void kvm_flush_tlb_multi(const struct cpumask *cpumask,
             const struct flush_tlb_info *info)
 {
     u8 state;
@@ -627,6 +627,11 @@ static void kvm_flush_tlb_others(const struct cpumask *cpumask,
      * queue flush_on_enter for pre-empted vCPUs
      */
     for_each_cpu(cpu, flushmask) {
+        /*
+         * The local vCPU is never preempted, so we do not explicitly
+         * skip check for local vCPU - it will never be cleared from
+         * flushmask.
+         */
         src = &per_cpu(steal_time, cpu);
         state = READ_ONCE(src->preempted);
         if ((state & KVM_VCPU_PREEMPTED)) {
@@ -636,7 +641,7 @@ static void kvm_flush_tlb_others(const struct cpumask *cpumask,
         }
     }
 
-    native_flush_tlb_others(flushmask, info);
+    native_flush_tlb_multi(flushmask, info);
 }
 
 static void __init kvm_guest_init(void)
@@ -654,7 +659,7 @@ static void __init kvm_guest_init(void)
     }
 
     if (pv_tlb_flush_supported()) {
-        pv_ops.mmu.flush_tlb_others = kvm_flush_tlb_others;
+        pv_ops.mmu.flush_tlb_multi = kvm_flush_tlb_multi;
         pv_ops.mmu.tlb_remove_table = tlb_remove_table;
         pr_info("KVM setup pv remote TLB flush\n");
     }

arch/x86/kernel/paravirt.c

Lines changed: 1 addition & 1 deletion
@@ -330,7 +330,7 @@ struct paravirt_patch_template pv_ops = {
     .mmu.flush_tlb_user     = native_flush_tlb_local,
     .mmu.flush_tlb_kernel   = native_flush_tlb_global,
     .mmu.flush_tlb_one_user = native_flush_tlb_one_user,
-    .mmu.flush_tlb_others   = native_flush_tlb_others,
+    .mmu.flush_tlb_multi    = native_flush_tlb_multi,
     .mmu.tlb_remove_table   =
         (void (*)(struct mmu_gather *, void *))tlb_remove_page,

arch/x86/mm/tlb.c

Lines changed: 29 additions & 17 deletions
@@ -24,7 +24,7 @@
 # define __flush_tlb_local              native_flush_tlb_local
 # define __flush_tlb_global             native_flush_tlb_global
 # define __flush_tlb_one_user(addr)     native_flush_tlb_one_user(addr)
-# define __flush_tlb_others(msk, info)  native_flush_tlb_others(msk, info)
+# define __flush_tlb_multi(msk, info)   native_flush_tlb_multi(msk, info)
 #endif
 
 /*
@@ -490,7 +490,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
     /*
      * Even in lazy TLB mode, the CPU should stay set in the
      * mm_cpumask. The TLB shootdown code can figure out from
-     * from cpu_tlbstate.is_lazy whether or not to send an IPI.
+     * cpu_tlbstate.is_lazy whether or not to send an IPI.
      */
     if (WARN_ON_ONCE(real_prev != &init_mm &&
                      !cpumask_test_cpu(cpu, mm_cpumask(next))))
@@ -697,7 +697,7 @@ static void flush_tlb_func(void *info)
      * garbage into our TLB. Since switching to init_mm is barely
      * slower than a minimal flush, just switch to init_mm.
      *
-     * This should be rare, with native_flush_tlb_others skipping
+     * This should be rare, with native_flush_tlb_multi() skipping
      * IPIs to lazy TLB mode CPUs.
      */
     switch_mm_irqs_off(NULL, &init_mm, NULL);
@@ -795,9 +795,14 @@ static bool tlb_is_not_lazy(int cpu)
 
 static DEFINE_PER_CPU(cpumask_t, flush_tlb_mask);
 
-STATIC_NOPV void native_flush_tlb_others(const struct cpumask *cpumask,
+STATIC_NOPV void native_flush_tlb_multi(const struct cpumask *cpumask,
                                          const struct flush_tlb_info *info)
 {
+    /*
+     * Do accounting and tracing. Note that there are (and have always been)
+     * cases in which a remote TLB flush will be traced, but eventually
+     * would not happen.
+     */
     count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
     if (info->end == TLB_FLUSH_ALL)
         trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL);
@@ -816,8 +821,7 @@ STATIC_NOPV void native_flush_tlb_others(const struct cpumask *cpumask,
      * doing a speculative memory access.
      */
     if (info->freed_tables) {
-        smp_call_function_many(cpumask, flush_tlb_func,
-                               (void *)info, 1);
+        on_each_cpu_mask(cpumask, flush_tlb_func, (void *)info, true);
     } else {
         /*
          * Although we could have used on_each_cpu_cond_mask(),
@@ -844,14 +848,14 @@ STATIC_NOPV void native_flush_tlb_others(const struct cpumask *cpumask,
             if (tlb_is_not_lazy(cpu))
                 __cpumask_set_cpu(cpu, cond_cpumask);
         }
-        smp_call_function_many(cond_cpumask, flush_tlb_func, (void *)info, 1);
+        on_each_cpu_mask(cond_cpumask, flush_tlb_func, (void *)info, true);
     }
 }
 
-void flush_tlb_others(const struct cpumask *cpumask,
+void flush_tlb_multi(const struct cpumask *cpumask,
                       const struct flush_tlb_info *info)
 {
-    __flush_tlb_others(cpumask, info);
+    __flush_tlb_multi(cpumask, info);
 }
 
 /*
@@ -931,16 +935,20 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
     info = get_flush_tlb_info(mm, start, end, stride_shift, freed_tables,
                               new_tlb_gen);
 
-    if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
+    /*
+     * flush_tlb_multi() is not optimized for the common case in which only
+     * a local TLB flush is needed. Optimize this use-case by calling
+     * flush_tlb_func_local() directly in this case.
+     */
+    if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) {
+        flush_tlb_multi(mm_cpumask(mm), info);
+    } else if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
         lockdep_assert_irqs_enabled();
         local_irq_disable();
         flush_tlb_func(info);
         local_irq_enable();
     }
 
-    if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
-        flush_tlb_others(mm_cpumask(mm), info);
-
     put_flush_tlb_info();
     put_cpu();
 }
@@ -1152,16 +1160,20 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
     int cpu = get_cpu();
 
     info = get_flush_tlb_info(NULL, 0, TLB_FLUSH_ALL, 0, false, 0);
-    if (cpumask_test_cpu(cpu, &batch->cpumask)) {
+    /*
+     * flush_tlb_multi() is not optimized for the common case in which only
+     * a local TLB flush is needed. Optimize this use-case by calling
+     * flush_tlb_func_local() directly in this case.
+     */
+    if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
+        flush_tlb_multi(&batch->cpumask, info);
+    } else if (cpumask_test_cpu(cpu, &batch->cpumask)) {
         lockdep_assert_irqs_enabled();
         local_irq_disable();
         flush_tlb_func(info);
         local_irq_enable();
     }
 
-    if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids)
-        flush_tlb_others(&batch->cpumask, info);
-
     cpumask_clear(&batch->cpumask);
 
     put_flush_tlb_info();
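
What actually makes the local and remote flushes concurrent is the switch from smp_call_function_many() to on_each_cpu_mask() above: smp_call_function_many() always skips the calling CPU, so the local flush used to run separately after the remote ones were kicked off, whereas on_each_cpu_mask() also invokes the function on the calling CPU when it is set in the mask. Below is a rough model of that behavior under the smp rework this series builds on; on_each_cpu_mask_model() is a hypothetical illustration, not the kernel/smp.c implementation:

    static void on_each_cpu_mask_model(const struct cpumask *mask,
                                       smp_call_func_t func, void *info)
    {
        int cpu = get_cpu();

        /* Kick the remote CPUs in mask without waiting for them... */
        smp_call_function_many(mask, func, info, false);

        /* ...and run the function locally while they handle their IPIs. */
        if (cpumask_test_cpu(cpu, mask)) {
            unsigned long flags;

            local_irq_save(flags);
            func(info);
            local_irq_restore(flags);
        }

        /*
         * The real code then waits for the remote handlers when wait == true,
         * so the local invocation overlaps the remote ones instead of
         * trailing them.
         */
        put_cpu();
    }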

arch/x86/xen/mmu_pv.c

Lines changed: 5 additions & 6 deletions
@@ -1247,8 +1247,8 @@ static void xen_flush_tlb_one_user(unsigned long addr)
     preempt_enable();
 }
 
-static void xen_flush_tlb_others(const struct cpumask *cpus,
-                                 const struct flush_tlb_info *info)
+static void xen_flush_tlb_multi(const struct cpumask *cpus,
+                                const struct flush_tlb_info *info)
 {
     struct {
         struct mmuext_op op;
@@ -1258,7 +1258,7 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,
     const size_t mc_entry_size = sizeof(args->op) +
         sizeof(args->mask[0]) * BITS_TO_LONGS(num_possible_cpus());
 
-    trace_xen_mmu_flush_tlb_others(cpus, info->mm, info->start, info->end);
+    trace_xen_mmu_flush_tlb_multi(cpus, info->mm, info->start, info->end);
 
     if (cpumask_empty(cpus))
         return;         /* nothing to do */
@@ -1267,9 +1267,8 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,
     args = mcs.args;
     args->op.arg2.vcpumask = to_cpumask(args->mask);
 
-    /* Remove us, and any offline CPUS. */
+    /* Remove any offline CPUs */
     cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
-    cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
 
     args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
     if (info->end != TLB_FLUSH_ALL &&
@@ -2086,7 +2085,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = {
     .flush_tlb_user = xen_flush_tlb,
     .flush_tlb_kernel = xen_flush_tlb,
     .flush_tlb_one_user = xen_flush_tlb_one_user,
-    .flush_tlb_others = xen_flush_tlb_others,
+    .flush_tlb_multi = xen_flush_tlb_multi,
     .tlb_remove_table = tlb_remove_table,
 
     .pgd_alloc = xen_pgd_alloc,

include/trace/events/xen.h

Lines changed: 1 addition & 1 deletion
@@ -346,7 +346,7 @@ TRACE_EVENT(xen_mmu_flush_tlb_one_user,
     TP_printk("addr %lx", __entry->addr)
 );
 
-TRACE_EVENT(xen_mmu_flush_tlb_others,
+TRACE_EVENT(xen_mmu_flush_tlb_multi,
     TP_PROTO(const struct cpumask *cpus, struct mm_struct *mm,
              unsigned long addr, unsigned long end),
     TP_ARGS(cpus, mm, addr, end),
