diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index fadec20b87a8e3..af3944cefd4efd 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -925,6 +925,18 @@ config RISCV_VECTOR_MISALIGNED
 	help
 	  Enable detecting support for vector misaligned loads and stores.
 
+config RISCV_LAZY_TLB_FLUSH
+	bool "Defer TLB Flush to context switch to avoid IPIs"
+	depends on MMU && SMP
+	default n
+	help
+	  This option avoids unnecessary TLB flush IPIs. After the memory
+	  mappings of an mm_struct are modified, instead of sending IPIs the
+	  flush information is recorded in a per-CPU buffer, and the flush is
+	  deferred until a target CPU actually loads that mm_struct again.
+
+	  If unsure what to do here, say N.
+
 choice
 	prompt "Unaligned Accesses Support"
 	default RISCV_PROBE_UNALIGNED_ACCESS
diff --git a/arch/riscv/include/asm/mmu.h b/arch/riscv/include/asm/mmu.h
index cf8e6eac77d520..913fa535b3d19b 100644
--- a/arch/riscv/include/asm/mmu.h
+++ b/arch/riscv/include/asm/mmu.h
@@ -30,6 +30,10 @@ typedef struct {
 #ifdef CONFIG_RISCV_ISA_SUPM
 	u8 pmlen;
 #endif
+#ifdef CONFIG_RISCV_LAZY_TLB_FLUSH
+	atomic_t lazy_tlb_cnt;
+	void *next;
+#endif
 } mm_context_t;
 
 /* Lock the pointer masking mode because this mm is multithreaded */
diff --git a/arch/riscv/include/asm/mmu_context.h b/arch/riscv/include/asm/mmu_context.h
index 8c4bc49a3a0f5b..bc73cc3262ae6c 100644
--- a/arch/riscv/include/asm/mmu_context.h
+++ b/arch/riscv/include/asm/mmu_context.h
@@ -16,6 +16,11 @@
 void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 	struct task_struct *task);
 
+#ifdef CONFIG_RISCV_LAZY_TLB_FLUSH
+#define arch_do_shoot_lazy_tlb arch_do_shoot_lazy_tlb
+void arch_do_shoot_lazy_tlb(void *arg);
+#endif
+
 #define activate_mm activate_mm
 static inline void activate_mm(struct mm_struct *prev,
 			       struct mm_struct *next)
diff --git a/arch/riscv/include/asm/tlbflush.h b/arch/riscv/include/asm/tlbflush.h
index eed0abc4051436..fd62b27172d4a4 100644
--- a/arch/riscv/include/asm/tlbflush.h
+++ b/arch/riscv/include/asm/tlbflush.h
@@ -15,6 +15,11 @@
 #define FLUSH_TLB_NO_ASID	((unsigned long)-1)
 
 #ifdef CONFIG_MMU
+static inline unsigned long get_mm_asid(struct mm_struct *mm)
+{
+	return mm ? cntx2asid(atomic_long_read(&mm->context.id)) : FLUSH_TLB_NO_ASID;
+}
+
 static inline void local_flush_tlb_all(void)
 {
 	__asm__ __volatile__ ("sfence.vma" : : : "memory");
@@ -66,6 +71,64 @@ void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
 void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);
 
 extern unsigned long tlb_flush_all_threshold;
+
+#ifdef CONFIG_RISCV_LAZY_TLB_FLUSH
+
+#define MAX_LOADED_MM 6
+#define MAX_TLB_FLUSH_TASK 32
+#define FLUSH_TLB_ALL_ASID 0x1
+
+struct tlb_context {
+	struct mm_struct *mm;
+	unsigned int gen;
+	bool need_flush;
+};
+
+struct tlb_flush_task {
+	unsigned long start;
+	unsigned long size;
+	unsigned long stride;
+};
+
+struct tlb_flush_queue {
+	atomic_t len;
+	unsigned int flag;
+	struct tlb_flush_task tasks[MAX_TLB_FLUSH_TASK];
+} ____cacheline_aligned_in_smp;
+
+struct tlb_info {
+	rwlock_t rwlock;
+	struct mm_struct *active_mm;
+	unsigned int next_gen;
+	struct tlb_context contexts[MAX_LOADED_MM];
+	struct tlb_flush_queue *flush_queues;
+};
+
+DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_info, tlbinfo);
+
+void local_load_tlb_mm(struct mm_struct *mm);
+void local_flush_tlb_mm(struct mm_struct *mm);
+void local_flush_tlb_all_mm(void);
+void __init lazy_tlb_flush_init(void);
+
+#else /* CONFIG_RISCV_LAZY_TLB_FLUSH */
+
+static inline void local_load_tlb_mm(struct mm_struct *mm) {}
+
+static inline void local_flush_tlb_mm(struct mm_struct *mm)
+{
+	local_flush_tlb_all_asid(get_mm_asid(mm));
+}
+
+static inline void local_flush_tlb_all_mm(void)
+{
+	local_flush_tlb_all();
+}
+
+static inline void lazy_tlb_flush_init(void) {}
+
+#endif /* CONFIG_RISCV_LAZY_TLB_FLUSH */
+
 #else /* CONFIG_MMU */
 #define local_flush_tlb_all()			do { } while (0)
 #endif /* CONFIG_MMU */
diff --git a/arch/riscv/mm/context.c b/arch/riscv/mm/context.c
index 55c20ad1f74443..b6657681948f96 100644
--- a/arch/riscv/mm/context.c
+++ b/arch/riscv/mm/context.c
@@ -194,7 +194,7 @@ static void set_mm_asid(struct mm_struct *mm, unsigned int cpu)
 		  satp_mode);
 
 	if (need_flush_tlb)
-		local_flush_tlb_all();
+		local_flush_tlb_all_mm();
 }
 
 static void set_mm_noasid(struct mm_struct *mm)
@@ -217,6 +217,7 @@ static inline void set_mm(struct mm_struct *prev,
 	 */
 	cpumask_set_cpu(cpu, mm_cpumask(next));
 	if (static_branch_unlikely(&use_asid_allocator)) {
+		local_load_tlb_mm(next);
 		set_mm_asid(next, cpu);
 	} else {
 		cpumask_clear_cpu(cpu, mm_cpumask(prev));
@@ -262,6 +263,8 @@ static int __init asids_init(void)
 
 	__set_bit(0, context_asid_map);
 
+	lazy_tlb_flush_init();
+
 	static_branch_enable(&use_asid_allocator);
 
 	pr_info("ASID allocator using %lu bits (%lu entries)\n",
@@ -273,6 +276,25 @@ static int __init asids_init(void)
 	return 0;
 }
 early_initcall(asids_init);
+
+#ifdef CONFIG_RISCV_LAZY_TLB_FLUSH
+void arch_do_shoot_lazy_tlb(void *arg)
+{
+	struct mm_struct *mm = arg;
+
+	if (current->active_mm == mm) {
+		WARN_ON_ONCE(current->mm);
+		current->active_mm = &init_mm;
+		switch_mm(mm, &init_mm, current);
+	}
+
+	if (!static_branch_unlikely(&use_asid_allocator) || !mm)
+		return;
+
+	local_flush_tlb_mm(mm);
+}
+#endif /* CONFIG_RISCV_LAZY_TLB_FLUSH */
+
 #else
 static inline void set_mm(struct mm_struct *prev, struct mm_struct *next,
 	unsigned int cpu)
diff --git a/arch/riscv/mm/tlbflush.c b/arch/riscv/mm/tlbflush.c
index 8404530ec00f93..73c0a7ef61cb1f 100644
--- a/arch/riscv/mm/tlbflush.c
+++ b/arch/riscv/mm/tlbflush.c
@@ -35,7 +35,8 @@ static inline void local_sinval_vma(unsigned long vma, unsigned long asid)
  */
 unsigned long tlb_flush_all_threshold __read_mostly = 64;
 
-static void local_flush_tlb_range_threshold_asid(unsigned long start,
+static void local_flush_tlb_range_threshold_asid(struct mm_struct *mm,
+						 unsigned long start,
 						 unsigned long size,
 						 unsigned long stride,
 						 unsigned long asid)
@@ -44,7 +45,7 @@ static void local_flush_tlb_range_threshold_asid(unsigned long start,
 	int i;
 
 	if (nr_ptes_in_range > tlb_flush_all_threshold) {
-		local_flush_tlb_all_asid(asid);
+		local_flush_tlb_mm(mm);
 		return;
 	}
 
@@ -64,32 +65,37 @@ static void local_flush_tlb_range_threshold_asid(unsigned long start,
 	}
 }
 
-static inline void local_flush_tlb_range_asid(unsigned long start,
-		unsigned long size, unsigned long stride, unsigned long asid)
+static inline void local_flush_tlb_range_asid(struct mm_struct *mm,
+					       unsigned long start,
+					       unsigned long size,
+					       unsigned long stride,
+					       unsigned long asid)
 {
-	if (size <= stride)
+	if (size <= stride) {
 		local_flush_tlb_page_asid(start, asid);
-	else if (size == FLUSH_TLB_MAX_SIZE)
-		local_flush_tlb_all_asid(asid);
-	else
-		local_flush_tlb_range_threshold_asid(start, size, stride, asid);
+	} else if (size == FLUSH_TLB_MAX_SIZE) {
+		local_flush_tlb_mm(mm);
+	} else {
+		local_flush_tlb_range_threshold_asid(mm, start, size, stride,
+						     asid);
+	}
 }
 
 /* Flush a range of kernel pages without broadcasting */
 void local_flush_tlb_kernel_range(unsigned long start, unsigned long end)
 {
-	local_flush_tlb_range_asid(start, end - start, PAGE_SIZE, FLUSH_TLB_NO_ASID);
+	local_flush_tlb_range_asid(NULL, start, end - start, PAGE_SIZE, FLUSH_TLB_NO_ASID);
 }
 
 static void __ipi_flush_tlb_all(void *info)
 {
-	local_flush_tlb_all();
+	local_flush_tlb_all_mm();
 }
 
 void flush_tlb_all(void)
 {
 	if (num_online_cpus() < 2)
-		local_flush_tlb_all();
+		local_flush_tlb_all_mm();
 	else if (riscv_use_sbi_for_rfence())
 		sbi_remote_sfence_vma_asid(NULL, 0, FLUSH_TLB_MAX_SIZE, FLUSH_TLB_NO_ASID);
 	else
@@ -97,22 +103,114 @@ void flush_tlb_all(void)
 }
 
 struct flush_tlb_range_data {
+	struct mm_struct *mm;
 	unsigned long asid;
 	unsigned long start;
 	unsigned long size;
 	unsigned long stride;
 };
 
-static void __ipi_flush_tlb_range_asid(void *info)
+#ifdef CONFIG_RISCV_LAZY_TLB_FLUSH
+
+DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_info, tlbinfo) = {
+	.rwlock = __RW_LOCK_UNLOCKED(tlbinfo.rwlock),
+	.active_mm = NULL,
+	.next_gen = 1,
+	.contexts = { { NULL, 0, false, }, },
+};
+
+static DEFINE_PER_CPU(mm_context_t *, mmdrop_victims);
+
+static void mmdrop_lazy_mms(struct tasklet_struct *tasklet)
 {
-	struct flush_tlb_range_data *d = info;
+	mm_context_t *victim = xchg_relaxed(this_cpu_ptr(&mmdrop_victims), NULL);
+	struct mm_struct *mm = NULL;
+
+	while (victim) {
+		mm = container_of(victim, struct mm_struct, context);
+		while (atomic_dec_return_relaxed(&victim->lazy_tlb_cnt) != 0)
+			mmdrop_lazy_tlb(mm);
+		victim = victim->next;
+	}
+}
+
+static DEFINE_PER_CPU(struct tasklet_struct, mmdrop_tasklets) = {
+	.count = ATOMIC_INIT(0),
+	.callback = mmdrop_lazy_mms,
+	.use_callback = true,
+};
+
+static inline void mmgrab_lazy_mm(struct mm_struct *mm)
+{
+	mmgrab_lazy_tlb(mm);
+	atomic_inc(&mm->context.lazy_tlb_cnt);
+}
+
+static inline void mmdrop_lazy_mm(struct mm_struct *mm)
+{
+	mm_context_t **head, *list, *context = &mm->context;
+
+	if (atomic_inc_return_relaxed(&context->lazy_tlb_cnt) == 1) {
+		head = this_cpu_ptr(&mmdrop_victims);
+
+		do {
+			list = *head;
+			context->next = list;
+		} while (cmpxchg_relaxed(head, list, context) != list);
-	local_flush_tlb_range_asid(d->start, d->size, d->stride, d->asid);
+		tasklet_schedule(this_cpu_ptr(&mmdrop_tasklets));
+	}
 }
 
-static inline unsigned long get_mm_asid(struct mm_struct *mm)
+static bool should_ipi_flush(int cpu, void *data)
 {
-	return mm ? cntx2asid(atomic_long_read(&mm->context.id)) : FLUSH_TLB_NO_ASID;
+	struct tlb_info *info = per_cpu_ptr(&tlbinfo, cpu);
+	struct tlb_context *contexts = info->contexts;
+	struct tlb_flush_queue *queue = NULL;
+	struct flush_tlb_range_data *ftd = data;
+	unsigned int i, index;
+	unsigned long flags;
+
+	if (info->active_mm == ftd->mm)
+		return true;
+
+	read_lock_irqsave(&info->rwlock, flags);
+
+	if (info->active_mm == ftd->mm) {
+		read_unlock_irqrestore(&info->rwlock, flags);
+		return true;
+	}
+
+	for (i = 0; i < MAX_LOADED_MM; i++) {
+		if (contexts[i].mm != ftd->mm)
+			continue;
+
+		queue = &info->flush_queues[i];
+		index = atomic_fetch_add_unless(&queue->len, 1, MAX_TLB_FLUSH_TASK);
+		if (index < MAX_TLB_FLUSH_TASK) {
+			queue->tasks[index].start = ftd->start;
+			queue->tasks[index].stride = ftd->stride;
+			queue->tasks[index].size = ftd->size;
+		} else {
+			queue->flag |= FLUSH_TLB_ALL_ASID;
+		}
+		contexts[i].need_flush = true;
+		break;
+	}
+
+	read_unlock_irqrestore(&info->rwlock, flags);
+
+	return false;
+}
+
+#endif /* CONFIG_RISCV_LAZY_TLB_FLUSH */
+
+static void __ipi_flush_tlb_range_asid(void *info)
+{
+	struct flush_tlb_range_data *d = info;
+
+	local_flush_tlb_range_asid(d->mm, d->start, d->size, d->stride, d->asid);
 }
 
 static void __flush_tlb_range(struct mm_struct *mm,
@@ -130,17 +228,26 @@ static void __flush_tlb_range(struct mm_struct *mm,
 
 	/* Check if the TLB flush needs to be sent to other CPUs. */
 	if (cpumask_any_but(cmask, cpu) >= nr_cpu_ids) {
-		local_flush_tlb_range_asid(start, size, stride, asid);
+		local_flush_tlb_range_asid(mm, start, size, stride, asid);
 	} else if (riscv_use_sbi_for_rfence()) {
 		sbi_remote_sfence_vma_asid(cmask, start, size, asid);
 	} else {
 		struct flush_tlb_range_data ftd;
 
+		ftd.mm = mm;
 		ftd.asid = asid;
 		ftd.start = start;
 		ftd.size = size;
 		ftd.stride = stride;
-		on_each_cpu_mask(cmask, __ipi_flush_tlb_range_asid, &ftd, 1);
+#ifdef CONFIG_RISCV_LAZY_TLB_FLUSH
+		if (static_branch_unlikely(&use_asid_allocator) && mm)
+			on_each_cpu_cond_mask(should_ipi_flush,
+					      __ipi_flush_tlb_range_asid,
+					      &ftd, 1, cmask);
+		else
+#endif
+			on_each_cpu_mask(cmask, __ipi_flush_tlb_range_asid,
+					 &ftd, 1);
 	}
 
 	put_cpu();
@@ -240,3 +347,160 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
 			    0, FLUSH_TLB_MAX_SIZE, PAGE_SIZE);
 	cpumask_clear(&batch->cpumask);
 }
+
+#ifdef CONFIG_RISCV_LAZY_TLB_FLUSH
+
+static inline unsigned int new_tlb_gen(struct tlb_info *info)
+{
+	unsigned int gen = info->next_gen++;
+	unsigned int i;
+
+	if (unlikely(!info->next_gen)) {
+		for (i = 0; i < MAX_LOADED_MM; i++) {
+			if (info->contexts[i].gen)
+				info->contexts[i].gen = 1;
+		}
+		info->next_gen = 1;
+		gen = info->next_gen++;
+	}
+
+	return gen;
+}
+
+void local_load_tlb_mm(struct mm_struct *mm)
+{
+	struct tlb_info *info = this_cpu_ptr(&tlbinfo);
+	struct tlb_context *contexts = info->contexts;
+	struct tlb_flush_queue *queue = NULL;
+	struct mm_struct *victim = NULL;
+	unsigned int i, len, pos = 0, min = UINT_MAX;
+	unsigned long asid, start, size, stride;
+
+	for (i = 0; i < MAX_LOADED_MM; i++) {
+		if (contexts[i].mm == mm) {
+			pos = i;
+			break;
+		}
+		if (min > contexts[i].gen) {
+			min = contexts[i].gen;
+			pos = i;
+		}
+	}
+
+	write_lock(&info->rwlock);
+
+	info->active_mm = mm;
+
+	if (contexts[pos].mm != mm) {
+		mmgrab_lazy_mm(mm);
+		victim = contexts[pos].mm;
+		contexts[pos].mm = mm;
+		contexts[pos].need_flush = false;
+
+		queue = &info->flush_queues[pos];
+		atomic_set(&queue->len, 0);
+		queue->flag = 0;
+	}
+	contexts[pos].gen = new_tlb_gen(info);
+
+	write_unlock(&info->rwlock);
+
+	if (contexts[pos].need_flush) {
+		queue = &info->flush_queues[pos];
+		asid = get_mm_asid(mm);
+		if (queue->flag & FLUSH_TLB_ALL_ASID) {
+			local_flush_tlb_all_asid(asid);
+		} else {
+			len = atomic_read(&queue->len);
+			for (i = 0; i < len; i++) {
+				start = queue->tasks[i].start;
+				size = queue->tasks[i].size;
+				stride = queue->tasks[i].stride;
+				local_flush_tlb_range_asid(mm, start, size,
+							   stride, asid);
+			}
+		}
+		contexts[pos].need_flush = false;
+		atomic_set(&queue->len, 0);
+		queue->flag = 0;
+	}
+
+	if (victim) {
+		cpumask_clear_cpu(raw_smp_processor_id(), mm_cpumask(victim));
+		local_flush_tlb_all_asid(get_mm_asid(victim));
+		mmdrop_lazy_mm(victim);
+	}
+}
+
+void local_flush_tlb_mm(struct mm_struct *mm)
+{
+	struct tlb_info *info = this_cpu_ptr(&tlbinfo);
+	struct tlb_context *contexts = info->contexts;
+	unsigned long asid = get_mm_asid(mm);
+	unsigned int i;
+
+	if (!mm || mm == info->active_mm) {
+		local_flush_tlb_all_asid(asid);
+		return;
+	}
+
+	for (i = 0; i < MAX_LOADED_MM; i++) {
+		if (contexts[i].mm != mm)
+			continue;
+
+		write_lock(&info->rwlock);
+		contexts[i].mm = NULL;
+		contexts[i].gen = 0;
+		write_unlock(&info->rwlock);
+
+		cpumask_clear_cpu(raw_smp_processor_id(), mm_cpumask(mm));
+		mmdrop_lazy_mm(mm);
+		break;
+	}
+
+	local_flush_tlb_all_asid(asid);
+}
+
+void local_flush_tlb_all_mm(void)
+{
+	struct tlb_info *info = this_cpu_ptr(&tlbinfo);
+	struct tlb_context *contexts = info->contexts;
+	struct mm_struct *mms[MAX_LOADED_MM];
+	unsigned int cpu = raw_smp_processor_id();
+	unsigned int i, num = 0;
+
+	write_lock(&info->rwlock);
+	for (i = 0; i < MAX_LOADED_MM; i++) {
+		if (!contexts[i].mm || contexts[i].mm == info->active_mm)
+			continue;
+
+		mms[num++] = contexts[i].mm;
+		contexts[i].mm = NULL;
+		contexts[i].gen = 0;
+	}
+	write_unlock(&info->rwlock);
+
+	for (i = 0; i < num; i++) {
+		cpumask_clear_cpu(cpu, mm_cpumask(mms[i]));
+		mmdrop_lazy_mm(mms[i]);
+	}
+
+	local_flush_tlb_all();
+}
+
+void __init lazy_tlb_flush_init(void)
+{
+	struct tlb_flush_queue *queue;
+	unsigned int cpu, size;
+
+	size = MAX_LOADED_MM * sizeof(struct tlb_flush_queue);
+	for_each_possible_cpu(cpu) {
+		queue = kzalloc_node(size, GFP_KERNEL, cpu_to_node(cpu));
+		if (!queue)
+			panic("Failed to alloc per cpu tlb flush queue\n");
+
+		per_cpu(tlbinfo, cpu).flush_queues = queue;
+	}
+}
+
+#endif /* CONFIG_RISCV_LAZY_TLB_FLUSH */
diff --git a/kernel/fork.c b/kernel/fork.c
index 3da0f08615a95e..b6d11acd6ac10b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -612,7 +612,8 @@ static void do_check_lazy_tlb(void *arg)
 	WARN_ON_ONCE(current->active_mm == mm);
 }
 
-static void do_shoot_lazy_tlb(void *arg)
+#ifndef arch_do_shoot_lazy_tlb
+static void arch_do_shoot_lazy_tlb(void *arg)
 {
 	struct mm_struct *mm = arg;
 
@@ -622,6 +623,7 @@ static void do_shoot_lazy_tlb(void *arg)
 		switch_mm(mm, &init_mm, current);
 	}
 }
+#endif
 
 static void cleanup_lazy_tlbs(struct mm_struct *mm)
 {
@@ -661,7 +663,7 @@ static void cleanup_lazy_tlbs(struct mm_struct *mm)
 	 * - A delayed freeing and RCU-like quiescing sequence based on mm
 	 *   switching to avoid IPIs completely.
 	 */
-	on_each_cpu_mask(mm_cpumask(mm), do_shoot_lazy_tlb, (void *)mm, 1);
+	on_each_cpu_mask(mm_cpumask(mm), arch_do_shoot_lazy_tlb, (void *)mm, 1);
 	if (IS_ENABLED(CONFIG_DEBUG_VM_SHOOT_LAZIES))
 		on_each_cpu(do_check_lazy_tlb, (void *)mm, 1);
 }
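
For review purposes, below is a minimal, single-threaded user-space sketch of the bookkeeping the patch introduces: a remote CPU that would otherwise receive an IPI records the flush range in a bounded per-CPU queue (cf. should_ipi_flush()), and the owning CPU replays that queue the next time it switches to the mm (cf. local_load_tlb_mm()). The sketch is not part of the patch; the names lazy_cpu, pending_flush() and replay_flushes() are invented for illustration, and ASIDs, locking, reference counting and the tasklet-based mmdrop path are deliberately omitted. When a per-slot queue overflows it falls back to flushing the whole address space, mirroring the FLUSH_TLB_ALL_ASID flag above.

/*
 * Standalone model of the deferred TLB flush bookkeeping.
 * Illustration only: this is not kernel code.
 */
#include <stdbool.h>
#include <stdio.h>

#define MAX_LOADED_MM		6
#define MAX_TLB_FLUSH_TASK	32
#define FLUSH_TLB_ALL_ASID	0x1

struct flush_task { unsigned long start, size, stride; };

struct lazy_cpu {
	int active_mm;				/* mm currently running on this CPU */
	int loaded_mm[MAX_LOADED_MM];		/* mms that may still have TLB entries here */
	int queue_len[MAX_LOADED_MM];		/* pending flush ranges per slot */
	unsigned int flag[MAX_LOADED_MM];
	struct flush_task tasks[MAX_LOADED_MM][MAX_TLB_FLUSH_TASK];
};

/*
 * Remote side: instead of sending an IPI, record the range for a CPU that
 * only holds the mm lazily.  Returns true when an IPI is still required.
 */
static bool pending_flush(struct lazy_cpu *cpu, int mm,
			  unsigned long start, unsigned long size,
			  unsigned long stride)
{
	int i;

	if (cpu->active_mm == mm)
		return true;			/* mm is live there: IPI needed */

	for (i = 0; i < MAX_LOADED_MM; i++) {
		if (cpu->loaded_mm[i] != mm)
			continue;
		if (cpu->queue_len[i] < MAX_TLB_FLUSH_TASK) {
			struct flush_task *t = &cpu->tasks[i][cpu->queue_len[i]++];
			t->start = start;
			t->size = size;
			t->stride = stride;
		} else {
			/* queue full: flush the whole address space later */
			cpu->flag[i] |= FLUSH_TLB_ALL_ASID;
		}
		break;
	}
	return false;				/* deferred, no IPI */
}

/* Switch-in side: replay whatever was queued while the mm was not running. */
static void replay_flushes(struct lazy_cpu *cpu, int slot)
{
	if (cpu->flag[slot] & FLUSH_TLB_ALL_ASID) {
		printf("flush whole address space of mm %d\n", cpu->loaded_mm[slot]);
	} else {
		for (int i = 0; i < cpu->queue_len[slot]; i++)
			printf("flush mm %d range 0x%lx+0x%lx\n",
			       cpu->loaded_mm[slot], cpu->tasks[slot][i].start,
			       cpu->tasks[slot][i].size);
	}
	cpu->queue_len[slot] = 0;
	cpu->flag[slot] = 0;
}

int main(void)
{
	struct lazy_cpu cpu = { .active_mm = 1, .loaded_mm = { 1, 2 } };

	/* mm 2 is only lazily loaded: the flush is queued and no IPI is sent. */
	if (!pending_flush(&cpu, 2, 0x1000, 0x2000, 0x1000))
		printf("flush for mm 2 deferred\n");

	/* Later, when this CPU switches back to mm 2, the queue is replayed. */
	cpu.active_mm = 2;
	replay_flushes(&cpu, 1);
	return 0;
}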