diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index fadec20b87a8e3..af3944cefd4efd 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -925,6 +925,18 @@ config RISCV_VECTOR_MISALIGNED
 	help
 	  Enable detecting support for vector misaligned loads and stores.
 
+config RISCV_LAZY_TLB_FLUSH
+	bool "Defer TLB Flush to context switch to avoid IPIs"
+	depends on MMU && SMP
+	default n
+	help
+	  This option avoids unnecessary TLB flush IPIs. After the memory
+	  mappings of an mm_struct are modified, instead of sending IPIs the
+	  flush information is recorded in a per-CPU buffer, and the flush is
+	  deferred until a target CPU actually loads that mm_struct again.
+
+	  If unsure what to do here, say N.
+
 choice
 	prompt "Unaligned Accesses Support"
 	default RISCV_PROBE_UNALIGNED_ACCESS
diff --git a/arch/riscv/include/asm/mmu.h b/arch/riscv/include/asm/mmu.h
index cf8e6eac77d520..913fa535b3d19b 100644
--- a/arch/riscv/include/asm/mmu.h
+++ b/arch/riscv/include/asm/mmu.h
@@ -30,6 +30,10 @@ typedef struct {
 #ifdef CONFIG_RISCV_ISA_SUPM
 	u8 pmlen;
 #endif
+#ifdef CONFIG_RISCV_LAZY_TLB_FLUSH
+	atomic_t lazy_tlb_cnt;
+	void *next;
+#endif
 } mm_context_t;
 
 /* Lock the pointer masking mode because this mm is multithreaded */
diff --git a/arch/riscv/include/asm/mmu_context.h b/arch/riscv/include/asm/mmu_context.h
index 8c4bc49a3a0f5b..bc73cc3262ae6c 100644
--- a/arch/riscv/include/asm/mmu_context.h
+++ b/arch/riscv/include/asm/mmu_context.h
@@ -16,6 +16,11 @@
 void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 	struct task_struct *task);
 
+#ifdef CONFIG_RISCV_LAZY_TLB_FLUSH
+#define arch_do_shoot_lazy_tlb arch_do_shoot_lazy_tlb
+void arch_do_shoot_lazy_tlb(void *arg);
+#endif
+
 #define activate_mm activate_mm
 static inline void activate_mm(struct mm_struct *prev,
 			       struct mm_struct *next)
diff --git a/arch/riscv/include/asm/tlbflush.h b/arch/riscv/include/asm/tlbflush.h
index eed0abc4051436..fd62b27172d4a4 100644
--- a/arch/riscv/include/asm/tlbflush.h
+++ b/arch/riscv/include/asm/tlbflush.h
@@ -15,6 +15,11 @@
 #define FLUSH_TLB_NO_ASID	((unsigned long)-1)
 
 #ifdef CONFIG_MMU
+static inline unsigned long get_mm_asid(struct mm_struct *mm)
+{
+	return mm ? cntx2asid(atomic_long_read(&mm->context.id)) : FLUSH_TLB_NO_ASID;
+}
+
 static inline void local_flush_tlb_all(void)
 {
 	__asm__ __volatile__ ("sfence.vma" : : : "memory");
@@ -66,6 +71,64 @@ void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
 void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);
 
 extern unsigned long tlb_flush_all_threshold;
+
+#ifdef CONFIG_RISCV_LAZY_TLB_FLUSH
+
+#define MAX_LOADED_MM 6
+#define MAX_TLB_FLUSH_TASK 32
+#define FLUSH_TLB_ALL_ASID 0x1
+
+struct tlb_context {
+	struct mm_struct *mm;
+	unsigned int gen;
+	bool need_flush;
+};
+
+struct tlb_flush_task {
+	unsigned long start;
+	unsigned long size;
+	unsigned long stride;
+};
+
+struct tlb_flush_queue {
+	atomic_t len;
+	unsigned int flag;
+	struct tlb_flush_task tasks[MAX_TLB_FLUSH_TASK];
+} ____cacheline_aligned_in_smp;
+
+struct tlb_info {
+	rwlock_t rwlock;
+	struct mm_struct *active_mm;
+	unsigned int next_gen;
+	struct tlb_context contexts[MAX_LOADED_MM];
+	struct tlb_flush_queue *flush_queues;
+};
+
+DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_info, tlbinfo);
+
+void local_load_tlb_mm(struct mm_struct *mm);
+void local_flush_tlb_mm(struct mm_struct *mm);
+void local_flush_tlb_all_mm(void);
+void __init lazy_tlb_flush_init(void);
+
+#else /* CONFIG_RISCV_LAZY_TLB_FLUSH */
+
+static inline void local_load_tlb_mm(struct mm_struct *mm) {}
+
+static inline void local_flush_tlb_mm(struct mm_struct *mm)
+{
+	local_flush_tlb_all_asid(get_mm_asid(mm));
+}
+
+static inline void local_flush_tlb_all_mm(void)
+{
+	local_flush_tlb_all();
+}
+
+static inline void lazy_tlb_flush_init(void) {}
+
+#endif /* CONFIG_RISCV_LAZY_TLB_FLUSH */
+
 #else /* CONFIG_MMU */
 #define local_flush_tlb_all()			do { } while (0)
 #endif /* CONFIG_MMU */
diff --git a/arch/riscv/mm/context.c b/arch/riscv/mm/context.c
index 55c20ad1f74443..b6657681948f96 100644
--- a/arch/riscv/mm/context.c
+++ b/arch/riscv/mm/context.c
@@ -194,7 +194,7 @@ static void set_mm_asid(struct mm_struct *mm, unsigned int cpu)
 		  satp_mode);
 
 	if (need_flush_tlb)
-		local_flush_tlb_all();
+		local_flush_tlb_all_mm();
 }
 
 static void set_mm_noasid(struct mm_struct *mm)
@@ -217,6 +217,7 @@ static inline void set_mm(struct mm_struct *prev,
 	 */
 	cpumask_set_cpu(cpu, mm_cpumask(next));
 	if (static_branch_unlikely(&use_asid_allocator)) {
+		local_load_tlb_mm(next);
 		set_mm_asid(next, cpu);
 	} else {
 		cpumask_clear_cpu(cpu, mm_cpumask(prev));
@@ -262,6 +263,8 @@ static int __init asids_init(void)
 
 	__set_bit(0, context_asid_map);
 
+	lazy_tlb_flush_init();
+
 	static_branch_enable(&use_asid_allocator);
 
 	pr_info("ASID allocator using %lu bits (%lu entries)\n",
@@ -273,6 +276,25 @@ static int __init asids_init(void)
 	return 0;
 }
 early_initcall(asids_init);
+
+#ifdef CONFIG_RISCV_LAZY_TLB_FLUSH
+void arch_do_shoot_lazy_tlb(void *arg)
+{
+	struct mm_struct *mm = arg;
+
+	if (current->active_mm == mm) {
+		WARN_ON_ONCE(current->mm);
+		current->active_mm = &init_mm;
+		switch_mm(mm, &init_mm, current);
+	}
+
+	if (!static_branch_unlikely(&use_asid_allocator) || !mm)
+		return;
+
+	local_flush_tlb_mm(mm);
+}
+#endif /* CONFIG_RISCV_LAZY_TLB_FLUSH */
+
 #else
 static inline void set_mm(struct mm_struct *prev, struct mm_struct *next,
 	unsigned int cpu)
diff --git a/arch/riscv/mm/tlbflush.c b/arch/riscv/mm/tlbflush.c
index 8404530ec00f93..73c0a7ef61cb1f 100644
--- a/arch/riscv/mm/tlbflush.c
+++ b/arch/riscv/mm/tlbflush.c
@@ -35,7 +35,8 @@ static inline void local_sinval_vma(unsigned long vma, unsigned long asid)
  */
 unsigned long tlb_flush_all_threshold __read_mostly = 64;
 
-static void local_flush_tlb_range_threshold_asid(unsigned long start,
+static void local_flush_tlb_range_threshold_asid(struct mm_struct *mm,
+						 unsigned long start,
 						 unsigned long size,
 						 unsigned long stride,
 						 unsigned long asid)
@@ -44,7 +45,7 @@ static void local_flush_tlb_range_threshold_asid(unsigned long start,
 	int i;
 
 	if (nr_ptes_in_range > tlb_flush_all_threshold) {
-		local_flush_tlb_all_asid(asid);
+		local_flush_tlb_mm(mm);
 		return;
 	}
 
@@ -64,32 +65,37 @@ static void local_flush_tlb_range_threshold_asid(unsigned long start,
 	}
 }
 
-static inline void local_flush_tlb_range_asid(unsigned long start,
-		unsigned long size, unsigned long stride, unsigned long asid)
+static inline void local_flush_tlb_range_asid(struct mm_struct *mm,
+					       unsigned long start,
+					       unsigned long size,
+					       unsigned long stride,
+					       unsigned long asid)
 {
-	if (size <= stride)
+	if (size <= stride) {
 		local_flush_tlb_page_asid(start, asid);
-	else if (size == FLUSH_TLB_MAX_SIZE)
-		local_flush_tlb_all_asid(asid);
-	else
-		local_flush_tlb_range_threshold_asid(start, size, stride, asid);
+	} else if (size == FLUSH_TLB_MAX_SIZE) {
+		local_flush_tlb_mm(mm);
+	} else {
+		local_flush_tlb_range_threshold_asid(mm, start, size, stride,
+						     asid);
+	}
 }
 
 /* Flush a range of kernel pages without broadcasting */
 void local_flush_tlb_kernel_range(unsigned long start, unsigned long end)
 {
-	local_flush_tlb_range_asid(start, end - start, PAGE_SIZE, FLUSH_TLB_NO_ASID);
+	local_flush_tlb_range_asid(NULL, start, end - start, PAGE_SIZE, FLUSH_TLB_NO_ASID);
 }
 
 static void __ipi_flush_tlb_all(void *info)
 {
-	local_flush_tlb_all();
+	local_flush_tlb_all_mm();
 }
 
 void flush_tlb_all(void)
 {
 	if (num_online_cpus() < 2)
-		local_flush_tlb_all();
+		local_flush_tlb_all_mm();
 	else if (riscv_use_sbi_for_rfence())
 		sbi_remote_sfence_vma_asid(NULL, 0, FLUSH_TLB_MAX_SIZE, FLUSH_TLB_NO_ASID);
 	else
@@ -97,22 +103,114 @@ void flush_tlb_all(void)
 }
 
 struct flush_tlb_range_data {
+	struct mm_struct *mm;
 	unsigned long asid;
 	unsigned long start;
 	unsigned long size;
 	unsigned long stride;
 };
 
-static void __ipi_flush_tlb_range_asid(void *info)
+#ifdef CONFIG_RISCV_LAZY_TLB_FLUSH
+
+DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_info, tlbinfo) = {
+	.rwlock = __RW_LOCK_UNLOCKED(tlbinfo.rwlock),
+	.active_mm = NULL,
+	.next_gen = 1,
+	.contexts = { { NULL, 0, false, }, },
+};
+
+static DEFINE_PER_CPU(mm_context_t *, mmdrop_victims);
+
+static void mmdrop_lazy_mms(struct tasklet_struct *tasklet)
 {
-	struct flush_tlb_range_data *d = info;
+	mm_context_t *victim = xchg_relaxed(this_cpu_ptr(&mmdrop_victims), NULL);
+	struct mm_struct *mm = NULL;
+
+	while (victim) {
+		mm = container_of(victim, struct mm_struct, context);
+		while (atomic_dec_return_relaxed(&victim->lazy_tlb_cnt) != 0)
+			mmdrop_lazy_tlb(mm);
+		victim = victim->next;
+	}
+}
+
+static DEFINE_PER_CPU(struct tasklet_struct, mmdrop_tasklets) = {
+	.count = ATOMIC_INIT(0),
+	.callback = mmdrop_lazy_mms,
+	.use_callback = true,
+};
+
+static inline void mmgrab_lazy_mm(struct mm_struct *mm)
+{
+	mmgrab_lazy_tlb(mm);
+	atomic_inc(&mm->context.lazy_tlb_cnt);
+}
+
+static inline void mmdrop_lazy_mm(struct mm_struct *mm)
+{
+	mm_context_t **head, *list, *context = &mm->context;
+
+	if (atomic_inc_return_relaxed(&context->lazy_tlb_cnt) == 1) {
+		head = this_cpu_ptr(&mmdrop_victims);
+
+		do {
+			list = *head;
+			context->next = list;
+		} while (cmpxchg_relaxed(head, list, context) != list);
-	local_flush_tlb_range_asid(d->start, d->size, d->stride, d->asid);
+		tasklet_schedule(this_cpu_ptr(&mmdrop_tasklets));
+	}
 }
 
-static inline unsigned long get_mm_asid(struct mm_struct *mm)
+static bool should_ipi_flush(int cpu, void *data)
 {
-	return mm ? cntx2asid(atomic_long_read(&mm->context.id)) : FLUSH_TLB_NO_ASID;
+	struct tlb_info *info = per_cpu_ptr(&tlbinfo, cpu);
+	struct tlb_context *contexts = info->contexts;
+	struct tlb_flush_queue *queue = NULL;
+	struct flush_tlb_range_data *ftd = data;
+	unsigned int i, index;
+	unsigned long flags;
+
+	if (info->active_mm == ftd->mm)
+		return true;
+
+	read_lock_irqsave(&info->rwlock, flags);
+
+	if (info->active_mm == ftd->mm) {
+		read_unlock_irqrestore(&info->rwlock, flags);
+		return true;
+	}
+
+	for (i = 0; i < MAX_LOADED_MM; i++) {
+		if (contexts[i].mm != ftd->mm)
+			continue;
+
+		queue = &info->flush_queues[i];
+		index = atomic_fetch_add_unless(&queue->len, 1, MAX_TLB_FLUSH_TASK);
+		if (index < MAX_TLB_FLUSH_TASK) {
+			queue->tasks[index].start = ftd->start;
+			queue->tasks[index].stride = ftd->stride;
+			queue->tasks[index].size = ftd->size;
+		} else {
+			queue->flag |= FLUSH_TLB_ALL_ASID;
+		}
+		contexts[i].need_flush = true;
+		break;
+	}
+
+	read_unlock_irqrestore(&info->rwlock, flags);
+
+	return false;
+}
+
+#endif /* CONFIG_RISCV_LAZY_TLB_FLUSH */
+
+static void __ipi_flush_tlb_range_asid(void *info)
+{
+	struct flush_tlb_range_data *d = info;
+
+	local_flush_tlb_range_asid(d->mm, d->start, d->size, d->stride, d->asid);
 }
 
 static void __flush_tlb_range(struct mm_struct *mm,
@@ -130,17 +228,26 @@ static void __flush_tlb_range(struct mm_struct *mm,
 
 	/* Check if the TLB flush needs to be sent to other CPUs. */
 	if (cpumask_any_but(cmask, cpu) >= nr_cpu_ids) {
-		local_flush_tlb_range_asid(start, size, stride, asid);
+		local_flush_tlb_range_asid(mm, start, size, stride, asid);
 	} else if (riscv_use_sbi_for_rfence()) {
 		sbi_remote_sfence_vma_asid(cmask, start, size, asid);
 	} else {
 		struct flush_tlb_range_data ftd;
 
+		ftd.mm = mm;
 		ftd.asid = asid;
 		ftd.start = start;
 		ftd.size = size;
 		ftd.stride = stride;
-		on_each_cpu_mask(cmask, __ipi_flush_tlb_range_asid, &ftd, 1);
+#ifdef CONFIG_RISCV_LAZY_TLB_FLUSH
+		if (static_branch_unlikely(&use_asid_allocator) && mm)
+			on_each_cpu_cond_mask(should_ipi_flush,
+					      __ipi_flush_tlb_range_asid,
+					      &ftd, 1, cmask);
+		else
+#endif
+			on_each_cpu_mask(cmask, __ipi_flush_tlb_range_asid,
+					 &ftd, 1);
 	}
 
 	put_cpu();
@@ -240,3 +347,160 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
 			    0, FLUSH_TLB_MAX_SIZE, PAGE_SIZE);
 	cpumask_clear(&batch->cpumask);
 }
+
+#ifdef CONFIG_RISCV_LAZY_TLB_FLUSH
+
+static inline unsigned int new_tlb_gen(struct tlb_info *info)
+{
+	unsigned int gen = info->next_gen++;
+	unsigned int i;
+
+	if (unlikely(!info->next_gen)) {
+		for (i = 0; i < MAX_LOADED_MM; i++) {
+			if (info->contexts[i].gen)
+				info->contexts[i].gen = 1;
+		}
+		info->next_gen = 1;
+		gen = info->next_gen++;
+	}
+
+	return gen;
+}
+
+void local_load_tlb_mm(struct mm_struct *mm)
+{
+	struct tlb_info *info = this_cpu_ptr(&tlbinfo);
+	struct tlb_context *contexts = info->contexts;
+	struct tlb_flush_queue *queue = NULL;
+	struct mm_struct *victim = NULL;
+	unsigned int i, len, pos = 0, min = UINT_MAX;
+	unsigned long asid, start, size, stride;
+
+	for (i = 0; i < MAX_LOADED_MM; i++) {
+		if (contexts[i].mm == mm) {
+			pos = i;
+			break;
+		}
+		if (min > contexts[i].gen) {
+			min = contexts[i].gen;
+			pos = i;
+		}
+	}
+
+	write_lock(&info->rwlock);
+
+	info->active_mm = mm;
+
+	if (contexts[pos].mm != mm) {
+		mmgrab_lazy_mm(mm);
+		victim = contexts[pos].mm;
+		contexts[pos].mm = mm;
+		contexts[pos].need_flush = false;
+
+		queue = &info->flush_queues[pos];
+		atomic_set(&queue->len, 0);
+		queue->flag = 0;
+	}
+	contexts[pos].gen = new_tlb_gen(info);
+
+	write_unlock(&info->rwlock);
+
+	if (contexts[pos].need_flush) {
+		queue = &info->flush_queues[pos];
+		asid = get_mm_asid(mm);
+		if (queue->flag & FLUSH_TLB_ALL_ASID) {
+			local_flush_tlb_all_asid(asid);
+		} else {
+			len = atomic_read(&queue->len);
+			for (i = 0; i < len; i++) {
+				start = queue->tasks[i].start;
+				size = queue->tasks[i].size;
+				stride = queue->tasks[i].stride;
+				local_flush_tlb_range_asid(mm, start, size,
+							   stride, asid);
+			}
+		}
+		contexts[pos].need_flush = false;
+		atomic_set(&queue->len, 0);
+		queue->flag = 0;
+	}
+
+	if (victim) {
+		cpumask_clear_cpu(raw_smp_processor_id(), mm_cpumask(victim));
+		local_flush_tlb_all_asid(get_mm_asid(victim));
+		mmdrop_lazy_mm(victim);
+	}
+}
+
+void local_flush_tlb_mm(struct mm_struct *mm)
+{
+	struct tlb_info *info = this_cpu_ptr(&tlbinfo);
+	struct tlb_context *contexts = info->contexts;
+	unsigned long asid = get_mm_asid(mm);
+	unsigned int i;
+
+	if (!mm || mm == info->active_mm) {
+		local_flush_tlb_all_asid(asid);
+		return;
+	}
+
+	for (i = 0; i < MAX_LOADED_MM; i++) {
+		if (contexts[i].mm != mm)
+			continue;
+
+		write_lock(&info->rwlock);
+		contexts[i].mm = NULL;
+		contexts[i].gen = 0;
+		write_unlock(&info->rwlock);
+
+		cpumask_clear_cpu(raw_smp_processor_id(), mm_cpumask(mm));
+		mmdrop_lazy_mm(mm);
+		break;
+	}
+
+	local_flush_tlb_all_asid(asid);
+}
+
+void local_flush_tlb_all_mm(void)
+{
+	struct tlb_info *info = this_cpu_ptr(&tlbinfo);
+	struct tlb_context *contexts = info->contexts;
+	struct mm_struct *mms[MAX_LOADED_MM];
+	unsigned int cpu = raw_smp_processor_id();
+	unsigned int i, num = 0;
+
+	write_lock(&info->rwlock);
+	for (i = 0; i < MAX_LOADED_MM; i++) {
+		if (!contexts[i].mm || contexts[i].mm == info->active_mm)
+			continue;
+
+		mms[num++] = contexts[i].mm;
+		contexts[i].mm = NULL;
+		contexts[i].gen = 0;
+	}
+	write_unlock(&info->rwlock);
+
+	for (i = 0; i < num; i++) {
+		cpumask_clear_cpu(cpu, mm_cpumask(mms[i]));
+		mmdrop_lazy_mm(mms[i]);
+	}
+
+	local_flush_tlb_all();
+}
+
+void __init lazy_tlb_flush_init(void)
+{
+	struct tlb_flush_queue *queue;
+	unsigned int cpu, size;
+
+	size = MAX_LOADED_MM * sizeof(struct tlb_flush_queue);
+	for_each_possible_cpu(cpu) {
+		queue = kzalloc_node(size, GFP_KERNEL, cpu_to_node(cpu));
+		if (!queue)
+			panic("Failed to alloc per cpu tlb flush queue\n");
+
+		per_cpu(tlbinfo, cpu).flush_queues = queue;
+	}
+}
+
+#endif /* CONFIG_RISCV_LAZY_TLB_FLUSH */
diff --git a/kernel/fork.c b/kernel/fork.c
index 3da0f08615a95e..b6d11acd6ac10b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -612,7 +612,8 @@ static void do_check_lazy_tlb(void *arg)
 	WARN_ON_ONCE(current->active_mm == mm);
 }
 
-static void do_shoot_lazy_tlb(void *arg)
+#ifndef arch_do_shoot_lazy_tlb
+static void arch_do_shoot_lazy_tlb(void *arg)
 {
 	struct mm_struct *mm = arg;
 
@@ -622,6 +623,7 @@ static void do_shoot_lazy_tlb(void *arg)
 		switch_mm(mm, &init_mm, current);
 	}
 }
+#endif
 
 static void cleanup_lazy_tlbs(struct mm_struct *mm)
 {
@@ -661,7 +663,7 @@ static void cleanup_lazy_tlbs(struct mm_struct *mm)
 	 * - A delayed freeing and RCU-like quiescing sequence based on mm
 	 *   switching to avoid IPIs completely.
 	 */
-	on_each_cpu_mask(mm_cpumask(mm), do_shoot_lazy_tlb, (void *)mm, 1);
+	on_each_cpu_mask(mm_cpumask(mm), arch_do_shoot_lazy_tlb, (void *)mm, 1);
 	if (IS_ENABLED(CONFIG_DEBUG_VM_SHOOT_LAZIES))
 		on_each_cpu(do_check_lazy_tlb, (void *)mm, 1);
 }
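
For review purposes, below is a minimal, single-threaded user-space sketch of the bookkeeping the patch introduces: a remote CPU that would otherwise receive an IPI records the flush range in a bounded per-CPU queue (cf. should_ipi_flush()), and the owning CPU replays that queue the next time it switches to the mm (cf. local_load_tlb_mm()). The sketch is not part of the patch; the names lazy_cpu, pending_flush() and replay_flushes() are invented for illustration, and ASIDs, locking, reference counting and the tasklet-based mmdrop path are deliberately omitted. When a per-slot queue overflows it falls back to flushing the whole address space, mirroring the FLUSH_TLB_ALL_ASID flag above.

/*
 * Standalone model of the deferred TLB flush bookkeeping.
 * Illustration only: this is not kernel code.
 */
#include <stdbool.h>
#include <stdio.h>

#define MAX_LOADED_MM		6
#define MAX_TLB_FLUSH_TASK	32
#define FLUSH_TLB_ALL_ASID	0x1

struct flush_task { unsigned long start, size, stride; };

struct lazy_cpu {
	int active_mm;				/* mm currently running on this CPU */
	int loaded_mm[MAX_LOADED_MM];		/* mms that may still have TLB entries here */
	int queue_len[MAX_LOADED_MM];		/* pending flush ranges per slot */
	unsigned int flag[MAX_LOADED_MM];
	struct flush_task tasks[MAX_LOADED_MM][MAX_TLB_FLUSH_TASK];
};

/*
 * Remote side: instead of sending an IPI, record the range for a CPU that
 * only holds the mm lazily.  Returns true when an IPI is still required.
 */
static bool pending_flush(struct lazy_cpu *cpu, int mm,
			  unsigned long start, unsigned long size,
			  unsigned long stride)
{
	int i;

	if (cpu->active_mm == mm)
		return true;			/* mm is live there: IPI needed */

	for (i = 0; i < MAX_LOADED_MM; i++) {
		if (cpu->loaded_mm[i] != mm)
			continue;
		if (cpu->queue_len[i] < MAX_TLB_FLUSH_TASK) {
			struct flush_task *t = &cpu->tasks[i][cpu->queue_len[i]++];
			t->start = start;
			t->size = size;
			t->stride = stride;
		} else {
			/* queue full: flush the whole address space later */
			cpu->flag[i] |= FLUSH_TLB_ALL_ASID;
		}
		break;
	}
	return false;				/* deferred, no IPI */
}

/* Switch-in side: replay whatever was queued while the mm was not running. */
static void replay_flushes(struct lazy_cpu *cpu, int slot)
{
	if (cpu->flag[slot] & FLUSH_TLB_ALL_ASID) {
		printf("flush whole address space of mm %d\n", cpu->loaded_mm[slot]);
	} else {
		for (int i = 0; i < cpu->queue_len[slot]; i++)
			printf("flush mm %d range 0x%lx+0x%lx\n",
			       cpu->loaded_mm[slot], cpu->tasks[slot][i].start,
			       cpu->tasks[slot][i].size);
	}
	cpu->queue_len[slot] = 0;
	cpu->flag[slot] = 0;
}

int main(void)
{
	struct lazy_cpu cpu = { .active_mm = 1, .loaded_mm = { 1, 2 } };

	/* mm 2 is only lazily loaded: the flush is queued and no IPI is sent. */
	if (!pending_flush(&cpu, 2, 0x1000, 0x2000, 0x1000))
		printf("flush for mm 2 deferred\n");

	/* Later, when this CPU switches back to mm 2, the queue is replayed. */
	cpu.active_mm = 2;
	replay_flushes(&cpu, 1);
	return 0;
}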