@@ -3989,6 +3989,194 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
 	return VM_FAULT_SIGBUS;
 }
 
+static struct folio *__alloc_swap_folio(struct vm_fault *vmf)
+{
+	struct vm_area_struct *vma = vmf->vma;
+	struct folio *folio;
+	swp_entry_t entry;
+
+	folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma,
+				vmf->address, false);
+	if (!folio)
+		return NULL;
+
+	entry = pte_to_swp_entry(vmf->orig_pte);
+	if (mem_cgroup_swapin_charge_folio(folio, vma->vm_mm,
+					   GFP_KERNEL, entry)) {
+		folio_put(folio);
+		return NULL;
+	}
+
+	return folio;
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static inline int non_swapcache_batch(swp_entry_t entry, int max_nr)
+{
+	struct swap_info_struct *si = swp_swap_info(entry);
+	pgoff_t offset = swp_offset(entry);
+	int i;
+
+	/*
+	 * Called while allocating a large folio and doing swap_read_folio(),
+	 * i.e. when the faulting pte has no swapcache. We must ensure all
+	 * PTEs in the range have no swapcache either; otherwise we might
+	 * read from the swap device while the content is in the swapcache.
+	 */
+	for (i = 0; i < max_nr; i++) {
+		if ((si->swap_map[offset + i] & SWAP_HAS_CACHE))
+			return i;
+	}
+
+	return i;
+}
+
+/*
+ * Check if the PTEs within a range are contiguous swap entries
+ * and have consistent swapcache, zeromap.
+ */
+static bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages)
+{
+	unsigned long addr;
+	swp_entry_t entry;
+	int idx;
+	pte_t pte;
+
+	addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE);
+	idx = (vmf->address - addr) / PAGE_SIZE;
+	pte = ptep_get(ptep);
+
+	if (!pte_same(pte, pte_move_swp_offset(vmf->orig_pte, -idx)))
+		return false;
+	entry = pte_to_swp_entry(pte);
+	if (swap_pte_batch(ptep, nr_pages, pte) != nr_pages)
+		return false;
+
+	/*
+	 * swap_read_folio() can't handle a large folio backed by a mix of
+	 * different backends, and those are likely corner cases anyway.
+	 * Similar checks might be added once zswap supports large folios.
+	 */
+	if (unlikely(swap_zeromap_batch(entry, nr_pages, NULL) != nr_pages))
+		return false;
+	if (unlikely(non_swapcache_batch(entry, nr_pages) != nr_pages))
+		return false;
+
+	return true;
+}
+
+static inline unsigned long thp_swap_suitable_orders(pgoff_t swp_offset,
+						     unsigned long addr,
+						     unsigned long orders)
+{
+	int order, nr;
+
+	order = highest_order(orders);
+
+	/*
+	 * To swap in a THP with nr pages, we require that its first swap_offset
+	 * is aligned with that number, as it was when the THP was swapped out.
+	 * This helps filter out most invalid entries.
+	 */
+	while (orders) {
+		nr = 1 << order;
+		if ((addr >> PAGE_SHIFT) % nr == swp_offset % nr)
+			break;
+		order = next_order(&orders, order);
+	}
+
+	return orders;
+}
+
+static struct folio *alloc_swap_folio(struct vm_fault *vmf)
+{
+	struct vm_area_struct *vma = vmf->vma;
+	unsigned long orders;
+	struct folio *folio;
+	unsigned long addr;
+	swp_entry_t entry;
+	spinlock_t *ptl;
+	pte_t *pte;
+	gfp_t gfp;
+	int order;
+
+	/*
+	 * If uffd is active for the vma we need per-page fault fidelity to
+	 * maintain the uffd semantics.
+	 */
+	if (unlikely(userfaultfd_armed(vma)))
+		goto fallback;
+
+	/*
+	 * A large swapped-out folio could be partially or fully in zswap. We
+	 * lack handling for such cases, so fall back to swapping in an
+	 * order-0 folio.
+	 */
+	if (!zswap_never_enabled())
+		goto fallback;
+
+	entry = pte_to_swp_entry(vmf->orig_pte);
+	/*
+	 * Get a list of all the (large) orders below PMD_ORDER that are enabled
+	 * and suitable for swapping in a THP.
+	 */
+	orders = thp_vma_allowable_orders(vma, vma->vm_flags,
+			TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1);
+	orders = thp_vma_suitable_orders(vma, vmf->address, orders);
+	orders = thp_swap_suitable_orders(swp_offset(entry),
+					  vmf->address, orders);
+
+	if (!orders)
+		goto fallback;
+
+	pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
+				  vmf->address & PMD_MASK, &ptl);
+	if (unlikely(!pte))
+		goto fallback;
+
+	/*
+	 * For do_swap_page, find the highest order for which the aligned range
+	 * consists entirely of swap entries with contiguous swap offsets.
+	 */
+	order = highest_order(orders);
+	while (orders) {
+		addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
+		if (can_swapin_thp(vmf, pte + pte_index(addr), 1 << order))
+			break;
+		order = next_order(&orders, order);
+	}
+
+	pte_unmap_unlock(pte, ptl);
+
+	/* Try allocating the highest of the remaining orders. */
+	gfp = vma_thp_gfp_mask(vma);
+	while (orders) {
+		addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
+		folio = vma_alloc_folio(gfp, order, vma, addr, true);
+		if (folio) {
+			if (!mem_cgroup_swapin_charge_folio(folio, vma->vm_mm,
+							    gfp, entry))
+				return folio;
+			folio_put(folio);
+		}
+		order = next_order(&orders, order);
+	}
+
+fallback:
+	return __alloc_swap_folio(vmf);
+}
+#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
+static inline bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages)
+{
+	return false;
+}
+
+static struct folio *alloc_swap_folio(struct vm_fault *vmf)
+{
+	return __alloc_swap_folio(vmf);
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
 /*
  * We enter with non-exclusive mmap_lock (to exclude vma changes,
  * but allow concurrent faults), and pte mapped but not yet locked.
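A quick aside on the alignment rule used by thp_swap_suitable_orders() above: an order survives only if the faulting page index and the swap offset are congruent modulo the folio's page count, so the folio can be mapped back at the same alignment it had when it was swapped out. Below is a minimal userspace sketch of that filtering loop, assuming a GCC/Clang toolchain for __builtin_clzl(); sketch_highest_order(), sketch_next_order() and suitable_orders() are local stand-ins invented for the example, not kernel APIs.

#include <stdio.h>

/* Local stand-ins for the kernel's highest_order()/next_order(). */
static int sketch_highest_order(unsigned long orders)
{
	return (int)(sizeof(orders) * 8 - 1) - __builtin_clzl(orders);
}

static int sketch_next_order(unsigned long *orders, int order)
{
	*orders &= ~(1UL << order);
	return *orders ? sketch_highest_order(*orders) : 0;
}

/*
 * Keep only the orders whose folio-sized alignment matches between the
 * faulting page index and the swap offset, mirroring the loop in
 * thp_swap_suitable_orders().
 */
static unsigned long suitable_orders(unsigned long pgidx, unsigned long offset,
				     unsigned long orders)
{
	int order = sketch_highest_order(orders);

	while (orders) {
		unsigned long nr = 1UL << order;

		if (pgidx % nr == offset % nr)
			break;	/* this order and every smaller one are fine */
		order = sketch_next_order(&orders, order);
	}
	return orders;
}

int main(void)
{
	/* candidate orders 2..4, i.e. 16 KiB..64 KiB folios with 4 KiB pages */
	unsigned long orders = (1UL << 4) | (1UL << 3) | (1UL << 2);

	/* page index and swap offset congruent mod 16: all orders survive */
	printf("%#lx\n", suitable_orders(0x1234, 0x5634, orders));	/* 0x1c */

	/* offsets differ even mod 4: no large order survives */
	printf("%#lx\n", suitable_orders(0x1234, 0x5635, orders));	/* 0 */
	return 0;
}

Once the highest matching order is found the loop can stop: congruence modulo 2^k implies congruence modulo every smaller power of two, so all lower orders still set in the mask match as well.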
@@ -4077,34 +4265,34 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 	if (!folio) {
 		if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
 		    __swap_count(entry) == 1) {
-			/*
-			 * Prevent parallel swapin from proceeding with
-			 * the cache flag. Otherwise, another thread may
-			 * finish swapin first, free the entry, and swapout
-			 * reusing the same entry. It's undetectable as
-			 * pte_same() returns true due to entry reuse.
-			 */
-			if (swapcache_prepare(entry, 1)) {
-				/* Relax a bit to prevent rapid repeated page faults */
-				schedule_timeout_uninterruptible(1);
-				goto out;
-			}
-			need_clear_cache = true;
-
 			/* skip swapcache */
-			folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0,
-						vma, vmf->address, false);
+			folio = alloc_swap_folio(vmf);
 			if (folio) {
 				__folio_set_locked(folio);
 				__folio_set_swapbacked(folio);
 
-				if (mem_cgroup_swapin_charge_folio(folio,
-							vma->vm_mm, GFP_KERNEL,
-							entry)) {
-					ret = VM_FAULT_OOM;
+				nr_pages = folio_nr_pages(folio);
+				if (folio_test_large(folio))
+					entry.val = ALIGN_DOWN(entry.val, nr_pages);
+				/*
+				 * Prevent parallel swapin from proceeding with
+				 * the cache flag. Otherwise, another thread
+				 * may finish swapin first, free the entry, and
+				 * swapout reusing the same entry. It's
+				 * undetectable as pte_same() returns true due
+				 * to entry reuse.
+				 */
+				if (swapcache_prepare(entry, nr_pages)) {
+					/*
+					 * Relax a bit to prevent rapid
+					 * repeated page faults.
+					 */
+					schedule_timeout_uninterruptible(1);
 					goto out_page;
 				}
-				mem_cgroup_swapin_uncharge_swap(entry, 1);
+				need_clear_cache = true;
+
+				mem_cgroup_swapin_uncharge_swap(entry, nr_pages);
 
 				shadow = get_shadow_from_swap_cache(entry);
 				if (shadow)
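As a worked illustration of the entry rounding above (my own sketch, not part of the patch): since thp_swap_suitable_orders() already guaranteed that the faulting page index and swap offset agree modulo nr_pages, ALIGN_DOWN() moves both the virtual address and the swap slot to the start of the same large folio. The sketch models the swap entry as a bare offset with a local copy of a power-of-two ALIGN_DOWN(), glossing over the type bits packed into swp_entry_t.

#include <stdio.h>

#define PAGE_SIZE	4096UL
/* local stand-in for the kernel macro, valid for power-of-two alignment */
#define ALIGN_DOWN(x, a)	((x) & ~((unsigned long)(a) - 1))

int main(void)
{
	unsigned long nr_pages = 4;			/* order-2 folio */
	unsigned long address = 0x7f1234567000UL;	/* faulting address */
	unsigned long offset = 0x5637;			/* faulting swap slot */

	/* page index 0x7f1234567 and offset 0x5637 are both 3 mod 4 ... */
	unsigned long folio_addr = ALIGN_DOWN(address, nr_pages * PAGE_SIZE);
	unsigned long folio_slot = ALIGN_DOWN(offset, nr_pages);

	/* ... so rounding down lands on the folio's first page and first slot */
	printf("addr %#lx -> %#lx\n", address, folio_addr);	/* 0x7f1234564000 */
	printf("slot %#lx -> %#lx\n", offset, folio_slot);	/* 0x5634 */
	return 0;
}

Reserving all nr_pages slots with swapcache_prepare() against this rounded entry is what pins the folio's whole swap range against parallel swap-in, extending the single-slot reservation the order-0 path used before.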
@@ -4210,6 +4398,24 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 		goto out_nomap;
 	}
 
+	/* allocated large folios for SWP_SYNCHRONOUS_IO */
+	if (folio_test_large(folio) && !folio_test_swapcache(folio)) {
+		unsigned long nr = folio_nr_pages(folio);
+		unsigned long folio_start = ALIGN_DOWN(vmf->address, nr * PAGE_SIZE);
+		unsigned long idx = (vmf->address - folio_start) / PAGE_SIZE;
+		pte_t *folio_ptep = vmf->pte - idx;
+		pte_t folio_pte = ptep_get(folio_ptep);
+
+		if (!pte_same(folio_pte, pte_move_swp_offset(vmf->orig_pte, -idx)) ||
+		    swap_pte_batch(folio_ptep, nr, folio_pte) != nr)
+			goto out_nomap;
+
+		page_idx = idx;
+		address = folio_start;
+		ptep = folio_ptep;
+		goto check_folio;
+	}
+
 	nr_pages = 1;
 	page_idx = 0;
 	address = vmf->address;
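To make the index arithmetic in the recheck above concrete, here is a small userspace sketch; the page_table[] array and the pte_t typedef are stand-ins invented for the example, not kernel definitions. From the faulting address it derives the folio's first virtual address, the page's index within the folio, and the PTE slot of the folio's first page, mirroring what folio_start, idx and folio_ptep compute before the swap_pte_batch() recheck.

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PAGE_SHIFT	12
#define PTRS_PER_PTE	512
#define ALIGN_DOWN(x, a)	((x) & ~((unsigned long)(a) - 1))

typedef unsigned long pte_t;		/* stand-in, not the kernel type */

int main(void)
{
	pte_t page_table[PTRS_PER_PTE] = { 0 };		/* one PTE page */
	unsigned long nr = 8;				/* order-3 folio */
	unsigned long address = 0x7f1234566000UL;	/* faulting address */
	pte_t *pte = &page_table[(address >> PAGE_SHIFT) % PTRS_PER_PTE];

	unsigned long folio_start = ALIGN_DOWN(address, nr * PAGE_SIZE);
	unsigned long idx = (address - folio_start) / PAGE_SIZE;
	pte_t *folio_ptep = pte - idx;	/* PTE slot of the folio's first page */

	printf("folio_start %#lx, idx %lu, first slot %td\n",
	       folio_start, idx, folio_ptep - page_table);
	/* folio_start 0x7f1234560000, idx 6, first slot 352 */
	return 0;
}

If the batch recheck fails, i.e. the PTEs are no longer a contiguous run of matching swap entries, the fault path above bails out via out_nomap rather than mapping a partially valid folio.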
@@ -4341,11 +4547,12 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 		folio_add_lru_vma(folio, vma);
 	} else if (!folio_test_anon(folio)) {
 		/*
-		 * We currently only expect small !anon folios, which are either
-		 * fully exclusive or fully shared. If we ever get large folios
-		 * here, we have to be careful.
+		 * We currently only expect small !anon folios, which are either
+		 * fully exclusive or fully shared, or newly allocated large
+		 * folios, which are fully exclusive. If we ever get large
+		 * folios within the swapcache here, we have to be careful.
 		 */
-		VM_WARN_ON_ONCE(folio_test_large(folio));
+		VM_WARN_ON_ONCE(folio_test_large(folio) && folio_test_swapcache(folio));
 		VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
 		folio_add_new_anon_rmap(folio, vma, address, rmap_flags);
 	} else {
@@ -4388,7 +4595,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 out:
 	/* Clear the swap cache pin for direct swapin after PTL unlock */
 	if (need_clear_cache)
-		swapcache_clear(si, entry, 1);
+		swapcache_clear(si, entry, nr_pages);
 	if (si)
 		put_swap_device(si);
 	return ret;
@@ -4404,7 +4611,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 		folio_put(swapcache);
 	}
 	if (need_clear_cache)
-		swapcache_clear(si, entry, 1);
+		swapcache_clear(si, entry, nr_pages);
 	if (si)
 		put_swap_device(si);
 	return ret;