@@ -1028,7 +1028,14 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
 	return ret;
 }
 
-static long btrfs_scan_inode(struct btrfs_inode *inode, long *scanned, long nr_to_scan)
+struct btrfs_em_shrink_ctx {
+	long nr_to_scan;
+	long scanned;
+	u64 last_ino;
+	u64 last_root;
+};
+
+static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_ctx *ctx)
 {
 	const u64 cur_fs_gen = btrfs_get_fs_generation(inode->root->fs_info);
 	struct extent_map_tree *tree = &inode->extent_tree;
@@ -1057,14 +1064,25 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, long *scanned, long nr_to_scan)
 	if (!down_read_trylock(&inode->i_mmap_lock))
 		return 0;
 
-	write_lock(&tree->lock);
+	/*
+	 * We want to be fast because we can be called from any path trying to
+	 * allocate memory, so if the lock is busy we don't want to spend time
+	 * waiting for it - either some task is about to do IO for the inode or
+	 * we may have another task shrinking extent maps, here in this code, so
+	 * skip this inode.
+	 */
+	if (!write_trylock(&tree->lock)) {
+		up_read(&inode->i_mmap_lock);
+		return 0;
+	}
+
 	node = rb_first_cached(&tree->map);
 	while (node) {
 		struct extent_map *em;
 
 		em = rb_entry(node, struct extent_map, rb_node);
 		node = rb_next(node);
-		(*scanned)++;
+		ctx->scanned++;
 
 		if (em->flags & EXTENT_FLAG_PINNED)
 			goto next;
@@ -1085,42 +1103,49 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, long *scanned, long nr_to_scan)
 		free_extent_map(em);
 		nr_dropped++;
 next:
-		if (*scanned >= nr_to_scan)
+		if (ctx->scanned >= ctx->nr_to_scan)
 			break;
 
 		/*
-		 * Restart if we had to reschedule, and any extent maps that were
-		 * pinned before may have become unpinned after we released the
-		 * lock and took it again.
+		 * Stop if we need to reschedule or there's contention on the
+		 * lock. This is to avoid slowing other tasks trying to take the
+		 * lock and because the shrinker might be called during a memory
+		 * allocation path and we want to avoid taking a very long time
+		 * and slowing down all sorts of tasks.
 		 */
-		if (cond_resched_rwlock_write(&tree->lock))
-			node = rb_first_cached(&tree->map);
+		if (need_resched() || rwlock_needbreak(&tree->lock))
+			break;
 	}
 	write_unlock(&tree->lock);
 	up_read(&inode->i_mmap_lock);
 
 	return nr_dropped;
 }
 
-static long btrfs_scan_root(struct btrfs_root *root, long *scanned, long nr_to_scan)
+static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx *ctx)
 {
-	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct btrfs_inode *inode;
 	long nr_dropped = 0;
-	u64 min_ino = fs_info->extent_map_shrinker_last_ino + 1;
+	u64 min_ino = ctx->last_ino + 1;
 
 	inode = btrfs_find_first_inode(root, min_ino);
 	while (inode) {
-		nr_dropped += btrfs_scan_inode(inode, scanned, nr_to_scan);
+		nr_dropped += btrfs_scan_inode(inode, ctx);
 
 		min_ino = btrfs_ino(inode) + 1;
-		fs_info->extent_map_shrinker_last_ino = btrfs_ino(inode);
-		iput(&inode->vfs_inode);
+		ctx->last_ino = btrfs_ino(inode);
+		btrfs_add_delayed_iput(inode);
 
-		if (*scanned >= nr_to_scan)
+		if (ctx->scanned >= ctx->nr_to_scan)
+			break;
+
+		/*
+		 * We may be called from memory allocation paths, so we don't
+		 * want to take too much time and slow down tasks.
+		 */
+		if (need_resched())
 			break;
 
-		cond_resched();
 		inode = btrfs_find_first_inode(root, min_ino);
 	}
 
@@ -1132,34 +1157,56 @@ static long btrfs_scan_root(struct btrfs_root *root, long *scanned, long nr_to_scan)
 		 * inode if there is one or we will find out this was the last
 		 * one and move to the next root.
 		 */
-		fs_info->extent_map_shrinker_last_root = btrfs_root_id(root);
+		ctx->last_root = btrfs_root_id(root);
 	} else {
 		/*
 		 * No more inodes in this root, set extent_map_shrinker_last_ino to 0 so
 		 * that when processing the next root we start from its first inode.
 		 */
-		fs_info->extent_map_shrinker_last_ino = 0;
-		fs_info->extent_map_shrinker_last_root = btrfs_root_id(root) + 1;
+		ctx->last_ino = 0;
+		ctx->last_root = btrfs_root_id(root) + 1;
 	}
 
 	return nr_dropped;
 }
 
 long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
 {
-	const u64 start_root_id = fs_info->extent_map_shrinker_last_root;
-	u64 next_root_id = start_root_id;
+	struct btrfs_em_shrink_ctx ctx;
+	u64 start_root_id;
+	u64 next_root_id;
 	bool cycled = false;
 	long nr_dropped = 0;
-	long scanned = 0;
+
+	ctx.scanned = 0;
+	ctx.nr_to_scan = nr_to_scan;
+
+	/*
+	 * In case we have multiple tasks running this shrinker, make the next
+	 * one start from the next inode in case it starts before we finish.
+	 */
+	spin_lock(&fs_info->extent_map_shrinker_lock);
+	ctx.last_ino = fs_info->extent_map_shrinker_last_ino;
+	fs_info->extent_map_shrinker_last_ino++;
+	ctx.last_root = fs_info->extent_map_shrinker_last_root;
+	spin_unlock(&fs_info->extent_map_shrinker_lock);
+
+	start_root_id = ctx.last_root;
+	next_root_id = ctx.last_root;
 
 	if (trace_btrfs_extent_map_shrinker_scan_enter_enabled()) {
 		s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
 
-		trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr_to_scan, nr);
+		trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr_to_scan,
+							   nr, ctx.last_root,
+							   ctx.last_ino);
 	}
 
-	while (scanned < nr_to_scan) {
+	/*
+	 * We may be called from memory allocation paths, so we don't want to
+	 * take too much time and slow down tasks, so stop if we need to reschedule.
+	 */
+	while (ctx.scanned < ctx.nr_to_scan && !need_resched()) {
 		struct btrfs_root *root;
 		unsigned long count;
 
@@ -1171,8 +1218,8 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
 			spin_unlock(&fs_info->fs_roots_radix_lock);
 			if (start_root_id > 0 && !cycled) {
 				next_root_id = 0;
-				fs_info->extent_map_shrinker_last_root = 0;
-				fs_info->extent_map_shrinker_last_ino = 0;
+				ctx.last_root = 0;
+				ctx.last_ino = 0;
 				cycled = true;
 				continue;
 			}
@@ -1186,15 +1233,33 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
 			continue;
 
 		if (is_fstree(btrfs_root_id(root)))
-			nr_dropped += btrfs_scan_root(root, &scanned, nr_to_scan);
+			nr_dropped += btrfs_scan_root(root, &ctx);
 
 		btrfs_put_root(root);
 	}
 
+	/*
+	 * In case of multiple tasks running this extent map shrinking code this
+	 * isn't perfect but it's simple and silences things like KCSAN. It's
+	 * not possible to know which task made more progress because we can
+	 * cycle back to the first root and first inode if it's not the first
+	 * time the shrinker ran, see the above logic. Also a task that started
+	 * later may finish earlier than another task and have made less progress. So
+	 * make this simple and update to the progress of the last task that
+	 * finished, with the occasional possibility of having two consecutive
+	 * runs of the shrinker process the same inodes.
+	 */
+	spin_lock(&fs_info->extent_map_shrinker_lock);
+	fs_info->extent_map_shrinker_last_ino = ctx.last_ino;
+	fs_info->extent_map_shrinker_last_root = ctx.last_root;
+	spin_unlock(&fs_info->extent_map_shrinker_lock);
+
 	if (trace_btrfs_extent_map_shrinker_scan_exit_enabled()) {
 		s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
 
-		trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped, nr);
+		trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped,
+							  nr, ctx.last_root,
+							  ctx.last_ino);
 	}
 
 	return nr_dropped;
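To illustrate the cursor handling introduced above, here is a minimal userspace C sketch (not kernel code and not part of this patch) of the same pattern: each shrinker pass snapshots the shared last_root/last_ino cursor into a private context under a lock, scans with only its private copy, and publishes its final position when it finishes, accepting that two concurrent passes may occasionally rescan the same inodes. The shared_cursor, shrink_ctx and scan_pass names are invented for this sketch, and a mutex stands in for the kernel spinlock.

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

/* Shared cursor guarded by a lock, standing in for the
 * fs_info->extent_map_shrinker_last_root/last_ino fields. */
struct shared_cursor {
	pthread_mutex_t lock;
	uint64_t last_root;
	uint64_t last_ino;
};

/* Private per-pass state, analogous to struct btrfs_em_shrink_ctx. */
struct shrink_ctx {
	long nr_to_scan;
	long scanned;
	uint64_t last_root;
	uint64_t last_ino;
};

static struct shared_cursor cursor = {
	.lock = PTHREAD_MUTEX_INITIALIZER,
};

/* One scan pass: snapshot the shared cursor, work on the private copy,
 * then publish whatever progress this pass made. */
static long scan_pass(long nr_to_scan)
{
	struct shrink_ctx ctx = { .nr_to_scan = nr_to_scan };
	long dropped = 0;

	pthread_mutex_lock(&cursor.lock);
	ctx.last_root = cursor.last_root;
	ctx.last_ino = cursor.last_ino;
	/* Nudge any concurrent pass past the inode we are about to scan. */
	cursor.last_ino++;
	pthread_mutex_unlock(&cursor.lock);

	while (ctx.scanned < ctx.nr_to_scan) {
		/* Pretend each "inode" yields one reclaimable extent map. */
		ctx.scanned++;
		dropped++;
		ctx.last_ino++;
	}

	/* Last writer wins: two passes may occasionally rescan the same
	 * inodes, which is harmless for a best-effort shrinker. */
	pthread_mutex_lock(&cursor.lock);
	cursor.last_root = ctx.last_root;
	cursor.last_ino = ctx.last_ino;
	pthread_mutex_unlock(&cursor.lock);

	return dropped;
}

int main(void)
{
	printf("dropped %ld, cursor now at ino %llu\n", scan_pass(32),
	       (unsigned long long)cursor.last_ino);
	return 0;
}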