Skip to content

Commit 20bc9f7

Browse files
dpbelanger authored and alexdeucher committed
drm/amdkfd: Fixed kfd_process cleanup on module exit.
Handle case when module is unloaded (kfd_exit) before a process space (mm_struct) is released. v2: Fixed potential race conditions by removing all kfd_process from the process table first, then working on releasing the resources. v3: Fixed loop element access / synchronization. Fixed extra empty lines. Signed-off-by: David Belanger <[email protected]> Reviewed-by: Felix Kuehling <[email protected]> Signed-off-by: Alex Deucher <[email protected]>
1 parent 7304ee9 commit 20bc9f7

File tree

3 files changed

+62
-7
lines changed

3 files changed

+62
-7
lines changed

drivers/gpu/drm/amd/amdkfd/kfd_module.c

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -77,6 +77,7 @@ static int kfd_init(void)
7777

7878
static void kfd_exit(void)
7979
{
80+
kfd_cleanup_processes();
8081
kfd_debugfs_fini();
8182
kfd_process_destroy_wq();
8283
kfd_procfs_shutdown();

drivers/gpu/drm/amd/amdkfd/kfd_priv.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -928,6 +928,7 @@ bool kfd_dev_is_large_bar(struct kfd_dev *dev);
928928

929929
int kfd_process_create_wq(void);
930930
void kfd_process_destroy_wq(void);
931+
void kfd_cleanup_processes(void);
931932
struct kfd_process *kfd_create_process(struct file *filep);
932933
struct kfd_process *kfd_get_process(const struct task_struct *task);
933934
struct kfd_process *kfd_lookup_process_by_pasid(u32 pasid);

drivers/gpu/drm/amd/amdkfd/kfd_process.c

Lines changed: 60 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -1167,6 +1167,17 @@ static void kfd_process_free_notifier(struct mmu_notifier *mn)
11671167
kfd_unref_process(container_of(mn, struct kfd_process, mmu_notifier));
11681168
}
11691169

1170+
/*
 * kfd_process_notifier_release_internal - common teardown for one kfd_process.
 * @p: process being torn down.
 *
 * Shared by kfd_process_notifier_release() and kfd_cleanup_processes(); both
 * callers remove @p from kfd_processes_table (hash_del_rcu) before calling
 * this helper.
 *
 * Steps, in order:
 *   1. cancel the eviction and restore delayed work items and wait for them,
 *   2. clear p->mm so other users can tell the MM is no longer valid,
 *   3. drop the mmu_notifier via mmu_notifier_put().
 *
 * The registered free_notifier callback (kfd_process_free_notifier) calls
 * kfd_unref_process() on the containing kfd_process.
 * NOTE(review): presumably that callback runs asynchronously after the put,
 * so @p should not be touched once this function returns - confirm.
 */
static void kfd_process_notifier_release_internal(struct kfd_process *p)
1171+
{
1172+
/* Wait for any queued/running eviction or restore work to finish first. */
cancel_delayed_work_sync(&p->eviction_work);
1173+
cancel_delayed_work_sync(&p->restore_work);
1174+
1175+
/* Indicate to other users that MM is no longer valid */
1176+
p->mm = NULL;
1177+
1178+
/* Release the notifier; kfd_process_free_notifier is its free callback. */
mmu_notifier_put(&p->mmu_notifier);
1179+
}
1180+
11701181
static void kfd_process_notifier_release(struct mmu_notifier *mn,
11711182
struct mm_struct *mm)
11721183
{
@@ -1181,17 +1192,22 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn,
11811192
return;
11821193

11831194
mutex_lock(&kfd_processes_mutex);
1195+
/*
1196+
* Do early return if table is empty.
1197+
*
1198+
* This could potentially happen if this function is called concurrently
1199+
* by mmu_notifier and by kfd_cleanup_processes.
1200+
*
1201+
*/
1202+
if (hash_empty(kfd_processes_table)) {
1203+
mutex_unlock(&kfd_processes_mutex);
1204+
return;
1205+
}
11841206
hash_del_rcu(&p->kfd_processes);
11851207
mutex_unlock(&kfd_processes_mutex);
11861208
synchronize_srcu(&kfd_processes_srcu);
11871209

1188-
cancel_delayed_work_sync(&p->eviction_work);
1189-
cancel_delayed_work_sync(&p->restore_work);
1190-
1191-
/* Indicate to other users that MM is no longer valid */
1192-
p->mm = NULL;
1193-
1194-
mmu_notifier_put(&p->mmu_notifier);
1210+
kfd_process_notifier_release_internal(p);
11951211
}
11961212

11971213
static const struct mmu_notifier_ops kfd_process_mmu_notifier_ops = {
@@ -1200,6 +1216,43 @@ static const struct mmu_notifier_ops kfd_process_mmu_notifier_ops = {
12001216
.free_notifier = kfd_process_free_notifier,
12011217
};
12021218

1219+
/*
 * kfd_cleanup_processes - release every kfd_process still in the table.
 *
 * Called from kfd_exit(). Handles the case where the driver is being
 * unloaded before all mm_struct are released: the remaining kfd_process
 * objects must be freed here while avoiding races with the mmu_notifier
 * release callback, which might try to free them as well.
 *
 * Takes kfd_processes_mutex while emptying kfd_processes_table and waits
 * (synchronize_srcu, mmu_notifier_synchronize) before returning.
 */
1225+
void kfd_cleanup_processes(void)
1226+
{
1227+
struct kfd_process *p;
1228+
struct hlist_node *p_temp;
1229+
unsigned int temp;
1230+
HLIST_HEAD(cleanup_list);
1231+
1232+
/*
 * Move all remaining kfd_process from the process table to a
 * temp list for processing. Once the table is empty,
 * kfd_process_notifier_release() sees hash_empty() and returns early,
 * so it does not release the same kfd_process a second time.
 *
 * NOTE(review): synchronize_srcu() is called per entry, between
 * hash_del_rcu() and hlist_add_head(); presumably this waits for SRCU
 * readers before the same hlist_node is relinked into cleanup_list -
 * confirm before reordering or hoisting it out of the loop.
 */
1238+
mutex_lock(&kfd_processes_mutex);
1239+
hash_for_each_safe(kfd_processes_table, temp, p_temp, p, kfd_processes) {
1240+
hash_del_rcu(&p->kfd_processes);
1241+
synchronize_srcu(&kfd_processes_srcu);
1242+
hlist_add_head(&p->kfd_processes, &cleanup_list);
1243+
}
1244+
mutex_unlock(&kfd_processes_mutex);
1245+
1246+
/* Tear down each collected process outside the table mutex. */
hlist_for_each_entry_safe(p, p_temp, &cleanup_list, kfd_processes)
1247+
kfd_process_notifier_release_internal(p);
1248+
1249+
/*
 * Ensures that all outstanding free_notifier get called, triggering
 * the release of the kfd_process struct.
 */
1253+
mmu_notifier_synchronize();
1254+
}
1255+
12031256
static int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep)
12041257
{
12051258
unsigned long offset;

0 commit comments

Comments
 (0)