Skip to content

Commit d06af58

Browse files
Zhigang Luo authored and alexdeucher committed
amd/amdkfd: sync all devices to wait for all processes to be evicted
If more than one device is doing a reset in parallel, the first device calls kfd_suspend_all_processes() to evict all processes on all devices, and this call takes time to finish. The other devices start their reset and recovery without waiting. If a process has not been evicted before recovery begins, it will be restored, which then causes a page fault. Fix this by holding kfd_processes_mutex across the kfd_suspend_all_processes() / kfd_resume_all_processes() calls, so that the other devices block until eviction has completed. Signed-off-by: Zhigang Luo <[email protected]> Reviewed-by: Felix Kuehling <[email protected]> Signed-off-by: Alex Deucher <[email protected]>
1 parent e33997e commit d06af58

File tree

1 file changed

+6
-11
lines changed

1 file changed

+6
-11
lines changed

drivers/gpu/drm/amd/amdkfd/kfd_device.c

Lines changed: 6 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -960,20 +960,17 @@ void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm)
960960
{
961961
struct kfd_node *node;
962962
int i;
963-
int count;
964963

965964
if (!kfd->init_complete)
966965
return;
967966

968967
/* for runtime suspend, skip locking kfd */
969968
if (!run_pm) {
970969
mutex_lock(&kfd_processes_mutex);
971-
count = ++kfd_locked;
972-
mutex_unlock(&kfd_processes_mutex);
973-
974970
/* For first KFD device suspend all the KFD processes */
975-
if (count == 1)
971+
if (++kfd_locked == 1)
976972
kfd_suspend_all_processes();
973+
mutex_unlock(&kfd_processes_mutex);
977974
}
978975

979976
for (i = 0; i < kfd->num_nodes; i++) {
@@ -984,7 +981,7 @@ void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm)
984981

985982
int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm)
986983
{
987-
int ret, count, i;
984+
int ret, i;
988985

989986
if (!kfd->init_complete)
990987
return 0;
@@ -998,12 +995,10 @@ int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm)
998995
/* for runtime resume, skip unlocking kfd */
999996
if (!run_pm) {
1000997
mutex_lock(&kfd_processes_mutex);
1001-
count = --kfd_locked;
1002-
mutex_unlock(&kfd_processes_mutex);
1003-
1004-
WARN_ONCE(count < 0, "KFD suspend / resume ref. error");
1005-
if (count == 0)
998+
if (--kfd_locked == 0)
1006999
ret = kfd_resume_all_processes();
1000+
WARN_ONCE(kfd_locked < 0, "KFD suspend / resume ref. error");
1001+
mutex_unlock(&kfd_processes_mutex);
10071002
}
10081003

10091004
return ret;

0 commit comments

Comments
 (0)