Skip to content

Commit d69fd95

Browse files
mukjoshi and alexdeucher
authored and committed
drm/amdkfd: Fix circular locking dependency warning
[ 150.887733] ======================================================
[ 150.893903] WARNING: possible circular locking dependency detected
[ 150.905917] ------------------------------------------------------
[ 150.912129] kfdtest/4081 is trying to acquire lock:
[ 150.917002] ffff8f7f3762e118 (&mm->mmap_sem#2){++++}, at: __might_fault+0x3e/0x90
[ 150.924490] but task is already holding lock:
[ 150.930320] ffff8f7f49d229e8 (&dqm->lock_hidden){+.+.}, at: destroy_queue_cpsch+0x29/0x210 [amdgpu]
[ 150.939432] which lock already depends on the new lock.
[ 150.947603] the existing dependency chain (in reverse order) is:
[ 150.955074] -> #3 (&dqm->lock_hidden){+.+.}:
[ 150.960822]        __mutex_lock+0xa1/0x9f0
[ 150.964996]        evict_process_queues_cpsch+0x22/0x120 [amdgpu]
[ 150.971155]        kfd_process_evict_queues+0x3b/0xc0 [amdgpu]
[ 150.977054]        kgd2kfd_quiesce_mm+0x25/0x60 [amdgpu]
[ 150.982442]        amdgpu_amdkfd_evict_userptr+0x35/0x70 [amdgpu]
[ 150.988615]        amdgpu_mn_invalidate_hsa+0x41/0x60 [amdgpu]
[ 150.994448]        __mmu_notifier_invalidate_range_start+0xa4/0x240
[ 151.000714]        copy_page_range+0xd70/0xd80
[ 151.005159]        dup_mm+0x3ca/0x550
[ 151.008816]        copy_process+0x1bdc/0x1c70
[ 151.013183]        _do_fork+0x76/0x6c0
[ 151.016929]        __x64_sys_clone+0x8c/0xb0
[ 151.021201]        do_syscall_64+0x4a/0x1d0
[ 151.025404]        entry_SYSCALL_64_after_hwframe+0x49/0xbe
[ 151.030977] -> #2 (&adev->notifier_lock){+.+.}:
[ 151.036993]        __mutex_lock+0xa1/0x9f0
[ 151.041168]        amdgpu_mn_invalidate_hsa+0x30/0x60 [amdgpu]
[ 151.047019]        __mmu_notifier_invalidate_range_start+0xa4/0x240
[ 151.053277]        copy_page_range+0xd70/0xd80
[ 151.057722]        dup_mm+0x3ca/0x550
[ 151.061388]        copy_process+0x1bdc/0x1c70
[ 151.065748]        _do_fork+0x76/0x6c0
[ 151.069499]        __x64_sys_clone+0x8c/0xb0
[ 151.073765]        do_syscall_64+0x4a/0x1d0
[ 151.077952]        entry_SYSCALL_64_after_hwframe+0x49/0xbe
[ 151.083523] -> #1 (mmu_notifier_invalidate_range_start){+.+.}:
[ 151.090833]        change_protection+0x802/0xab0
[ 151.095448]        mprotect_fixup+0x187/0x2d0
[ 151.099801]        setup_arg_pages+0x124/0x250
[ 151.104251]        load_elf_binary+0x3a4/0x1464
[ 151.108781]        search_binary_handler+0x6c/0x210
[ 151.113656]        __do_execve_file.isra.40+0x7f7/0xa50
[ 151.118875]        do_execve+0x21/0x30
[ 151.122632]        call_usermodehelper_exec_async+0x17e/0x190
[ 151.128393]        ret_from_fork+0x24/0x30
[ 151.132489] -> #0 (&mm->mmap_sem#2){++++}:
[ 151.138064]        __lock_acquire+0x11a1/0x1490
[ 151.142597]        lock_acquire+0x90/0x180
[ 151.146694]        __might_fault+0x68/0x90
[ 151.150879]        read_sdma_queue_counter+0x5f/0xb0 [amdgpu]
[ 151.156693]        update_sdma_queue_past_activity_stats+0x3b/0x90 [amdgpu]
[ 151.163725]        destroy_queue_cpsch+0x1ae/0x210 [amdgpu]
[ 151.169373]        pqm_destroy_queue+0xf0/0x250 [amdgpu]
[ 151.174762]        kfd_ioctl_destroy_queue+0x32/0x70 [amdgpu]
[ 151.180577]        kfd_ioctl+0x223/0x400 [amdgpu]
[ 151.185284]        ksys_ioctl+0x8f/0xb0
[ 151.189118]        __x64_sys_ioctl+0x16/0x20
[ 151.193389]        do_syscall_64+0x4a/0x1d0
[ 151.197569]        entry_SYSCALL_64_after_hwframe+0x49/0xbe
[ 151.203141] other info that might help us debug this:
[ 151.211140] Chain exists of: &mm->mmap_sem#2 --> &adev->notifier_lock --> &dqm->lock_hidden
[ 151.222535] Possible unsafe locking scenario:
[ 151.228447]        CPU0                    CPU1
[ 151.232971]        ----                    ----
[ 151.237502]   lock(&dqm->lock_hidden);
[ 151.241254]                                lock(&adev->notifier_lock);
[ 151.247774]                                lock(&dqm->lock_hidden);
[ 151.254038]   lock(&mm->mmap_sem#2);

This commit fixes the warning by ensuring get_user() is not called while reading SDMA stats with dqm_lock held, as get_user() could cause a page fault, which leads to the circular locking scenario.

Signed-off-by: Mukul Joshi <[email protected]>
Reviewed-by: Felix Kuehling <[email protected]>
Signed-off-by: Alex Deucher <[email protected]>
1 parent 7ee78af commit d69fd95

File tree

3 files changed

+158
-61
lines changed

3 files changed

+158
-61
lines changed

drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c

Lines changed: 34 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -153,52 +153,30 @@ static void decrement_queue_count(struct device_queue_manager *dqm,
153153
dqm->active_cp_queue_count--;
154154
}
155155

156-
int read_sdma_queue_counter(struct queue *q, uint64_t *val)
156+
int read_sdma_queue_counter(uint64_t q_rptr, uint64_t *val)
157157
{
158158
int ret;
159159
uint64_t tmp = 0;
160160

161-
if (!q || !val)
161+
if (!val)
162162
return -EINVAL;
163163
/*
164164
* SDMA activity counter is stored at queue's RPTR + 0x8 location.
165165
*/
166-
if (!access_ok((const void __user *)((uint64_t)q->properties.read_ptr +
166+
if (!access_ok((const void __user *)(q_rptr +
167167
sizeof(uint64_t)), sizeof(uint64_t))) {
168168
pr_err("Can't access sdma queue activity counter\n");
169169
return -EFAULT;
170170
}
171171

172-
ret = get_user(tmp, (uint64_t *)((uint64_t)(q->properties.read_ptr) +
173-
sizeof(uint64_t)));
172+
ret = get_user(tmp, (uint64_t *)(q_rptr + sizeof(uint64_t)));
174173
if (!ret) {
175174
*val = tmp;
176175
}
177176

178177
return ret;
179178
}
180179

181-
static int update_sdma_queue_past_activity_stats(struct kfd_process_device *pdd,
182-
struct queue *q)
183-
{
184-
int ret;
185-
uint64_t val = 0;
186-
187-
if (!pdd)
188-
return -ENODEV;
189-
190-
ret = read_sdma_queue_counter(q, &val);
191-
if (ret) {
192-
pr_err("Failed to read SDMA queue counter for queue: %d\n",
193-
q->properties.queue_id);
194-
return ret;
195-
}
196-
197-
pdd->sdma_past_activity_counter += val;
198-
199-
return ret;
200-
}
201-
202180
static int allocate_doorbell(struct qcm_process_device *qpd, struct queue *q)
203181
{
204182
struct kfd_dev *dev = qpd->dqm->dev;
@@ -533,11 +511,6 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm,
533511
if (retval == -ETIME)
534512
qpd->reset_wavefronts = true;
535513

536-
/* Get the SDMA queue stats */
537-
if ((q->properties.type == KFD_QUEUE_TYPE_SDMA) ||
538-
(q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)) {
539-
update_sdma_queue_past_activity_stats(qpd_to_pdd(qpd), q);
540-
}
541514

542515
mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
543516

@@ -573,9 +546,23 @@ static int destroy_queue_nocpsch(struct device_queue_manager *dqm,
573546
struct queue *q)
574547
{
575548
int retval;
549+
uint64_t sdma_val = 0;
550+
struct kfd_process_device *pdd = qpd_to_pdd(qpd);
551+
552+
/* Get the SDMA queue stats */
553+
if ((q->properties.type == KFD_QUEUE_TYPE_SDMA) ||
554+
(q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)) {
555+
retval = read_sdma_queue_counter((uint64_t)q->properties.read_ptr,
556+
&sdma_val);
557+
if (retval)
558+
pr_err("Failed to read SDMA queue counter for queue: %d\n",
559+
q->properties.queue_id);
560+
}
576561

577562
dqm_lock(dqm);
578563
retval = destroy_queue_nocpsch_locked(dqm, qpd, q);
564+
if (!retval)
565+
pdd->sdma_past_activity_counter += sdma_val;
579566
dqm_unlock(dqm);
580567

581568
return retval;
@@ -1480,6 +1467,18 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
14801467
{
14811468
int retval;
14821469
struct mqd_manager *mqd_mgr;
1470+
uint64_t sdma_val = 0;
1471+
struct kfd_process_device *pdd = qpd_to_pdd(qpd);
1472+
1473+
/* Get the SDMA queue stats */
1474+
if ((q->properties.type == KFD_QUEUE_TYPE_SDMA) ||
1475+
(q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)) {
1476+
retval = read_sdma_queue_counter((uint64_t)q->properties.read_ptr,
1477+
&sdma_val);
1478+
if (retval)
1479+
pr_err("Failed to read SDMA queue counter for queue: %d\n",
1480+
q->properties.queue_id);
1481+
}
14831482

14841483
retval = 0;
14851484

@@ -1501,10 +1500,11 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
15011500

15021501
deallocate_doorbell(qpd, q);
15031502

1504-
if (q->properties.type == KFD_QUEUE_TYPE_SDMA)
1505-
deallocate_sdma_queue(dqm, q);
1506-
else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
1503+
if ((q->properties.type == KFD_QUEUE_TYPE_SDMA) ||
1504+
(q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)) {
15071505
deallocate_sdma_queue(dqm, q);
1506+
pdd->sdma_past_activity_counter += sdma_val;
1507+
}
15081508

15091509
list_del(&q->list);
15101510
qpd->queue_count--;
@@ -1520,11 +1520,6 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
15201520
}
15211521
}
15221522

1523-
/* Get the SDMA queue stats */
1524-
if ((q->properties.type == KFD_QUEUE_TYPE_SDMA) ||
1525-
(q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)) {
1526-
update_sdma_queue_past_activity_stats(qpd_to_pdd(qpd), q);
1527-
}
15281523
/*
15291524
* Unconditionally decrement this counter, regardless of the queue's
15301525
* type

drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -251,6 +251,5 @@ static inline void dqm_unlock(struct device_queue_manager *dqm)
251251
mutex_unlock(&dqm->lock_hidden);
252252
}
253253

254-
int read_sdma_queue_counter(struct queue *q, uint64_t *val);
255-
254+
int read_sdma_queue_counter(uint64_t q_rptr, uint64_t *val);
256255
#endif /* KFD_DEVICE_QUEUE_MANAGER_H_ */

drivers/gpu/drm/amd/amdkfd/kfd_process.c

Lines changed: 123 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,13 @@ struct kfd_sdma_activity_handler_workarea {
8686
uint64_t sdma_activity_counter;
8787
};
8888

89+
struct temp_sdma_queue_list {
90+
uint64_t rptr;
91+
uint64_t sdma_val;
92+
unsigned int queue_id;
93+
struct list_head list;
94+
};
95+
8996
static void kfd_sdma_activity_worker(struct work_struct *work)
9097
{
9198
struct kfd_sdma_activity_handler_workarea *workarea;
@@ -96,6 +103,8 @@ static void kfd_sdma_activity_worker(struct work_struct *work)
96103
struct qcm_process_device *qpd;
97104
struct device_queue_manager *dqm;
98105
int ret = 0;
106+
struct temp_sdma_queue_list sdma_q_list;
107+
struct temp_sdma_queue_list *sdma_q, *next;
99108

100109
workarea = container_of(work, struct kfd_sdma_activity_handler_workarea,
101110
sdma_activity_work);
@@ -109,41 +118,135 @@ static void kfd_sdma_activity_worker(struct work_struct *work)
109118
qpd = &pdd->qpd;
110119
if (!dqm || !qpd)
111120
return;
121+
/*
122+
* Total SDMA activity is current SDMA activity + past SDMA activity
123+
* Past SDMA count is stored in pdd.
124+
* To get the current activity counters for all active SDMA queues,
125+
* we loop over all SDMA queues and get their counts from user-space.
126+
*
127+
* We cannot call get_user() with dqm_lock held as it can cause
128+
* a circular lock dependency situation. To read the SDMA stats,
129+
* we need to do the following:
130+
*
131+
* 1. Create a temporary list of SDMA queue nodes from the qpd->queues_list,
132+
* with dqm_lock/dqm_unlock().
133+
* 2. Call get_user() for each node in temporary list without dqm_lock.
134+
* Save the SDMA count for each node and also add the count to the total
135+
* SDMA count counter.
136+
* Its possible, during this step, a few SDMA queue nodes got deleted
137+
* from the qpd->queues_list.
138+
* 3. Do a second pass over qpd->queues_list to check if any nodes got deleted.
139+
* If any node got deleted, its SDMA count would be captured in the sdma
140+
* past activity counter. So subtract the SDMA counter stored in step 2
141+
* for this node from the total SDMA count.
142+
*/
143+
INIT_LIST_HEAD(&sdma_q_list.list);
112144

113-
mm = get_task_mm(pdd->process->lead_thread);
114-
if (!mm) {
115-
return;
145+
/*
146+
* Create the temp list of all SDMA queues
147+
*/
148+
dqm_lock(dqm);
149+
150+
list_for_each_entry(q, &qpd->queues_list, list) {
151+
if ((q->properties.type != KFD_QUEUE_TYPE_SDMA) &&
152+
(q->properties.type != KFD_QUEUE_TYPE_SDMA_XGMI))
153+
continue;
154+
155+
sdma_q = kzalloc(sizeof(struct temp_sdma_queue_list), GFP_KERNEL);
156+
if (!sdma_q) {
157+
dqm_unlock(dqm);
158+
goto cleanup;
159+
}
160+
161+
INIT_LIST_HEAD(&sdma_q->list);
162+
sdma_q->rptr = (uint64_t)q->properties.read_ptr;
163+
sdma_q->queue_id = q->properties.queue_id;
164+
list_add_tail(&sdma_q->list, &sdma_q_list.list);
116165
}
117166

118-
use_mm(mm);
167+
/*
168+
* If the temp list is empty, then no SDMA queues nodes were found in
169+
* qpd->queues_list. Return the past activity count as the total sdma
170+
* count
171+
*/
172+
if (list_empty(&sdma_q_list.list)) {
173+
workarea->sdma_activity_counter = pdd->sdma_past_activity_counter;
174+
dqm_unlock(dqm);
175+
return;
176+
}
119177

120-
dqm_lock(dqm);
178+
dqm_unlock(dqm);
121179

122180
/*
123-
* Total SDMA activity is current SDMA activity + past SDMA activity
181+
* Get the usage count for each SDMA queue in temp_list.
124182
*/
125-
workarea->sdma_activity_counter = pdd->sdma_past_activity_counter;
183+
mm = get_task_mm(pdd->process->lead_thread);
184+
if (!mm)
185+
goto cleanup;
186+
187+
use_mm(mm);
188+
189+
list_for_each_entry(sdma_q, &sdma_q_list.list, list) {
190+
val = 0;
191+
ret = read_sdma_queue_counter(sdma_q->rptr, &val);
192+
if (ret) {
193+
pr_debug("Failed to read SDMA queue active counter for queue id: %d",
194+
sdma_q->queue_id);
195+
} else {
196+
sdma_q->sdma_val = val;
197+
workarea->sdma_activity_counter += val;
198+
}
199+
}
200+
201+
unuse_mm(mm);
202+
mmput(mm);
126203

127204
/*
128-
* Get the current activity counters for all active SDMA queues
205+
* Do a second iteration over qpd_queues_list to check if any SDMA
206+
* nodes got deleted while fetching SDMA counter.
129207
*/
208+
dqm_lock(dqm);
209+
210+
workarea->sdma_activity_counter += pdd->sdma_past_activity_counter;
211+
130212
list_for_each_entry(q, &qpd->queues_list, list) {
131-
if ((q->properties.type == KFD_QUEUE_TYPE_SDMA) ||
132-
(q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)) {
133-
val = 0;
134-
ret = read_sdma_queue_counter(q, &val);
135-
if (ret)
136-
pr_debug("Failed to read SDMA queue active "
137-
"counter for queue id: %d",
138-
q->properties.queue_id);
139-
else
140-
workarea->sdma_activity_counter += val;
213+
if (list_empty(&sdma_q_list.list))
214+
break;
215+
216+
if ((q->properties.type != KFD_QUEUE_TYPE_SDMA) &&
217+
(q->properties.type != KFD_QUEUE_TYPE_SDMA_XGMI))
218+
continue;
219+
220+
list_for_each_entry_safe(sdma_q, next, &sdma_q_list.list, list) {
221+
if (((uint64_t)q->properties.read_ptr == sdma_q->rptr) &&
222+
(sdma_q->queue_id == q->properties.queue_id)) {
223+
list_del(&sdma_q->list);
224+
kfree(sdma_q);
225+
break;
226+
}
141227
}
142228
}
143229

144230
dqm_unlock(dqm);
145-
unuse_mm(mm);
146-
mmput(mm);
231+
232+
/*
233+
* If temp list is not empty, it implies some queues got deleted
234+
* from qpd->queues_list during SDMA usage read. Subtract the SDMA
235+
* count for each node from the total SDMA count.
236+
*/
237+
list_for_each_entry_safe(sdma_q, next, &sdma_q_list.list, list) {
238+
workarea->sdma_activity_counter -= sdma_q->sdma_val;
239+
list_del(&sdma_q->list);
240+
kfree(sdma_q);
241+
}
242+
243+
return;
244+
245+
cleanup:
246+
list_for_each_entry_safe(sdma_q, next, &sdma_q_list.list, list) {
247+
list_del(&sdma_q->list);
248+
kfree(sdma_q);
249+
}
147250
}
148251

149252
static ssize_t kfd_procfs_show(struct kobject *kobj, struct attribute *attr,

0 commit comments

Comments
 (0)