Skip to content

Commit 1a799c4

Browse files
PhilipYangA authored and
alexdeucher committed
drm/amdkfd: Fix double release compute pasid
If kfd_process_device_init_vm returns failure after the vm is converted to a compute vm and vm->pasid is set to the compute pasid, KFD will not take the pdd->drm_file reference. As a result, the drm close file handler may be called to release the compute pasid before the KFD process destroy worker releases the same pasid and sets vm->pasid to zero; this generates the WARNING backtrace and NULL pointer access below. Add the helper amdgpu_amdkfd_gpuvm_set_vm_pasid and call it as the last step of kfd_process_device_init_vm, to ensure that the vm pasid is either the original pasid if acquiring the vm failed, or the compute pasid with the pdd->drm_file reference taken, to avoid a double release of the same pasid. amdgpu: Failed to create process VM object ida_free called for id=32770 which is not allocated. WARNING: CPU: 57 PID: 72542 at ../lib/idr.c:522 ida_free+0x96/0x140 RIP: 0010:ida_free+0x96/0x140 Call Trace: amdgpu_pasid_free_delayed+0xe1/0x2a0 [amdgpu] amdgpu_driver_postclose_kms+0x2d8/0x340 [amdgpu] drm_file_free.part.13+0x216/0x270 [drm] drm_close_helper.isra.14+0x60/0x70 [drm] drm_release+0x6e/0xf0 [drm] __fput+0xcc/0x280 ____fput+0xe/0x20 task_work_run+0x96/0xc0 do_exit+0x3d0/0xc10 BUG: kernel NULL pointer dereference, address: 0000000000000000 RIP: 0010:ida_free+0x76/0x140 Call Trace: amdgpu_pasid_free_delayed+0xe1/0x2a0 [amdgpu] amdgpu_driver_postclose_kms+0x2d8/0x340 [amdgpu] drm_file_free.part.13+0x216/0x270 [drm] drm_close_helper.isra.14+0x60/0x70 [drm] drm_release+0x6e/0xf0 [drm] __fput+0xcc/0x280 ____fput+0xe/0x20 task_work_run+0x96/0xc0 do_exit+0x3d0/0xc10 Signed-off-by: Philip Yang <[email protected]> Reviewed-by: Felix Kuehling <[email protected]> Signed-off-by: Alex Deucher <[email protected]>
1 parent 29d48b8 commit 1a799c4

File tree

3 files changed

+40
-15
lines changed

3 files changed

+40
-15
lines changed

drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -270,8 +270,10 @@ int amdgpu_amdkfd_get_pcie_bandwidth_mbytes(struct amdgpu_device *adev, bool is_
270270
(&((struct amdgpu_fpriv *) \
271271
((struct drm_file *)(drm_priv))->driver_priv)->vm)
272272

273+
int amdgpu_amdkfd_gpuvm_set_vm_pasid(struct amdgpu_device *adev,
274+
struct file *filp, u32 pasid);
273275
int amdgpu_amdkfd_gpuvm_acquire_process_vm(struct amdgpu_device *adev,
274-
struct file *filp, u32 pasid,
276+
struct file *filp,
275277
void **process_info,
276278
struct dma_fence **ef);
277279
void amdgpu_amdkfd_gpuvm_release_process_vm(struct amdgpu_device *adev,

drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c

Lines changed: 28 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -1429,10 +1429,9 @@ static void amdgpu_amdkfd_gpuvm_unpin_bo(struct amdgpu_bo *bo)
14291429
amdgpu_bo_unreserve(bo);
14301430
}
14311431

1432-
int amdgpu_amdkfd_gpuvm_acquire_process_vm(struct amdgpu_device *adev,
1433-
struct file *filp, u32 pasid,
1434-
void **process_info,
1435-
struct dma_fence **ef)
1432+
int amdgpu_amdkfd_gpuvm_set_vm_pasid(struct amdgpu_device *adev,
1433+
struct file *filp, u32 pasid)
1434+
14361435
{
14371436
struct amdgpu_fpriv *drv_priv;
14381437
struct amdgpu_vm *avm;
@@ -1443,10 +1442,6 @@ int amdgpu_amdkfd_gpuvm_acquire_process_vm(struct amdgpu_device *adev,
14431442
return ret;
14441443
avm = &drv_priv->vm;
14451444

1446-
/* Already a compute VM? */
1447-
if (avm->process_info)
1448-
return -EINVAL;
1449-
14501445
/* Free the original amdgpu allocated pasid,
14511446
* will be replaced with kfd allocated pasid.
14521447
*/
@@ -1455,14 +1450,36 @@ int amdgpu_amdkfd_gpuvm_acquire_process_vm(struct amdgpu_device *adev,
14551450
amdgpu_vm_set_pasid(adev, avm, 0);
14561451
}
14571452

1458-
/* Convert VM into a compute VM */
1459-
ret = amdgpu_vm_make_compute(adev, avm);
1453+
ret = amdgpu_vm_set_pasid(adev, avm, pasid);
14601454
if (ret)
14611455
return ret;
14621456

1463-
ret = amdgpu_vm_set_pasid(adev, avm, pasid);
1457+
return 0;
1458+
}
1459+
1460+
int amdgpu_amdkfd_gpuvm_acquire_process_vm(struct amdgpu_device *adev,
1461+
struct file *filp,
1462+
void **process_info,
1463+
struct dma_fence **ef)
1464+
{
1465+
struct amdgpu_fpriv *drv_priv;
1466+
struct amdgpu_vm *avm;
1467+
int ret;
1468+
1469+
ret = amdgpu_file_to_fpriv(filp, &drv_priv);
14641470
if (ret)
14651471
return ret;
1472+
avm = &drv_priv->vm;
1473+
1474+
/* Already a compute VM? */
1475+
if (avm->process_info)
1476+
return -EINVAL;
1477+
1478+
/* Convert VM into a compute VM */
1479+
ret = amdgpu_vm_make_compute(adev, avm);
1480+
if (ret)
1481+
return ret;
1482+
14661483
/* Initialize KFD part of the VM and process info */
14671484
ret = init_kfd_vm(avm, process_info, ef);
14681485
if (ret)

drivers/gpu/drm/amd/amdkfd/kfd_process.c

Lines changed: 9 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1576,9 +1576,9 @@ int kfd_process_device_init_vm(struct kfd_process_device *pdd,
15761576
p = pdd->process;
15771577
dev = pdd->dev;
15781578

1579-
ret = amdgpu_amdkfd_gpuvm_acquire_process_vm(
1580-
dev->adev, drm_file, p->pasid,
1581-
&p->kgd_process_info, &p->ef);
1579+
ret = amdgpu_amdkfd_gpuvm_acquire_process_vm(dev->adev, drm_file,
1580+
&p->kgd_process_info,
1581+
&p->ef);
15821582
if (ret) {
15831583
pr_err("Failed to create process VM object\n");
15841584
return ret;
@@ -1593,10 +1593,16 @@ int kfd_process_device_init_vm(struct kfd_process_device *pdd,
15931593
if (ret)
15941594
goto err_init_cwsr;
15951595

1596+
ret = amdgpu_amdkfd_gpuvm_set_vm_pasid(dev->adev, drm_file, p->pasid);
1597+
if (ret)
1598+
goto err_set_pasid;
1599+
15961600
pdd->drm_file = drm_file;
15971601

15981602
return 0;
15991603

1604+
err_set_pasid:
1605+
kfd_process_device_destroy_cwsr_dgpu(pdd);
16001606
err_init_cwsr:
16011607
kfd_process_device_destroy_ib_mem(pdd);
16021608
err_reserve_ib_mem:

0 commit comments

Comments
 (0)