Skip to content

Commit ff891a2

Browse files
PhilipYangA and alexdeucher
authored and committed
drm/amdkfd: check access permission to restore retry fault
Check the range access permission before restoring a GPU retry fault. If the GPU retry fault is on an address that belongs to a VMA, and the VMA does not have the read or write permission requested by the GPU, fail to restore the address. The VM fault event will then be passed back to user space. Signed-off-by: Philip Yang <[email protected]> Reviewed-by: Felix Kuehling <[email protected]> Signed-off-by: Alex Deucher <[email protected]>
1 parent f24d991 commit ff891a2

File tree

6 files changed

+39
-8
lines changed

6 files changed

+39
-8
lines changed

drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3345,12 +3345,13 @@ void amdgpu_vm_set_task_info(struct amdgpu_vm *vm)
33453345
* @adev: amdgpu device pointer
33463346
* @pasid: PASID of the VM
33473347
* @addr: Address of the fault
3348+
* @write_fault: true if this is a write fault, false if it is a read fault
33483349
*
33493350
* Try to gracefully handle a VM fault. Return true if the fault was handled and
33503351
* shouldn't be reported any more.
33513352
*/
33523353
bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
3353-
uint64_t addr)
3354+
uint64_t addr, bool write_fault)
33543355
{
33553356
bool is_compute_context = false;
33563357
struct amdgpu_bo *root;
@@ -3375,7 +3376,7 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
33753376
addr /= AMDGPU_GPU_PAGE_SIZE;
33763377

33773378
if (is_compute_context &&
3378-
!svm_range_restore_pages(adev, pasid, addr)) {
3379+
!svm_range_restore_pages(adev, pasid, addr, write_fault)) {
33793380
amdgpu_bo_unref(&root);
33803381
return true;
33813382
}

drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -448,7 +448,7 @@ void amdgpu_vm_check_compute_bug(struct amdgpu_device *adev);
448448
void amdgpu_vm_get_task_info(struct amdgpu_device *adev, u32 pasid,
449449
struct amdgpu_task_info *task_info);
450450
bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
451-
uint64_t addr);
451+
uint64_t addr, bool write_fault);
452452

453453
void amdgpu_vm_set_task_info(struct amdgpu_vm *vm);
454454

drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,7 @@ static int gmc_v10_0_process_interrupt(struct amdgpu_device *adev,
9393
struct amdgpu_iv_entry *entry)
9494
{
9595
bool retry_fault = !!(entry->src_data[1] & 0x80);
96+
bool write_fault = !!(entry->src_data[1] & 0x20);
9697
struct amdgpu_vmhub *hub = &adev->vmhub[entry->vmid_src];
9798
struct amdgpu_task_info task_info;
9899
uint32_t status = 0;
@@ -121,7 +122,7 @@ static int gmc_v10_0_process_interrupt(struct amdgpu_device *adev,
121122
/* Try to handle the recoverable page faults by filling page
122123
* tables
123124
*/
124-
if (amdgpu_vm_handle_fault(adev, entry->pasid, addr))
125+
if (amdgpu_vm_handle_fault(adev, entry->pasid, addr, write_fault))
125126
return 1;
126127
}
127128

drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -507,6 +507,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
507507
struct amdgpu_iv_entry *entry)
508508
{
509509
bool retry_fault = !!(entry->src_data[1] & 0x80);
510+
bool write_fault = !!(entry->src_data[1] & 0x20);
510511
uint32_t status = 0, cid = 0, rw = 0;
511512
struct amdgpu_task_info task_info;
512513
struct amdgpu_vmhub *hub;
@@ -537,7 +538,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
537538
/* Try to handle the recoverable page faults by filling page
538539
* tables
539540
*/
540-
if (amdgpu_vm_handle_fault(adev, entry->pasid, addr))
541+
if (amdgpu_vm_handle_fault(adev, entry->pasid, addr, write_fault))
541542
return 1;
542543
}
543544

drivers/gpu/drm/amd/amdkfd/kfd_svm.c

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2400,9 +2400,29 @@ svm_range_count_fault(struct amdgpu_device *adev, struct kfd_process *p,
24002400
WRITE_ONCE(pdd->faults, pdd->faults + 1);
24012401
}
24022402

2403+
static bool
2404+
svm_fault_allowed(struct mm_struct *mm, uint64_t addr, bool write_fault)
2405+
{
2406+
unsigned long requested = VM_READ;
2407+
struct vm_area_struct *vma;
2408+
2409+
if (write_fault)
2410+
requested |= VM_WRITE;
2411+
2412+
vma = find_vma(mm, addr << PAGE_SHIFT);
2413+
if (!vma || (addr << PAGE_SHIFT) < vma->vm_start) {
2414+
pr_debug("address 0x%llx VMA is removed\n", addr);
2415+
return true;
2416+
}
2417+
2418+
pr_debug("requested 0x%lx, vma permission flags 0x%lx\n", requested,
2419+
vma->vm_flags);
2420+
return (vma->vm_flags & requested) == requested;
2421+
}
2422+
24032423
int
24042424
svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
2405-
uint64_t addr)
2425+
uint64_t addr, bool write_fault)
24062426
{
24072427
struct mm_struct *mm = NULL;
24082428
struct svm_range_list *svms;
@@ -2484,6 +2504,13 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
24842504
goto out_unlock_range;
24852505
}
24862506

2507+
if (!svm_fault_allowed(mm, addr, write_fault)) {
2508+
pr_debug("fault addr 0x%llx no %s permission\n", addr,
2509+
write_fault ? "write" : "read");
2510+
r = -EPERM;
2511+
goto out_unlock_range;
2512+
}
2513+
24872514
best_loc = svm_range_best_restore_location(prange, adev, &gpuidx);
24882515
if (best_loc == -1) {
24892516
pr_debug("svms %p failed get best restore loc [0x%lx 0x%lx]\n",

drivers/gpu/drm/amd/amdkfd/kfd_svm.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -175,7 +175,7 @@ int svm_range_split_by_granularity(struct kfd_process *p, struct mm_struct *mm,
175175
unsigned long addr, struct svm_range *parent,
176176
struct svm_range *prange);
177177
int svm_range_restore_pages(struct amdgpu_device *adev,
178-
unsigned int pasid, uint64_t addr);
178+
unsigned int pasid, uint64_t addr, bool write_fault);
179179
int svm_range_schedule_evict_svm_bo(struct amdgpu_amdkfd_fence *fence);
180180
void svm_range_add_list_work(struct svm_range_list *svms,
181181
struct svm_range *prange, struct mm_struct *mm,
@@ -209,7 +209,8 @@ static inline void svm_range_list_fini(struct kfd_process *p)
209209
}
210210

211211
static inline int svm_range_restore_pages(struct amdgpu_device *adev,
212-
unsigned int pasid, uint64_t addr)
212+
unsigned int pasid, uint64_t addr,
213+
bool write_fault)
213214
{
214215
return -EFAULT;
215216
}

0 commit comments

Comments
 (0)