Skip to content

Commit b8f67b9

Browse files
contactshashanksharma authored and alexdeucher committed
drm/amdgpu: change vm->task_info handling
This patch changes the handling and lifecycle of the vm->task_info object. The major changes are: - vm->task_info is a dynamically allocated ptr now, and its usage is reference counted. - introducing two new helper funcs for task_info lifecycle management - amdgpu_vm_get_task_info: reference counts up task_info before returning this info - amdgpu_vm_put_task_info: reference counts down task_info - last put to task_info() frees task_info from the vm. This patch also does logistical changes required for existing usage of vm->task_info. V2: Do not block all the prints when task_info not found (Felix) V3: Fixed review comments from Felix - Fix wrong indentation - No debug message for -ENOMEM - Add NULL check for task_info - Do not duplicate the debug messages (ti vs no ti) - Get first reference of task_info in vm_init(), put last in vm_fini() V4: Fixed review comments from Felix - fix double reference increment in create_task_info - change amdgpu_vm_get_task_info_pasid - additional changes in amdgpu_gem.c while porting Cc: Christian Koenig <[email protected]> Cc: Alex Deucher <[email protected]> Cc: Felix Kuehling <[email protected]> Reviewed-by: Felix Kuehling <[email protected]> Signed-off-by: Shashank Sharma <[email protected]> Signed-off-by: Alex Deucher <[email protected]>
1 parent 68e05b9 commit b8f67b9

File tree

14 files changed

+259
-129
lines changed

14 files changed

+259
-129
lines changed

drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1782,9 +1782,14 @@ static int amdgpu_debugfs_vm_info_show(struct seq_file *m, void *unused)
17821782
list_for_each_entry(file, &dev->filelist, lhead) {
17831783
struct amdgpu_fpriv *fpriv = file->driver_priv;
17841784
struct amdgpu_vm *vm = &fpriv->vm;
1785+
struct amdgpu_task_info *ti;
1786+
1787+
ti = amdgpu_vm_get_task_info_vm(vm);
1788+
if (ti) {
1789+
seq_printf(m, "pid:%d\tProcess:%s ----------\n", ti->pid, ti->process_name);
1790+
amdgpu_vm_put_task_info(ti);
1791+
}
17851792

1786-
seq_printf(m, "pid:%d\tProcess:%s ----------\n",
1787-
vm->task_info.pid, vm->task_info.process_name);
17881793
r = amdgpu_bo_reserve(vm->root.bo, true);
17891794
if (r)
17901795
break;

drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -208,9 +208,15 @@ static int amdgpu_gem_object_open(struct drm_gem_object *obj,
208208
if (!WARN_ON(!vm->process_info->eviction_fence)) {
209209
r = amdgpu_amdkfd_bo_validate_and_fence(abo, AMDGPU_GEM_DOMAIN_GTT,
210210
&vm->process_info->eviction_fence->base);
211-
if (r)
212-
dev_warn(adev->dev, "%d: validate_and_fence failed: %d\n",
213-
vm->task_info.pid, r);
211+
if (r) {
212+
struct amdgpu_task_info *ti = amdgpu_vm_get_task_info_vm(vm);
213+
214+
dev_warn(adev->dev, "validate_and_fence failed: %d\n", r);
215+
if (ti) {
216+
dev_warn(adev->dev, "pid %d\n", ti->pid);
217+
amdgpu_vm_put_task_info(ti);
218+
}
219+
}
214220
}
215221
mutex_unlock(&vm->process_info->lock);
216222

drivers/gpu/drm/amd/amdgpu/amdgpu_job.c

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
3535
{
3636
struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
3737
struct amdgpu_job *job = to_amdgpu_job(s_job);
38-
struct amdgpu_task_info ti;
38+
struct amdgpu_task_info *ti;
3939
struct amdgpu_device *adev = ring->adev;
4040
int idx;
4141
int r;
@@ -48,7 +48,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
4848
return DRM_GPU_SCHED_STAT_ENODEV;
4949
}
5050

51-
memset(&ti, 0, sizeof(struct amdgpu_task_info));
51+
5252
adev->job_hang = true;
5353

5454
if (amdgpu_gpu_recovery &&
@@ -58,12 +58,16 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
5858
goto exit;
5959
}
6060

61-
amdgpu_vm_get_task_info(ring->adev, job->pasid, &ti);
6261
DRM_ERROR("ring %s timeout, signaled seq=%u, emitted seq=%u\n",
63-
job->base.sched->name, atomic_read(&ring->fence_drv.last_seq),
64-
ring->fence_drv.sync_seq);
65-
DRM_ERROR("Process information: process %s pid %d thread %s pid %d\n",
66-
ti.process_name, ti.tgid, ti.task_name, ti.pid);
62+
job->base.sched->name, atomic_read(&ring->fence_drv.last_seq),
63+
ring->fence_drv.sync_seq);
64+
65+
ti = amdgpu_vm_get_task_info_pasid(ring->adev, job->pasid);
66+
if (ti) {
67+
DRM_ERROR("Process information: process %s pid %d thread %s pid %d\n",
68+
ti->process_name, ti->tgid, ti->task_name, ti->pid);
69+
amdgpu_vm_put_task_info(ti);
70+
}
6771

6872
dma_fence_set_error(&s_job->s_fence->finished, -ETIME);
6973

drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -230,8 +230,16 @@ void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
230230

231231
coredump->reset_vram_lost = vram_lost;
232232

233-
if (reset_context->job && reset_context->job->vm)
234-
coredump->reset_task_info = reset_context->job->vm->task_info;
233+
if (reset_context->job && reset_context->job->vm) {
234+
struct amdgpu_task_info *ti;
235+
struct amdgpu_vm *vm = reset_context->job->vm;
236+
237+
ti = amdgpu_vm_get_task_info_vm(vm);
238+
if (ti) {
239+
coredump->reset_task_info = *ti;
240+
amdgpu_vm_put_task_info(ti);
241+
}
242+
}
235243

236244
coredump->adev = adev;
237245

drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c

Lines changed: 115 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -513,8 +513,14 @@ int amdgpu_vm_validate(struct amdgpu_device *adev, struct amdgpu_vm *vm,
513513
bo = bo_base->bo;
514514

515515
if (dma_resv_locking_ctx(bo->tbo.base.resv) != ticket) {
516-
pr_warn_ratelimited("Evicted user BO is not reserved in pid %d\n",
517-
vm->task_info.pid);
516+
struct amdgpu_task_info *ti = amdgpu_vm_get_task_info_vm(vm);
517+
518+
pr_warn_ratelimited("Evicted user BO is not reserved\n");
519+
if (ti) {
520+
pr_warn_ratelimited("pid %d\n", ti->pid);
521+
amdgpu_vm_put_task_info(ti);
522+
}
523+
518524
return -EINVAL;
519525
}
520526

@@ -2221,6 +2227,108 @@ long amdgpu_vm_wait_idle(struct amdgpu_vm *vm, long timeout)
22212227
return dma_fence_wait_timeout(vm->last_unlocked, true, timeout);
22222228
}
22232229

2230+
static void amdgpu_vm_destroy_task_info(struct kref *kref)
2231+
{
2232+
struct amdgpu_task_info *ti = container_of(kref, struct amdgpu_task_info, refcount);
2233+
2234+
kfree(ti);
2235+
}
2236+
2237+
static inline struct amdgpu_vm *
2238+
amdgpu_vm_get_vm_from_pasid(struct amdgpu_device *adev, u32 pasid)
2239+
{
2240+
struct amdgpu_vm *vm;
2241+
unsigned long flags;
2242+
2243+
xa_lock_irqsave(&adev->vm_manager.pasids, flags);
2244+
vm = xa_load(&adev->vm_manager.pasids, pasid);
2245+
xa_unlock_irqrestore(&adev->vm_manager.pasids, flags);
2246+
2247+
return vm;
2248+
}
2249+
2250+
/**
2251+
* amdgpu_vm_put_task_info - reference down the vm task_info ptr
2252+
*
2253+
* @task_info: task_info struct under discussion.
2254+
*
2255+
* frees the vm task_info ptr at the last put
2256+
*/
2257+
void amdgpu_vm_put_task_info(struct amdgpu_task_info *task_info)
2258+
{
2259+
kref_put(&task_info->refcount, amdgpu_vm_destroy_task_info);
2260+
}
2261+
2262+
/**
2263+
* amdgpu_vm_get_task_info_vm - Extracts task info for a vm.
2264+
*
2265+
* @vm: VM to get info from
2266+
*
2267+
* Returns the reference counted task_info structure, which must be
2268+
* referenced down with amdgpu_vm_put_task_info.
2269+
*/
2270+
struct amdgpu_task_info *
2271+
amdgpu_vm_get_task_info_vm(struct amdgpu_vm *vm)
2272+
{
2273+
struct amdgpu_task_info *ti = NULL;
2274+
2275+
if (vm) {
2276+
ti = vm->task_info;
2277+
kref_get(&vm->task_info->refcount);
2278+
}
2279+
2280+
return ti;
2281+
}
2282+
2283+
/**
2284+
* amdgpu_vm_get_task_info_pasid - Extracts task info for a PASID.
2285+
*
2286+
* @adev: drm device pointer
2287+
* @pasid: PASID identifier for VM
2288+
*
2289+
* Returns the reference counted task_info structure, which must be
2290+
* referenced down with amdgpu_vm_put_task_info.
2291+
*/
2292+
struct amdgpu_task_info *
2293+
amdgpu_vm_get_task_info_pasid(struct amdgpu_device *adev, u32 pasid)
2294+
{
2295+
return amdgpu_vm_get_task_info_vm(
2296+
amdgpu_vm_get_vm_from_pasid(adev, pasid));
2297+
}
2298+
2299+
static int amdgpu_vm_create_task_info(struct amdgpu_vm *vm)
2300+
{
2301+
vm->task_info = kzalloc(sizeof(struct amdgpu_task_info), GFP_KERNEL);
2302+
if (!vm->task_info)
2303+
return -ENOMEM;
2304+
2305+
kref_init(&vm->task_info->refcount);
2306+
return 0;
2307+
}
2308+
2309+
/**
2310+
* amdgpu_vm_set_task_info - Sets VMs task info.
2311+
*
2312+
* @vm: vm for which to set the info
2313+
*/
2314+
void amdgpu_vm_set_task_info(struct amdgpu_vm *vm)
2315+
{
2316+
if (!vm->task_info)
2317+
return;
2318+
2319+
if (vm->task_info->pid == current->pid)
2320+
return;
2321+
2322+
vm->task_info->pid = current->pid;
2323+
get_task_comm(vm->task_info->task_name, current);
2324+
2325+
if (current->group_leader->mm != current->mm)
2326+
return;
2327+
2328+
vm->task_info->tgid = current->group_leader->pid;
2329+
get_task_comm(vm->task_info->process_name, current->group_leader);
2330+
}
2331+
22242332
/**
22252333
* amdgpu_vm_init - initialize a vm instance
22262334
*
@@ -2306,6 +2414,10 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
23062414
if (r)
23072415
goto error_free_root;
23082416

2417+
r = amdgpu_vm_create_task_info(vm);
2418+
if (r)
2419+
DRM_DEBUG("Failed to create task info for VM\n");
2420+
23092421
amdgpu_bo_unreserve(vm->root.bo);
23102422
amdgpu_bo_unref(&root_bo);
23112423

@@ -2427,6 +2539,7 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
24272539

24282540
root = amdgpu_bo_ref(vm->root.bo);
24292541
amdgpu_bo_reserve(root, true);
2542+
amdgpu_vm_put_task_info(vm->task_info);
24302543
amdgpu_vm_set_pasid(adev, vm, 0);
24312544
dma_fence_wait(vm->last_unlocked, false);
24322545
dma_fence_put(vm->last_unlocked);
@@ -2583,48 +2696,6 @@ int amdgpu_vm_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
25832696
return 0;
25842697
}
25852698

2586-
/**
2587-
* amdgpu_vm_get_task_info - Extracts task info for a PASID.
2588-
*
2589-
* @adev: drm device pointer
2590-
* @pasid: PASID identifier for VM
2591-
* @task_info: task_info to fill.
2592-
*/
2593-
void amdgpu_vm_get_task_info(struct amdgpu_device *adev, u32 pasid,
2594-
struct amdgpu_task_info *task_info)
2595-
{
2596-
struct amdgpu_vm *vm;
2597-
unsigned long flags;
2598-
2599-
xa_lock_irqsave(&adev->vm_manager.pasids, flags);
2600-
2601-
vm = xa_load(&adev->vm_manager.pasids, pasid);
2602-
if (vm)
2603-
*task_info = vm->task_info;
2604-
2605-
xa_unlock_irqrestore(&adev->vm_manager.pasids, flags);
2606-
}
2607-
2608-
/**
2609-
* amdgpu_vm_set_task_info - Sets VMs task info.
2610-
*
2611-
* @vm: vm for which to set the info
2612-
*/
2613-
void amdgpu_vm_set_task_info(struct amdgpu_vm *vm)
2614-
{
2615-
if (vm->task_info.pid)
2616-
return;
2617-
2618-
vm->task_info.pid = current->pid;
2619-
get_task_comm(vm->task_info.task_name, current);
2620-
2621-
if (current->group_leader->mm != current->mm)
2622-
return;
2623-
2624-
vm->task_info.tgid = current->group_leader->pid;
2625-
get_task_comm(vm->task_info.process_name, current->group_leader);
2626-
}
2627-
26282699
/**
26292700
* amdgpu_vm_handle_fault - graceful handling of VM faults.
26302701
* @adev: amdgpu device pointer

drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -203,10 +203,11 @@ struct amdgpu_vm_pte_funcs {
203203
};
204204

205205
struct amdgpu_task_info {
206-
char process_name[TASK_COMM_LEN];
207-
char task_name[TASK_COMM_LEN];
208-
pid_t pid;
209-
pid_t tgid;
206+
char process_name[TASK_COMM_LEN];
207+
char task_name[TASK_COMM_LEN];
208+
pid_t pid;
209+
pid_t tgid;
210+
struct kref refcount;
210211
};
211212

212213
/**
@@ -370,7 +371,7 @@ struct amdgpu_vm {
370371
uint64_t pd_phys_addr;
371372

372373
/* Some basic info about the task */
373-
struct amdgpu_task_info task_info;
374+
struct amdgpu_task_info *task_info;
374375

375376
/* Store positions of group of BOs */
376377
struct ttm_lru_bulk_move lru_bulk_move;
@@ -511,8 +512,14 @@ bool amdgpu_vm_need_pipeline_sync(struct amdgpu_ring *ring,
511512
struct amdgpu_job *job);
512513
void amdgpu_vm_check_compute_bug(struct amdgpu_device *adev);
513514

514-
void amdgpu_vm_get_task_info(struct amdgpu_device *adev, u32 pasid,
515-
struct amdgpu_task_info *task_info);
515+
struct amdgpu_task_info *
516+
amdgpu_vm_get_task_info_pasid(struct amdgpu_device *adev, u32 pasid);
517+
518+
struct amdgpu_task_info *
519+
amdgpu_vm_get_task_info_vm(struct amdgpu_vm *vm);
520+
521+
void amdgpu_vm_put_task_info(struct amdgpu_task_info *task_info);
522+
516523
bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
517524
u32 vmid, u32 node_id, uint64_t addr,
518525
bool write_fault);

drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -973,7 +973,7 @@ int amdgpu_vm_ptes_update(struct amdgpu_vm_update_params *params,
973973
trace_amdgpu_vm_update_ptes(params, frag_start, upd_end,
974974
min(nptes, 32u), dst, incr,
975975
upd_flags,
976-
vm->task_info.tgid,
976+
vm->task_info ? vm->task_info->tgid : 0,
977977
vm->immediate.fence_context);
978978
amdgpu_vm_pte_update_flags(params, to_amdgpu_bo_vm(pt),
979979
cursor.level, pe_start, dst,

drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ static int gmc_v10_0_process_interrupt(struct amdgpu_device *adev,
105105
struct amdgpu_vmhub *hub = &adev->vmhub[vmhub_index];
106106
bool retry_fault = !!(entry->src_data[1] & 0x80);
107107
bool write_fault = !!(entry->src_data[1] & 0x20);
108-
struct amdgpu_task_info task_info;
108+
struct amdgpu_task_info *task_info;
109109
uint32_t status = 0;
110110
u64 addr;
111111

@@ -157,18 +157,22 @@ static int gmc_v10_0_process_interrupt(struct amdgpu_device *adev,
157157
if (!printk_ratelimit())
158158
return 0;
159159

160-
memset(&task_info, 0, sizeof(struct amdgpu_task_info));
161-
amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
162-
163160
dev_err(adev->dev,
164-
"[%s] page fault (src_id:%u ring:%u vmid:%u pasid:%u, for process %s pid %d thread %s pid %d)\n",
161+
"[%s] page fault (src_id:%u ring:%u vmid:%u pasid:%u)\n",
165162
entry->vmid_src ? "mmhub" : "gfxhub",
166-
entry->src_id, entry->ring_id, entry->vmid,
167-
entry->pasid, task_info.process_name, task_info.tgid,
168-
task_info.task_name, task_info.pid);
163+
entry->src_id, entry->ring_id, entry->vmid, entry->pasid);
164+
task_info = amdgpu_vm_get_task_info_pasid(adev, entry->pasid);
165+
if (task_info) {
166+
dev_err(adev->dev,
167+
" in process %s pid %d thread %s pid %d\n",
168+
task_info->process_name, task_info->tgid,
169+
task_info->task_name, task_info->pid);
170+
amdgpu_vm_put_task_info(task_info);
171+
}
172+
169173
dev_err(adev->dev, " in page starting at address 0x%016llx from client 0x%x (%s)\n",
170-
addr, entry->client_id,
171-
soc15_ih_clientid_name[entry->client_id]);
174+
addr, entry->client_id,
175+
soc15_ih_clientid_name[entry->client_id]);
172176

173177
if (!amdgpu_sriov_vf(adev))
174178
hub->vmhub_funcs->print_l2_protection_fault_status(adev,

0 commit comments

Comments
 (0)