Skip to content

Commit 183bcca

Browse files
committed
drm: Create a task info option for wedge events
When a device gets wedged, it might be caused by a guilty application. For userspace, knowing which task was involved can be useful in some situations, such as implementing a policy, logging, or giving the compositor a chance to let the user know which task was involved in the problem. This is an optional argument; when the task info is not available, the PID and TASK strings won't appear in the event string. Sometimes just the PID isn't enough, given that the task might already be dead by the time userspace tries to check what this PID's name was, so to make life easier, also report the task's name in the user event. Acked-by: Rodrigo Vivi <[email protected]> Reviewed-by: Krzysztof Karas <[email protected]> Reviewed-by: Raag Jadav <[email protected]> Acked-by: Christian König <[email protected]> Link: https://lore.kernel.org/r/[email protected] Signed-off-by: André Almeida <[email protected]>
1 parent 3bfd1af commit 183bcca

File tree

7 files changed

+34
-9
lines changed

7 files changed

+34
-9
lines changed

drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6364,7 +6364,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
63646364
atomic_set(&adev->reset_domain->reset_res, r);
63656365

63666366
if (!r)
6367-
drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE);
6367+
drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, NULL);
63686368

63696369
return r;
63706370
}

drivers/gpu/drm/amd/amdgpu/amdgpu_job.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -164,7 +164,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
164164
if (amdgpu_ring_sched_ready(ring))
165165
drm_sched_start(&ring->sched, 0);
166166
dev_err(adev->dev, "Ring %s reset succeeded\n", ring->sched.name);
167-
drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE);
167+
drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, NULL);
168168
goto exit;
169169
}
170170
dev_err(adev->dev, "Ring %s reset failure\n", ring->sched.name);

drivers/gpu/drm/drm_drv.c

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
#include <linux/moduleparam.h>
3636
#include <linux/mount.h>
3737
#include <linux/pseudo_fs.h>
38+
#include <linux/sched.h>
3839
#include <linux/slab.h>
3940
#include <linux/sprintf.h>
4041
#include <linux/srcu.h>
@@ -539,10 +540,15 @@ static const char *drm_get_wedge_recovery(unsigned int opt)
539540
}
540541
}
541542

543+
#define WEDGE_STR_LEN 32
544+
#define PID_STR_LEN 15
545+
#define COMM_STR_LEN (TASK_COMM_LEN + 5)
546+
542547
/**
543548
* drm_dev_wedged_event - generate a device wedged uevent
544549
* @dev: DRM device
545550
* @method: method(s) to be used for recovery
551+
* @info: optional information about the guilty task
546552
*
547553
* This generates a device wedged uevent for the DRM device specified by @dev.
548554
* Recovery @method\(s) of choice will be sent in the uevent environment as
@@ -555,13 +561,13 @@ static const char *drm_get_wedge_recovery(unsigned int opt)
555561
*
556562
* Returns: 0 on success, negative error code otherwise.
557563
*/
558-
int drm_dev_wedged_event(struct drm_device *dev, unsigned long method)
564+
int drm_dev_wedged_event(struct drm_device *dev, unsigned long method,
565+
struct drm_wedge_task_info *info)
559566
{
567+
char event_string[WEDGE_STR_LEN], pid_string[PID_STR_LEN], comm_string[COMM_STR_LEN];
568+
char *envp[] = { event_string, NULL, NULL, NULL };
560569
const char *recovery = NULL;
561570
unsigned int len, opt;
562-
/* Event string length up to 28+ characters with available methods */
563-
char event_string[32];
564-
char *envp[] = { event_string, NULL };
565571

566572
len = scnprintf(event_string, sizeof(event_string), "%s", "WEDGED=");
567573

@@ -583,6 +589,13 @@ int drm_dev_wedged_event(struct drm_device *dev, unsigned long method)
583589
drm_info(dev, "device wedged, %s\n", method == DRM_WEDGE_RECOVERY_NONE ?
584590
"but recovered through reset" : "needs recovery");
585591

592+
if (info && (info->comm[0] != '\0') && (info->pid >= 0)) {
593+
snprintf(pid_string, sizeof(pid_string), "PID=%u", info->pid);
594+
snprintf(comm_string, sizeof(comm_string), "TASK=%s", info->comm);
595+
envp[1] = pid_string;
596+
envp[2] = comm_string;
597+
}
598+
586599
return kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE, envp);
587600
}
588601
EXPORT_SYMBOL(drm_dev_wedged_event);

drivers/gpu/drm/i915/gt/intel_reset.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1448,7 +1448,8 @@ static void intel_gt_reset_global(struct intel_gt *gt,
14481448
kobject_uevent_env(kobj, KOBJ_CHANGE, reset_done_event);
14491449
else
14501450
drm_dev_wedged_event(&gt->i915->drm,
1451-
DRM_WEDGE_RECOVERY_REBIND | DRM_WEDGE_RECOVERY_BUS_RESET);
1451+
DRM_WEDGE_RECOVERY_REBIND | DRM_WEDGE_RECOVERY_BUS_RESET,
1452+
NULL);
14521453
}
14531454

14541455
/**

drivers/gpu/drm/xe/xe_device.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1168,7 +1168,8 @@ void xe_device_declare_wedged(struct xe_device *xe)
11681168

11691169
/* Notify userspace of wedged device */
11701170
drm_dev_wedged_event(&xe->drm,
1171-
DRM_WEDGE_RECOVERY_REBIND | DRM_WEDGE_RECOVERY_BUS_RESET);
1171+
DRM_WEDGE_RECOVERY_REBIND | DRM_WEDGE_RECOVERY_BUS_RESET,
1172+
NULL);
11721173
}
11731174

11741175
for_each_gt(gt, xe, id)

include/drm/drm_device.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
#include <linux/kref.h>
66
#include <linux/mutex.h>
77
#include <linux/idr.h>
8+
#include <linux/sched.h>
89

910
#include <drm/drm_mode_config.h>
1011

@@ -30,6 +31,14 @@ struct pci_controller;
3031
#define DRM_WEDGE_RECOVERY_REBIND BIT(1) /* unbind + bind driver */
3132
#define DRM_WEDGE_RECOVERY_BUS_RESET BIT(2) /* unbind + reset bus device + bind */
3233

34+
/**
35+
* struct drm_wedge_task_info - information about the guilty task of a wedge dev
36+
*/
37+
struct drm_wedge_task_info {
38+
pid_t pid;
39+
char comm[TASK_COMM_LEN];
40+
};
41+
3342
/**
3443
* enum switch_power_state - power state of drm device
3544
*/

include/drm/drm_drv.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -487,7 +487,8 @@ void drm_put_dev(struct drm_device *dev);
487487
bool drm_dev_enter(struct drm_device *dev, int *idx);
488488
void drm_dev_exit(int idx);
489489
void drm_dev_unplug(struct drm_device *dev);
490-
int drm_dev_wedged_event(struct drm_device *dev, unsigned long method);
490+
int drm_dev_wedged_event(struct drm_device *dev, unsigned long method,
491+
struct drm_wedge_task_info *info);
491492

492493
/**
493494
* drm_dev_is_unplugged - is a DRM device unplugged

0 commit comments

Comments
 (0)