Skip to content

Commit bfbe174

Browse files
committed
Merge tag 'amd-drm-next-5.8-2020-05-19' of git://people.freedesktop.org/~agd5f/linux into drm-next
amd-drm-next-5.8-2020-05-19:

amdgpu:
- Improved handling for CTF (Critical Thermal Fault) situations
- Clarify AC/DC mode switches
- SR-IOV fixes
- XGMI fixes for RAS
- Misc cleanups
- Add autodump debugfs node to aid in GPU hang debugging

UAPI:
- Add a MEM_SYNC IB flag for handling proper acquire memory semantics if UMDs expect the kernel to handle this
  Used by AMDVLK: https://github.com/GPUOpen-Drivers/pal/blob/dev/src/core/os/amdgpu/amdgpuQueue.cpp#L1262

Signed-off-by: Dave Airlie <[email protected]>
From: Alex Deucher <[email protected]>
Link: https://patchwork.freedesktop.org/patch/msgid/[email protected]
2 parents 1493bdd + 43c8546 commit bfbe174

32 files changed

+775
-313
lines changed

drivers/gpu/drm/amd/amdgpu/amdgpu.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -989,6 +989,8 @@ struct amdgpu_device {
989989
char product_number[16];
990990
char product_name[32];
991991
char serial[16];
992+
993+
struct amdgpu_autodump autodump;
992994
};
993995

994996
static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)

drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c

Lines changed: 77 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
#include <linux/pci.h>
2828
#include <linux/uaccess.h>
2929
#include <linux/pm_runtime.h>
30-
30+
#include <linux/poll.h>
3131
#include <drm/drm_debugfs.h>
3232

3333
#include "amdgpu.h"
@@ -74,8 +74,82 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
7474
return 0;
7575
}
7676

77+
/**
 * amdgpu_debugfs_wait_dump - wake autodump pollers and wait for the dump
 * @adev: amdgpu device
 *
 * Wakes every waiter sleeping on autodump.gpu_hang, then blocks
 * (interruptibly, for at most 600 seconds) until the autodump.dumping
 * completion is signalled. When CONFIG_DEBUG_FS is not set this function
 * does nothing.
 *
 * Return: -ETIMEDOUT if the wait timed out, 0 in every other case
 * (including a wait that was interrupted by a signal).
 */
int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev)
{
#if defined(CONFIG_DEBUG_FS)
	unsigned long timeout = 600 * HZ;
	/*
	 * wait_for_completion_interruptible_timeout() returns a long
	 * (remaining jiffies, 0 on timeout, negative errno on interruption);
	 * keep the full width instead of truncating it to int.
	 */
	long ret;

	wake_up_interruptible(&adev->autodump.gpu_hang);

	ret = wait_for_completion_interruptible_timeout(&adev->autodump.dumping, timeout);
	if (ret == 0) {
		pr_err("autodump: timeout, move on to gpu recovery\n");
		return -ETIMEDOUT;
	}
	/* An interrupted wait (ret < 0) deliberately falls through to 0. */
#endif
	return 0;
}
93+
7794
#if defined(CONFIG_DEBUG_FS)
7895

96+
static int amdgpu_debugfs_autodump_open(struct inode *inode, struct file *file)
97+
{
98+
struct amdgpu_device *adev = inode->i_private;
99+
int ret;
100+
101+
file->private_data = adev;
102+
103+
mutex_lock(&adev->lock_reset);
104+
if (adev->autodump.dumping.done) {
105+
reinit_completion(&adev->autodump.dumping);
106+
ret = 0;
107+
} else {
108+
ret = -EBUSY;
109+
}
110+
mutex_unlock(&adev->lock_reset);
111+
112+
return ret;
113+
}
114+
115+
static int amdgpu_debugfs_autodump_release(struct inode *inode, struct file *file)
116+
{
117+
struct amdgpu_device *adev = file->private_data;
118+
119+
complete_all(&adev->autodump.dumping);
120+
return 0;
121+
}
122+
123+
static unsigned int amdgpu_debugfs_autodump_poll(struct file *file, struct poll_table_struct *poll_table)
124+
{
125+
struct amdgpu_device *adev = file->private_data;
126+
127+
poll_wait(file, &adev->autodump.gpu_hang, poll_table);
128+
129+
if (adev->in_gpu_reset)
130+
return POLLIN | POLLRDNORM | POLLWRNORM;
131+
132+
return 0;
133+
}
134+
135+
static const struct file_operations autodump_debug_fops = {
136+
.owner = THIS_MODULE,
137+
.open = amdgpu_debugfs_autodump_open,
138+
.poll = amdgpu_debugfs_autodump_poll,
139+
.release = amdgpu_debugfs_autodump_release,
140+
};
141+
142+
static void amdgpu_debugfs_autodump_init(struct amdgpu_device *adev)
143+
{
144+
init_completion(&adev->autodump.dumping);
145+
complete_all(&adev->autodump.dumping);
146+
init_waitqueue_head(&adev->autodump.gpu_hang);
147+
148+
debugfs_create_file("amdgpu_autodump", 0600,
149+
adev->ddev->primary->debugfs_root,
150+
adev, &autodump_debug_fops);
151+
}
152+
79153
/**
80154
* amdgpu_debugfs_process_reg_op - Handle MMIO register reads/writes
81155
*
@@ -1434,6 +1508,8 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
14341508

14351509
amdgpu_ras_debugfs_create_all(adev);
14361510

1511+
amdgpu_debugfs_autodump_init(adev);
1512+
14371513
return amdgpu_debugfs_add_files(adev, amdgpu_debugfs_list,
14381514
ARRAY_SIZE(amdgpu_debugfs_list));
14391515
}

drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,11 @@ struct amdgpu_debugfs {
3131
unsigned num_files;
3232
};
3333

34+
struct amdgpu_autodump {
35+
struct completion dumping;
36+
struct wait_queue_head gpu_hang;
37+
};
38+
3439
int amdgpu_debugfs_regs_init(struct amdgpu_device *adev);
3540
int amdgpu_debugfs_init(struct amdgpu_device *adev);
3641
void amdgpu_debugfs_fini(struct amdgpu_device *adev);
@@ -40,3 +45,4 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
4045
int amdgpu_debugfs_fence_init(struct amdgpu_device *adev);
4146
int amdgpu_debugfs_firmware_init(struct amdgpu_device *adev);
4247
int amdgpu_debugfs_gem_init(struct amdgpu_device *adev);
48+
int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev);

drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3927,6 +3927,8 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
39273927
int i, r = 0;
39283928
bool need_full_reset = *need_full_reset_arg;
39293929

3930+
amdgpu_debugfs_wait_dump(adev);
3931+
39303932
/* block all schedulers and reset given job's ring */
39313933
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
39323934
struct amdgpu_ring *ring = adev->rings[i];

drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.c

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1188,3 +1188,13 @@ int amdgpu_dpm_set_df_cstate(struct amdgpu_device *adev,
11881188

11891189
return ret;
11901190
}
1191+
1192+
/*
 * Forward an XGMI power-down allow/disallow request to the software SMU.
 *
 * Returns the result of smu_allow_xgmi_power_down() when
 * is_support_sw_smu() reports true for @adev, and 0 without doing anything
 * otherwise.
 */
int amdgpu_dpm_allow_xgmi_power_down(struct amdgpu_device *adev, bool en)
{
	if (!is_support_sw_smu(adev))
		return 0;

	return smu_allow_xgmi_power_down(&adev->smu, en);
}

drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -538,4 +538,6 @@ int amdgpu_dpm_baco_enter(struct amdgpu_device *adev);
538538
int amdgpu_dpm_set_df_cstate(struct amdgpu_device *adev,
539539
uint32_t cstate);
540540

541+
int amdgpu_dpm_allow_xgmi_power_down(struct amdgpu_device *adev, bool en);
542+
541543
#endif

drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,9 +86,10 @@
8686
* - 3.35.0 - Add drm_amdgpu_info_device::tcc_disabled_mask
8787
* - 3.36.0 - Allow reading more status registers on si/cik
8888
* - 3.37.0 - L2 is invalidated before SDMA IBs, needed for correctness
89+
* - 3.38.0 - Add AMDGPU_IB_FLAG_EMIT_MEM_SYNC
8990
*/
9091
#define KMS_DRIVER_MAJOR 3
91-
#define KMS_DRIVER_MINOR 37
92+
#define KMS_DRIVER_MINOR 38
9293
#define KMS_DRIVER_PATCHLEVEL 0
9394

9495
int amdgpu_vram_limit = 0;

drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -189,6 +189,9 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
189189
dma_fence_put(tmp);
190190
}
191191

192+
if ((ib->flags & AMDGPU_IB_FLAG_EMIT_MEM_SYNC) && ring->funcs->emit_mem_sync)
193+
ring->funcs->emit_mem_sync(ring);
194+
192195
if (ring->funcs->insert_start)
193196
ring->funcs->insert_start(ring);
194197

0 commit comments

Comments
 (0)