Skip to content

Commit b1fb6b8

Browse files
committed
Merge tag 'amd-drm-fixes-6.0-2022-08-17' of https://gitlab.freedesktop.org/agd5f/linux into drm-fixes
amd-drm-fixes-6.0-2022-08-17: amdgpu: - Revert some DML stack changes - Rounding fixes in KFD allocations - atombios vram info table parsing fix - DCN 3.1.4 fixes - Clockgating fixes for various new IPs - SMU 13.0.4 fixes - DCN 3.1.4 FP fixes - TMDS fixes for YCbCr420 4k modes - DCN 3.2.x fixes - USB 4 fixes - SMU 13.0 fixes - SMU driver unload memory leak fixes - Display orientation fix - Regression fix for generic fbdev conversion - SDMA 6.x fixes - SR-IOV fixes - IH 6.x fixes - Use after free fix in bo list handling - Revert pipe1 support - XGMI hive reset fix amdkfd: - Fix potential crash in kfd_create_indirect_link_prop() Signed-off-by: Dave Airlie <[email protected]> From: Alex Deucher <[email protected]> Link: https://patchwork.freedesktop.org/patch/msgid/[email protected]
2 parents 2ae6ab9 + 085292c commit b1fb6b8

File tree

108 files changed

+1758
-1184
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

108 files changed

+1758
-1184
lines changed

drivers/gpu/drm/amd/amdgpu/aldebaran.c

Lines changed: 14 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -148,38 +148,30 @@ aldebaran_mode2_perform_reset(struct amdgpu_reset_control *reset_ctl,
148148
struct amdgpu_reset_context *reset_context)
149149
{
150150
struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle;
151+
struct list_head *reset_device_list = reset_context->reset_device_list;
151152
struct amdgpu_device *tmp_adev = NULL;
152-
struct list_head reset_device_list;
153153
int r = 0;
154154

155155
dev_dbg(adev->dev, "aldebaran perform hw reset\n");
156+
157+
if (reset_device_list == NULL)
158+
return -EINVAL;
159+
156160
if (adev->ip_versions[MP1_HWIP][0] == IP_VERSION(13, 0, 2) &&
157161
reset_context->hive == NULL) {
158162
/* Wrong context, return error */
159163
return -EINVAL;
160164
}
161165

162-
INIT_LIST_HEAD(&reset_device_list);
163-
if (reset_context->hive) {
164-
list_for_each_entry (tmp_adev,
165-
&reset_context->hive->device_list,
166-
gmc.xgmi.head)
167-
list_add_tail(&tmp_adev->reset_list,
168-
&reset_device_list);
169-
} else {
170-
list_add_tail(&reset_context->reset_req_dev->reset_list,
171-
&reset_device_list);
172-
}
173-
174-
list_for_each_entry (tmp_adev, &reset_device_list, reset_list) {
166+
list_for_each_entry(tmp_adev, reset_device_list, reset_list) {
175167
mutex_lock(&tmp_adev->reset_cntl->reset_lock);
176168
tmp_adev->reset_cntl->active_reset = AMD_RESET_METHOD_MODE2;
177169
}
178170
/*
179171
* Mode2 reset doesn't need any sync between nodes in XGMI hive, instead launch
180172
* them together so that they can be completed asynchronously on multiple nodes
181173
*/
182-
list_for_each_entry (tmp_adev, &reset_device_list, reset_list) {
174+
list_for_each_entry(tmp_adev, reset_device_list, reset_list) {
183175
/* For XGMI run all resets in parallel to speed up the process */
184176
if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
185177
if (!queue_work(system_unbound_wq,
@@ -197,7 +189,7 @@ aldebaran_mode2_perform_reset(struct amdgpu_reset_control *reset_ctl,
197189

198190
/* For XGMI wait for all resets to complete before proceed */
199191
if (!r) {
200-
list_for_each_entry (tmp_adev, &reset_device_list, reset_list) {
192+
list_for_each_entry(tmp_adev, reset_device_list, reset_list) {
201193
if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
202194
flush_work(&tmp_adev->reset_cntl->reset_work);
203195
r = tmp_adev->asic_reset_res;
@@ -207,7 +199,7 @@ aldebaran_mode2_perform_reset(struct amdgpu_reset_control *reset_ctl,
207199
}
208200
}
209201

210-
list_for_each_entry (tmp_adev, &reset_device_list, reset_list) {
202+
list_for_each_entry(tmp_adev, reset_device_list, reset_list) {
211203
mutex_unlock(&tmp_adev->reset_cntl->reset_lock);
212204
tmp_adev->reset_cntl->active_reset = AMD_RESET_METHOD_NONE;
213205
}
@@ -339,30 +331,21 @@ static int
339331
aldebaran_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl,
340332
struct amdgpu_reset_context *reset_context)
341333
{
334+
struct list_head *reset_device_list = reset_context->reset_device_list;
342335
struct amdgpu_device *tmp_adev = NULL;
343-
struct list_head reset_device_list;
344336
int r;
345337

338+
if (reset_device_list == NULL)
339+
return -EINVAL;
340+
346341
if (reset_context->reset_req_dev->ip_versions[MP1_HWIP][0] ==
347342
IP_VERSION(13, 0, 2) &&
348343
reset_context->hive == NULL) {
349344
/* Wrong context, return error */
350345
return -EINVAL;
351346
}
352347

353-
INIT_LIST_HEAD(&reset_device_list);
354-
if (reset_context->hive) {
355-
list_for_each_entry (tmp_adev,
356-
&reset_context->hive->device_list,
357-
gmc.xgmi.head)
358-
list_add_tail(&tmp_adev->reset_list,
359-
&reset_device_list);
360-
} else {
361-
list_add_tail(&reset_context->reset_req_dev->reset_list,
362-
&reset_device_list);
363-
}
364-
365-
list_for_each_entry (tmp_adev, &reset_device_list, reset_list) {
348+
list_for_each_entry(tmp_adev, reset_device_list, reset_list) {
366349
dev_info(tmp_adev->dev,
367350
"GPU reset succeeded, trying to resume\n");
368351
r = aldebaran_mode2_restore_ip(tmp_adev);

drivers/gpu/drm/amd/amdgpu/amdgpu.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -317,7 +317,7 @@ enum amdgpu_kiq_irq {
317317
AMDGPU_CP_KIQ_IRQ_DRIVER0 = 0,
318318
AMDGPU_CP_KIQ_IRQ_LAST
319319
};
320-
320+
#define SRIOV_USEC_TIMEOUT 1200000 /* wait 12 * 100ms for SRIOV */
321321
#define MAX_KIQ_REG_WAIT 5000 /* in usecs, 5ms */
322322
#define MAX_KIQ_REG_BAILOUT_INTERVAL 5 /* in msecs, 5ms */
323323
#define MAX_KIQ_REG_TRY 1000

drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@ struct amdgpu_amdkfd_fence {
9696
struct amdgpu_kfd_dev {
9797
struct kfd_dev *dev;
9898
uint64_t vram_used;
99+
uint64_t vram_used_aligned;
99100
bool init_complete;
100101
struct work_struct reset_work;
101102
};

drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -40,10 +40,10 @@
4040
#define AMDGPU_USERPTR_RESTORE_DELAY_MS 1
4141

4242
/*
43-
* Align VRAM allocations to 2MB to avoid fragmentation caused by 4K allocations in the tail 2MB
43+
* Align VRAM availability to 2MB to avoid fragmentation caused by 4K allocations in the tail 2MB
4444
* BO chunk
4545
*/
46-
#define VRAM_ALLOCATION_ALIGN (1 << 21)
46+
#define VRAM_AVAILABLITY_ALIGN (1 << 21)
4747

4848
/* Impose limit on how much memory KFD can use */
4949
static struct {
@@ -149,7 +149,7 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
149149
* to avoid fragmentation caused by 4K allocations in the tail
150150
* 2M BO chunk.
151151
*/
152-
vram_needed = ALIGN(size, VRAM_ALLOCATION_ALIGN);
152+
vram_needed = size;
153153
} else if (alloc_flag & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR) {
154154
system_mem_needed = size;
155155
} else if (!(alloc_flag &
@@ -182,8 +182,10 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
182182
*/
183183
WARN_ONCE(vram_needed && !adev,
184184
"adev reference can't be null when vram is used");
185-
if (adev)
185+
if (adev) {
186186
adev->kfd.vram_used += vram_needed;
187+
adev->kfd.vram_used_aligned += ALIGN(vram_needed, VRAM_AVAILABLITY_ALIGN);
188+
}
187189
kfd_mem_limit.system_mem_used += system_mem_needed;
188190
kfd_mem_limit.ttm_mem_used += ttm_mem_needed;
189191

@@ -203,8 +205,10 @@ void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device *adev,
203205
} else if (alloc_flag & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) {
204206
WARN_ONCE(!adev,
205207
"adev reference can't be null when alloc mem flags vram is set");
206-
if (adev)
207-
adev->kfd.vram_used -= ALIGN(size, VRAM_ALLOCATION_ALIGN);
208+
if (adev) {
209+
adev->kfd.vram_used -= size;
210+
adev->kfd.vram_used_aligned -= ALIGN(size, VRAM_AVAILABLITY_ALIGN);
211+
}
208212
} else if (alloc_flag & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR) {
209213
kfd_mem_limit.system_mem_used -= size;
210214
} else if (!(alloc_flag &
@@ -1608,15 +1612,14 @@ size_t amdgpu_amdkfd_get_available_memory(struct amdgpu_device *adev)
16081612
uint64_t reserved_for_pt =
16091613
ESTIMATE_PT_SIZE(amdgpu_amdkfd_total_mem_size);
16101614
size_t available;
1611-
16121615
spin_lock(&kfd_mem_limit.mem_limit_lock);
16131616
available = adev->gmc.real_vram_size
1614-
- adev->kfd.vram_used
1617+
- adev->kfd.vram_used_aligned
16151618
- atomic64_read(&adev->vram_pin_size)
16161619
- reserved_for_pt;
16171620
spin_unlock(&kfd_mem_limit.mem_limit_lock);
16181621

1619-
return ALIGN_DOWN(available, VRAM_ALLOCATION_ALIGN);
1622+
return ALIGN_DOWN(available, VRAM_AVAILABLITY_ALIGN);
16201623
}
16211624

16221625
int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(

drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -314,7 +314,7 @@ amdgpu_atomfirmware_get_vram_info(struct amdgpu_device *adev,
314314
mem_channel_number = vram_info->v30.channel_num;
315315
mem_channel_width = vram_info->v30.channel_width;
316316
if (vram_width)
317-
*vram_width = mem_channel_number * mem_channel_width;
317+
*vram_width = mem_channel_number * (1 << mem_channel_width);
318318
break;
319319
default:
320320
return -EINVAL;

drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -837,16 +837,12 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
837837
continue;
838838

839839
r = amdgpu_vm_bo_update(adev, bo_va, false);
840-
if (r) {
841-
mutex_unlock(&p->bo_list->bo_list_mutex);
840+
if (r)
842841
return r;
843-
}
844842

845843
r = amdgpu_sync_fence(&p->job->sync, bo_va->last_pt_update);
846-
if (r) {
847-
mutex_unlock(&p->bo_list->bo_list_mutex);
844+
if (r)
848845
return r;
849-
}
850846
}
851847

852848
r = amdgpu_vm_handle_moved(adev, vm);

drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1705,7 +1705,7 @@ static ssize_t amdgpu_reset_dump_register_list_write(struct file *f,
17051705
{
17061706
struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
17071707
char reg_offset[11];
1708-
uint32_t *new, *tmp = NULL;
1708+
uint32_t *new = NULL, *tmp = NULL;
17091709
int ret, i = 0, len = 0;
17101710

17111711
do {
@@ -1747,7 +1747,8 @@ static ssize_t amdgpu_reset_dump_register_list_write(struct file *f,
17471747
ret = size;
17481748

17491749
error_free:
1750-
kfree(tmp);
1750+
if (tmp != new)
1751+
kfree(tmp);
17511752
kfree(new);
17521753
return ret;
17531754
}

drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4742,6 +4742,8 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
47424742
tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
47434743
reset_list);
47444744
amdgpu_reset_reg_dumps(tmp_adev);
4745+
4746+
reset_context->reset_device_list = device_list_handle;
47454747
r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
47464748
/* If reset handler not implemented, continue; otherwise return */
47474749
if (r == -ENOSYS)

drivers/gpu/drm/amd/amdgpu/amdgpu_job.c

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -272,10 +272,6 @@ void amdgpu_job_stop_all_jobs_on_sched(struct drm_gpu_scheduler *sched)
272272
/* Signal all jobs not yet scheduled */
273273
for (i = DRM_SCHED_PRIORITY_COUNT - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
274274
struct drm_sched_rq *rq = &sched->sched_rq[i];
275-
276-
if (!rq)
277-
continue;
278-
279275
spin_lock(&rq->lock);
280276
list_for_each_entry(s_entity, &rq->entities, list) {
281277
while ((s_job = to_drm_sched_job(spsc_queue_pop(&s_entity->job_queue)))) {

drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ struct amdgpu_reset_context {
3737
struct amdgpu_device *reset_req_dev;
3838
struct amdgpu_job *job;
3939
struct amdgpu_hive_info *hive;
40+
struct list_head *reset_device_list;
4041
unsigned long flags;
4142
};
4243

0 commit comments

Comments
 (0)