Skip to content

Commit 1b00143

Browse files
PhilipYangAalexdeucher
authored andcommitted
drm/amdgpu: Optimize gfx v9 GPU page fault handling
After a GPU page fault, many page fault interrupts are generated within a short period, even with the CAM filter enabled, because the fault addresses differ. Each page fault is copied to the KFD ih fifo so that the KFD interrupt worker can send an event to user space; this can overflow the KFD ih fifo while other processes generate events at the same time. A KFD process is aborted after a GPU page fault, so only one GPU page fault interrupt needs to be sent to the KFD ih fifo to deliver the memory exception event to user space. Increase the KFD ih fifo size to 2 times the IH primary ring size to handle bursts of events. This patch handles the gfx v9 path and covers the retry on/off and CAM filter on/off cases. Signed-off-by: Philip Yang <[email protected]> Reviewed-by: Felix Kuehling <[email protected]> Signed-off-by: Alex Deucher <[email protected]>
1 parent f607b2b commit 1b00143

File tree

5 files changed

+84
-1
lines changed

5 files changed

+84
-1
lines changed

drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -433,6 +433,9 @@ void kgd2kfd_unlock_kfd(void);
433433
int kgd2kfd_start_sched(struct kfd_dev *kfd, uint32_t node_id);
434434
int kgd2kfd_stop_sched(struct kfd_dev *kfd, uint32_t node_id);
435435
bool kgd2kfd_compute_active(struct kfd_dev *kfd, uint32_t node_id);
436+
bool kgd2kfd_vmfault_fast_path(struct amdgpu_device *adev, struct amdgpu_iv_entry *entry,
437+
bool retry_fault);
438+
436439
#else
437440
static inline int kgd2kfd_init(void)
438441
{
@@ -518,5 +521,12 @@ static inline bool kgd2kfd_compute_active(struct kfd_dev *kfd, uint32_t node_id)
518521
{
519522
return false;
520523
}
524+
525+
/*
 * Stub used in the #else branch of the surrounding conditional (presumably
 * when KFD support is not built in - confirm against the #if condition).
 * Always returns false, i.e. the fault is never handled on the fast path
 * and the caller continues with its normal page fault handling.
 */
static inline bool kgd2kfd_vmfault_fast_path(struct amdgpu_device *adev, struct amdgpu_iv_entry *entry,
526+
bool retry_fault)
527+
{
528+
return false;
529+
}
530+
521531
#endif
522532
#endif /* AMDGPU_AMDKFD_H_INCLUDED */

drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -623,6 +623,9 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
623623
}
624624
}
625625

626+
if (kgd2kfd_vmfault_fast_path(adev, entry, retry_fault))
627+
return 1;
628+
626629
if (!printk_ratelimit())
627630
return 0;
628631

drivers/gpu/drm/amd/amdkfd/kfd_device.c

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1521,6 +1521,73 @@ bool kgd2kfd_compute_active(struct kfd_dev *kfd, uint32_t node_id)
15211521
return kfd_compute_active(node);
15221522
}
15231523

1524+
/**
1525+
* kgd2kfd_vmfault_fast_path() - KFD vm page fault interrupt handling fast path for gmc v9
1526+
* @adev: amdgpu device
1527+
* @entry: vm fault interrupt vector
1528+
* @retry_fault: if this is retry fault
1529+
*
1530+
* retry fault -
1531+
* with CAM enabled, adev primary ring
1532+
* | gmc_v9_0_process_interrupt()
1533+
* adev soft_ring
1534+
* | gmc_v9_0_process_interrupt() worker failed to recover page fault
1535+
* KFD node ih_fifo
1536+
* | KFD interrupt_wq worker
1537+
* kfd_signal_vm_fault_event
1538+
*
1539+
* without CAM, adev primary ring1
1540+
* | gmc_v9_0_process_interrupt worker failed to recover page fault
1541+
* KFD node ih_fifo
1542+
* | KFD interrupt_wq worker
1543+
* kfd_signal_vm_fault_event
1544+
*
1545+
* no-retry fault -
1546+
* adev primary ring
1547+
* | gmc_v9_0_process_interrupt()
1548+
* KFD node ih_fifo
1549+
* | KFD interrupt_wq worker
1550+
* kfd_signal_vm_fault_event
1551+
*
1552+
* fast path - After kfd_signal_vm_fault_event, gmc_v9_0_process_interrupt drops further page
1553+
* faults of the same process and does not copy the interrupt to the KFD node ih_fifo.
1554+
* With the gdb debugger enabled, the retry fault must be converted to a no-retry fault
1555+
* for the debugger, so the fast path cannot be used.
1556+
*
1557+
* Return:
1558+
* true - use the fast path to handle this fault
1559+
* false - use normal path to handle it
1560+
*/
1561+
bool kgd2kfd_vmfault_fast_path(struct amdgpu_device *adev, struct amdgpu_iv_entry *entry,
1562+
bool retry_fault)
1563+
{
1564+
struct kfd_process *p;
1565+
u32 cam_index;
1566+
1567+
/*
 * Only entries delivered through the soft IH ring or ih1 are candidates
 * for the fast path; entries from any other ring fall through to the
 * final "return false" and take the normal path.
 */
if (entry->ih == &adev->irq.ih_soft || entry->ih == &adev->irq.ih1) {
1568+
p = kfd_lookup_process_by_pasid(entry->pasid);
1569+
/* No KFD process found for this PASID: report the fault as handled (dropped). */
if (!p)
1570+
return true;
1571+
1572+
/*
 * A fault was already recorded for this process and debug trap is not
 * enabled: drop this fault instead of forwarding it again.
 */
if (p->gpu_page_fault && !p->debug_trap_enabled) {
1573+
/*
 * For a retry fault with the retry CAM enabled, write the low 10 bits
 * of src_data[2] to the retry CAM doorbell.
 * NOTE(review): presumably this acks/frees the CAM entry for the
 * dropped fault - confirm against the IH/CAM documentation.
 */
if (retry_fault && adev->irq.retry_cam_enabled) {
1574+
cam_index = entry->src_data[2] & 0x3ff;
1575+
WDOORBELL32(adev->irq.retry_cam_doorbell_index, cam_index);
1576+
}
1577+
1578+
/* Presumably balances the reference taken by kfd_lookup_process_by_pasid(). */
kfd_unref_process(p);
1579+
return true;
1580+
}
1581+
1582+
/*
1583+
* This is the first page fault, set flag and then signal user space
1584+
*/
1585+
p->gpu_page_fault = true;
1586+
kfd_unref_process(p);
1587+
}
1588+
/* Normal path: the caller continues its regular page fault handling. */
return false;
1589+
}
1590+
15241591
#if defined(CONFIG_DEBUG_FS)
15251592

15261593
/* This function will send a package to HIQ to hang the HWS

drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@
4646
#include <linux/kfifo.h>
4747
#include "kfd_priv.h"
4848

49-
#define KFD_IH_NUM_ENTRIES 8192
49+
#define KFD_IH_NUM_ENTRIES 16384
5050

5151
static void interrupt_wq(struct work_struct *);
5252

drivers/gpu/drm/amd/amdkfd/kfd_priv.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1003,6 +1003,9 @@ struct kfd_process {
10031003
struct semaphore runtime_enable_sema;
10041004
bool is_runtime_retry;
10051005
struct kfd_runtime_info runtime_info;
1006+
1007+
/* if gpu page fault sent to KFD */
1008+
bool gpu_page_fault;
10061009
};
10071010

10081011
#define KFD_PROCESS_TABLE_SIZE 5 /* bits: 32 entries */

0 commit comments

Comments
 (0)