Skip to content

Commit 6ae9e1a

Browse files
mukjoshi authored and alexdeucher committed
drm/amdkfd: Update logic for CU occupancy calculations
Currently, the code uses the IH_VMID_X_LUT register to map a queue's VMID to the corresponding PASID. This logic is racy because the CP can update the VMID-PASID mapping at any time, especially when there are more processes than VMIDs. Update the logic to calculate CU occupancy by matching the doorbell offset of each queue that has a non-zero wave count against the process's queues.

Signed-off-by: Mukul Joshi <[email protected]>
Reviewed-by: Harish Kasiviswanathan <[email protected]>
Signed-off-by: Alex Deucher <[email protected]>
1 parent e1d27f7 commit 6ae9e1a

File tree

6 files changed

+89
-65
lines changed

6 files changed

+89
-65
lines changed

drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c

Lines changed: 42 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -950,28 +950,30 @@ static void unlock_spi_csq_mutexes(struct amdgpu_device *adev)
950950
* @inst: xcc's instance number on a multi-XCC setup
951951
*/
952952
static void get_wave_count(struct amdgpu_device *adev, int queue_idx,
953-
int *wave_cnt, int *vmid, uint32_t inst)
953+
struct kfd_cu_occupancy *queue_cnt, uint32_t inst)
954954
{
955955
int pipe_idx;
956956
int queue_slot;
957957
unsigned int reg_val;
958-
958+
unsigned int wave_cnt;
959959
/*
960960
* Program GRBM with appropriate MEID, PIPEID, QUEUEID and VMID
961961
* parameters to read out waves in flight. Get VMID if there are
962962
* non-zero waves in flight.
963963
*/
964-
*vmid = 0xFF;
965-
*wave_cnt = 0;
966964
pipe_idx = queue_idx / adev->gfx.mec.num_queue_per_pipe;
967965
queue_slot = queue_idx % adev->gfx.mec.num_queue_per_pipe;
968966
soc15_grbm_select(adev, 1, pipe_idx, queue_slot, 0, inst);
969-
reg_val = RREG32_SOC15_IP(GC, SOC15_REG_OFFSET(GC, inst, mmSPI_CSQ_WF_ACTIVE_COUNT_0) +
970-
queue_slot);
971-
*wave_cnt = reg_val & SPI_CSQ_WF_ACTIVE_COUNT_0__COUNT_MASK;
972-
if (*wave_cnt != 0)
973-
*vmid = (RREG32_SOC15(GC, inst, mmCP_HQD_VMID) &
974-
CP_HQD_VMID__VMID_MASK) >> CP_HQD_VMID__VMID__SHIFT;
967+
reg_val = RREG32_SOC15_IP(GC, SOC15_REG_OFFSET(GC, inst,
968+
mmSPI_CSQ_WF_ACTIVE_COUNT_0) + queue_slot);
969+
wave_cnt = reg_val & SPI_CSQ_WF_ACTIVE_COUNT_0__COUNT_MASK;
970+
if (wave_cnt != 0) {
971+
queue_cnt->wave_cnt += wave_cnt;
972+
queue_cnt->doorbell_off =
973+
(RREG32_SOC15(GC, inst, mmCP_HQD_PQ_DOORBELL_CONTROL) &
974+
CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET_MASK) >>
975+
CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET__SHIFT;
976+
}
975977
}
976978

977979
/**
@@ -981,9 +983,8 @@ static void get_wave_count(struct amdgpu_device *adev, int queue_idx,
981983
* or more queues running and submitting waves to compute units.
982984
*
983985
* @adev: Handle of device from which to get number of waves in flight
984-
* @pasid: Identifies the process for which this query call is invoked
985-
* @pasid_wave_cnt: Output parameter updated with number of waves in flight that
986-
* belong to process with given pasid
986+
* @cu_occupancy: Array that gets filled with wave_cnt and doorbell offset
987+
* for comparison later.
987988
* @max_waves_per_cu: Output parameter updated with maximum number of waves
988989
* possible per Compute Unit
989990
* @inst: xcc's instance number on a multi-XCC setup
@@ -1011,30 +1012,24 @@ static void get_wave_count(struct amdgpu_device *adev, int queue_idx,
10111012
* number of waves that are in flight for the queue at specified index. The
10121013
* index ranges from 0 to 7.
10131014
*
1014-
* If non-zero waves are in flight, read CP_HQD_VMID register to obtain VMID
1015-
* of the wave(s).
1015+
* If non-zero waves are in flight, store the corresponding doorbell offset
1016+
* of the queue, along with the wave count.
10161017
*
1017-
* Determine if VMID from above step maps to pasid provided as parameter. If
1018-
* it matches agrregate the wave count. That the VMID will not match pasid is
1019-
* a normal condition i.e. a device is expected to support multiple queues
1020-
* from multiple proceses.
1018+
* Determine if the queue belongs to the process by comparing the doorbell
1019+
* offset against the process's queues. If it matches, aggregate the wave
1020+
* count for the process.
10211021
*
10221022
* Reading registers referenced above involves programming GRBM appropriately
10231023
*/
1024-
void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev, int pasid,
1025-
int *pasid_wave_cnt, int *max_waves_per_cu, uint32_t inst)
1024+
void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev,
1025+
struct kfd_cu_occupancy *cu_occupancy,
1026+
int *max_waves_per_cu, uint32_t inst)
10261027
{
10271028
int qidx;
1028-
int vmid;
10291029
int se_idx;
1030-
int sh_idx;
10311030
int se_cnt;
1032-
int sh_cnt;
1033-
int wave_cnt;
10341031
int queue_map;
1035-
int pasid_tmp;
10361032
int max_queue_cnt;
1037-
int vmid_wave_cnt = 0;
10381033
DECLARE_BITMAP(cp_queue_bitmap, AMDGPU_MAX_QUEUES);
10391034

10401035
lock_spi_csq_mutexes(adev);
@@ -1048,42 +1043,30 @@ void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev, int pasid,
10481043
AMDGPU_MAX_QUEUES);
10491044
max_queue_cnt = adev->gfx.mec.num_pipe_per_mec *
10501045
adev->gfx.mec.num_queue_per_pipe;
1051-
sh_cnt = adev->gfx.config.max_sh_per_se;
10521046
se_cnt = adev->gfx.config.max_shader_engines;
10531047
for (se_idx = 0; se_idx < se_cnt; se_idx++) {
1054-
for (sh_idx = 0; sh_idx < sh_cnt; sh_idx++) {
1048+
amdgpu_gfx_select_se_sh(adev, se_idx, 0, 0xffffffff, inst);
1049+
queue_map = RREG32_SOC15(GC, inst, mmSPI_CSQ_WF_ACTIVE_STATUS);
1050+
1051+
/*
1052+
* Assumption: queue map encodes following schema: four
1053+
* pipes per each micro-engine, with each pipe mapping
1054+
* eight queues. This schema is true for GFX9 devices
1055+
* and must be verified for newer device families
1056+
*/
1057+
for (qidx = 0; qidx < max_queue_cnt; qidx++) {
1058+
/* Skip qeueus that are not associated with
1059+
* compute functions
1060+
*/
1061+
if (!test_bit(qidx, cp_queue_bitmap))
1062+
continue;
10551063

1056-
amdgpu_gfx_select_se_sh(adev, se_idx, sh_idx, 0xffffffff, inst);
1057-
queue_map = RREG32_SOC15(GC, inst, mmSPI_CSQ_WF_ACTIVE_STATUS);
1064+
if (!(queue_map & (1 << qidx)))
1065+
continue;
10581066

1059-
/*
1060-
* Assumption: queue map encodes following schema: four
1061-
* pipes per each micro-engine, with each pipe mapping
1062-
* eight queues. This schema is true for GFX9 devices
1063-
* and must be verified for newer device families
1064-
*/
1065-
for (qidx = 0; qidx < max_queue_cnt; qidx++) {
1066-
1067-
/* Skip qeueus that are not associated with
1068-
* compute functions
1069-
*/
1070-
if (!test_bit(qidx, cp_queue_bitmap))
1071-
continue;
1072-
1073-
if (!(queue_map & (1 << qidx)))
1074-
continue;
1075-
1076-
/* Get number of waves in flight and aggregate them */
1077-
get_wave_count(adev, qidx, &wave_cnt, &vmid,
1078-
inst);
1079-
if (wave_cnt != 0) {
1080-
pasid_tmp =
1081-
RREG32(SOC15_REG_OFFSET(OSSSYS, inst,
1082-
mmIH_VMID_0_LUT) + vmid);
1083-
if (pasid_tmp == pasid)
1084-
vmid_wave_cnt += wave_cnt;
1085-
}
1086-
}
1067+
/* Get number of waves in flight and aggregate them */
1068+
get_wave_count(adev, qidx, &cu_occupancy[qidx],
1069+
inst);
10871070
}
10881071
}
10891072

@@ -1092,7 +1075,6 @@ void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev, int pasid,
10921075
unlock_spi_csq_mutexes(adev);
10931076

10941077
/* Update the output parameters and return */
1095-
*pasid_wave_cnt = vmid_wave_cnt;
10961078
*max_waves_per_cu = adev->gfx.cu_info.simd_per_cu *
10971079
adev->gfx.cu_info.max_waves_per_simd;
10981080
}

drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,8 +52,9 @@ bool kgd_gfx_v9_get_atc_vmid_pasid_mapping_info(struct amdgpu_device *adev,
5252
uint8_t vmid, uint16_t *p_pasid);
5353
void kgd_gfx_v9_set_vm_context_page_table_base(struct amdgpu_device *adev,
5454
uint32_t vmid, uint64_t page_table_base);
55-
void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev, int pasid,
56-
int *pasid_wave_cnt, int *max_waves_per_cu, uint32_t inst);
55+
void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev,
56+
struct kfd_cu_occupancy *cu_occupancy,
57+
int *max_waves_per_cu, uint32_t inst);
5758
void kgd_gfx_v9_program_trap_handler_settings(struct amdgpu_device *adev,
5859
uint32_t vmid, uint64_t tba_addr, uint64_t tma_addr,
5960
uint32_t inst);

drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3540,6 +3540,26 @@ int debug_refresh_runlist(struct device_queue_manager *dqm)
35403540
return debug_map_and_unlock(dqm);
35413541
}
35423542

3543+
bool kfd_dqm_is_queue_in_process(struct device_queue_manager *dqm,
3544+
struct qcm_process_device *qpd,
3545+
int doorbell_off)
3546+
{
3547+
struct queue *q;
3548+
bool r = false;
3549+
3550+
dqm_lock(dqm);
3551+
3552+
list_for_each_entry(q, &qpd->queues_list, list) {
3553+
if (q->properties.doorbell_off == doorbell_off) {
3554+
r = true;
3555+
goto out;
3556+
}
3557+
}
3558+
3559+
out:
3560+
dqm_unlock(dqm);
3561+
return r;
3562+
}
35433563
#if defined(CONFIG_DEBUG_FS)
35443564

35453565
static void seq_reg_dump(struct seq_file *m,

drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -324,6 +324,9 @@ void set_queue_snapshot_entry(struct queue *q,
324324
int debug_lock_and_unmap(struct device_queue_manager *dqm);
325325
int debug_map_and_unlock(struct device_queue_manager *dqm);
326326
int debug_refresh_runlist(struct device_queue_manager *dqm);
327+
bool kfd_dqm_is_queue_in_process(struct device_queue_manager *dqm,
328+
struct qcm_process_device *qpd,
329+
int doorbell_off);
327330

328331
static inline unsigned int get_sh_mem_bases_32(struct kfd_process_device *pdd)
329332
{

drivers/gpu/drm/amd/amdkfd/kfd_process.c

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -270,6 +270,10 @@ static int kfd_get_cu_occupancy(struct attribute *attr, char *buffer)
270270
struct kfd_node *dev = NULL;
271271
struct kfd_process *proc = NULL;
272272
struct kfd_process_device *pdd = NULL;
273+
int i;
274+
struct kfd_cu_occupancy cu_occupancy[AMDGPU_MAX_QUEUES];
275+
276+
memset(cu_occupancy, 0x0, sizeof(cu_occupancy));
273277

274278
pdd = container_of(attr, struct kfd_process_device, attr_cu_occupancy);
275279
dev = pdd->dev;
@@ -287,9 +291,17 @@ static int kfd_get_cu_occupancy(struct attribute *attr, char *buffer)
287291
/* Collect wave count from device if it supports */
288292
wave_cnt = 0;
289293
max_waves_per_cu = 0;
290-
dev->kfd2kgd->get_cu_occupancy(dev->adev, proc->pasid, &wave_cnt,
294+
295+
dev->kfd2kgd->get_cu_occupancy(dev->adev, cu_occupancy,
291296
&max_waves_per_cu, 0);
292297

298+
for (i = 0; i < AMDGPU_MAX_QUEUES; i++) {
299+
if (cu_occupancy[i].wave_cnt != 0 &&
300+
kfd_dqm_is_queue_in_process(dev->dqm, &pdd->qpd,
301+
cu_occupancy[i].doorbell_off))
302+
wave_cnt += cu_occupancy[i].wave_cnt;
303+
}
304+
293305
/* Translate wave count to number of compute units */
294306
cu_cnt = (wave_cnt + (max_waves_per_cu - 1)) / max_waves_per_cu;
295307
return snprintf(buffer, PAGE_SIZE, "%d\n", cu_cnt);

drivers/gpu/drm/amd/include/kgd_kfd_interface.h

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,11 @@ enum kgd_memory_pool {
7171
KGD_POOL_FRAMEBUFFER = 3,
7272
};
7373

74+
struct kfd_cu_occupancy {
75+
u32 wave_cnt;
76+
u32 doorbell_off;
77+
};
78+
7479
/**
7580
* enum kfd_sched_policy
7681
*
@@ -313,8 +318,9 @@ struct kfd2kgd_calls {
313318
uint32_t grace_period,
314319
uint32_t *reg_offset,
315320
uint32_t *reg_data);
316-
void (*get_cu_occupancy)(struct amdgpu_device *adev, int pasid,
317-
int *wave_cnt, int *max_waves_per_cu, uint32_t inst);
321+
void (*get_cu_occupancy)(struct amdgpu_device *adev,
322+
struct kfd_cu_occupancy *cu_occupancy,
323+
int *max_waves_per_cu, uint32_t inst);
318324
void (*program_trap_handler_settings)(struct amdgpu_device *adev,
319325
uint32_t vmid, uint64_t tba_addr, uint64_t tma_addr,
320326
uint32_t inst);

0 commit comments

Comments
 (0)