@@ -950,28 +950,30 @@ static void unlock_spi_csq_mutexes(struct amdgpu_device *adev)
950950 * @inst: xcc's instance number on a multi-XCC setup
951951 */
952952static void get_wave_count (struct amdgpu_device * adev , int queue_idx ,
953- int * wave_cnt , int * vmid , uint32_t inst )
953+ struct kfd_cu_occupancy * queue_cnt , uint32_t inst )
954954{
955955 int pipe_idx ;
956956 int queue_slot ;
957957 unsigned int reg_val ;
958-
958+ unsigned int wave_cnt ;
959959 /*
960960 * Program GRBM with appropriate MEID, PIPEID, QUEUEID and VMID
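961- * parameters to read out waves in flight. Get VMID if there are
962- * non-zero waves in flight.
961+ * parameters to read out waves in flight. Record the queue's doorbell
962+ * offset if there are non-zero waves in flight.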
963963 */
964- * vmid = 0xFF ;
965- * wave_cnt = 0 ;
966964 pipe_idx = queue_idx / adev -> gfx .mec .num_queue_per_pipe ;
967965 queue_slot = queue_idx % adev -> gfx .mec .num_queue_per_pipe ;
968966 soc15_grbm_select (adev , 1 , pipe_idx , queue_slot , 0 , inst );
969- reg_val = RREG32_SOC15_IP (GC , SOC15_REG_OFFSET (GC , inst , mmSPI_CSQ_WF_ACTIVE_COUNT_0 ) +
970- queue_slot );
971- * wave_cnt = reg_val & SPI_CSQ_WF_ACTIVE_COUNT_0__COUNT_MASK ;
972- if (* wave_cnt != 0 )
973- * vmid = (RREG32_SOC15 (GC , inst , mmCP_HQD_VMID ) &
974- CP_HQD_VMID__VMID_MASK ) >> CP_HQD_VMID__VMID__SHIFT ;
967+ reg_val = RREG32_SOC15_IP (GC , SOC15_REG_OFFSET (GC , inst ,
968+ mmSPI_CSQ_WF_ACTIVE_COUNT_0 ) + queue_slot );
969+ wave_cnt = reg_val & SPI_CSQ_WF_ACTIVE_COUNT_0__COUNT_MASK ;
970+ if (wave_cnt != 0 ) {
971+ queue_cnt -> wave_cnt += wave_cnt ;
972+ queue_cnt -> doorbell_off =
973+ (RREG32_SOC15 (GC , inst , mmCP_HQD_PQ_DOORBELL_CONTROL ) &
974+ CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET_MASK ) >>
975+ CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET__SHIFT ;
976+ }
975977}
976978
977979/**
@@ -981,9 +983,8 @@ static void get_wave_count(struct amdgpu_device *adev, int queue_idx,
981983 * or more queues running and submitting waves to compute units.
982984 *
983985 * @adev: Handle of device from which to get number of waves in flight
984- * @pasid: Identifies the process for which this query call is invoked
985- * @pasid_wave_cnt: Output parameter updated with number of waves in flight that
986- * belong to process with given pasid
986+ * @cu_occupancy: Array that gets filled with wave_cnt and doorbell offset
987+ * for comparison later.
987988 * @max_waves_per_cu: Output parameter updated with maximum number of waves
988989 * possible per Compute Unit
989990 * @inst: xcc's instance number on a multi-XCC setup
@@ -1011,30 +1012,24 @@ static void get_wave_count(struct amdgpu_device *adev, int queue_idx,
10111012 * number of waves that are in flight for the queue at specified index. The
10121013 * index ranges from 0 to 7.
10131014 *
1014- * If non-zero waves are in flight, read CP_HQD_VMID register to obtain VMID
1015- * of the wave(s) .
1015+ * If non-zero waves are in flight, store the corresponding doorbell offset
1016+ * of the queue, along with the wave count .
10161017 *
1017- * Determine if VMID from above step maps to pasid provided as parameter. If
1018- * it matches agrregate the wave count. That the VMID will not match pasid is
1019- * a normal condition i.e. a device is expected to support multiple queues
1020- * from multiple proceses.
1018+ * Determine if the queue belongs to the process by comparing the doorbell
1019+ * offset against the process's queues. If it matches, aggregate the wave
1020+ * count for the process.
10211021 *
10221022 * Reading registers referenced above involves programming GRBM appropriately
10231023 */
1024- void kgd_gfx_v9_get_cu_occupancy (struct amdgpu_device * adev , int pasid ,
1025- int * pasid_wave_cnt , int * max_waves_per_cu , uint32_t inst )
1024+ void kgd_gfx_v9_get_cu_occupancy (struct amdgpu_device * adev ,
1025+ struct kfd_cu_occupancy * cu_occupancy ,
1026+ int * max_waves_per_cu , uint32_t inst )
10261027{
10271028 int qidx ;
1028- int vmid ;
10291029 int se_idx ;
1030- int sh_idx ;
10311030 int se_cnt ;
1032- int sh_cnt ;
1033- int wave_cnt ;
10341031 int queue_map ;
1035- int pasid_tmp ;
10361032 int max_queue_cnt ;
1037- int vmid_wave_cnt = 0 ;
10381033 DECLARE_BITMAP (cp_queue_bitmap , AMDGPU_MAX_QUEUES );
10391034
10401035 lock_spi_csq_mutexes (adev );
@@ -1048,42 +1043,30 @@ void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev, int pasid,
10481043 AMDGPU_MAX_QUEUES );
10491044 max_queue_cnt = adev -> gfx .mec .num_pipe_per_mec *
10501045 adev -> gfx .mec .num_queue_per_pipe ;
1051- sh_cnt = adev -> gfx .config .max_sh_per_se ;
10521046 se_cnt = adev -> gfx .config .max_shader_engines ;
10531047 for (se_idx = 0 ; se_idx < se_cnt ; se_idx ++ ) {
1054- for (sh_idx = 0 ; sh_idx < sh_cnt ; sh_idx ++ ) {
1048+ amdgpu_gfx_select_se_sh (adev , se_idx , 0 , 0xffffffff , inst );
1049+ queue_map = RREG32_SOC15 (GC , inst , mmSPI_CSQ_WF_ACTIVE_STATUS );
1050+
1051+ /*
1052+ * Assumption: queue map encodes following schema: four
1053+ * pipes per each micro-engine, with each pipe mapping
1054+ * eight queues. This schema is true for GFX9 devices
1055+ * and must be verified for newer device families
1056+ */
1057+ for (qidx = 0 ; qidx < max_queue_cnt ; qidx ++ ) {
1058+ /* Skip queues that are not associated with
1059+ * compute functions
1060+ */
1061+ if (!test_bit (qidx , cp_queue_bitmap ))
1062+ continue ;
10551063
1056- amdgpu_gfx_select_se_sh ( adev , se_idx , sh_idx , 0xffffffff , inst );
1057- queue_map = RREG32_SOC15 ( GC , inst , mmSPI_CSQ_WF_ACTIVE_STATUS ) ;
1064+ if (!( queue_map & ( 1 << qidx )))
1065+ continue ;
10581066
1059- /*
1060- * Assumption: queue map encodes following schema: four
1061- * pipes per each micro-engine, with each pipe mapping
1062- * eight queues. This schema is true for GFX9 devices
1063- * and must be verified for newer device families
1064- */
1065- for (qidx = 0 ; qidx < max_queue_cnt ; qidx ++ ) {
1066-
1067- /* Skip qeueus that are not associated with
1068- * compute functions
1069- */
1070- if (!test_bit (qidx , cp_queue_bitmap ))
1071- continue ;
1072-
1073- if (!(queue_map & (1 << qidx )))
1074- continue ;
1075-
1076- /* Get number of waves in flight and aggregate them */
1077- get_wave_count (adev , qidx , & wave_cnt , & vmid ,
1078- inst );
1079- if (wave_cnt != 0 ) {
1080- pasid_tmp =
1081- RREG32 (SOC15_REG_OFFSET (OSSSYS , inst ,
1082- mmIH_VMID_0_LUT ) + vmid );
1083- if (pasid_tmp == pasid )
1084- vmid_wave_cnt += wave_cnt ;
1085- }
1086- }
1067+ /* Get number of waves in flight and aggregate them */
1068+ get_wave_count (adev , qidx , & cu_occupancy [qidx ],
1069+ inst );
10871070 }
10881071 }
10891072
@@ -1092,7 +1075,6 @@ void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev, int pasid,
10921075 unlock_spi_csq_mutexes (adev );
10931076
10941077 /* Update the output parameters and return */
1095- * pasid_wave_cnt = vmid_wave_cnt ;
10961078 * max_waves_per_cu = adev -> gfx .cu_info .simd_per_cu *
10971079 adev -> gfx .cu_info .max_waves_per_simd ;
10981080}
0 commit comments