Skip to content

Commit c9635ac

Browse files
[RDC] Fix GPU_COUNT metric to only count GPUs
* [RDC] Fix GPU_COUNT metric to only count GPUs
* [RDC] Clean up float->double casts

[rocm-systems] ROCm/rocm-systems#1453 (commit a2cff3c)
1 parent c9cee8d commit c9635ac

File tree

3 files changed

+44
-7
lines changed

3 files changed

+44
-7
lines changed

rdc_libs/rdc/src/RdcEmbeddedHandler.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -225,7 +225,7 @@ rdc_status_t RdcEmbeddedHandler::rdc_device_get_component_version(
225225
}
226226

227227
if (component == RDC_AMDSMI_COMPONENT) {
228-
amdsmi_status_t ret;
228+
amdsmi_status_t ret = AMDSMI_STATUS_UNKNOWN_ERROR;
229229
amdsmi_version_t ver = {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, nullptr};
230230

231231
ret = amdsmi_get_lib_version(&ver);

rdc_libs/rdc/src/RdcMetricFetcherImpl.cc

Lines changed: 39 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,10 @@ RdcMetricFetcherImpl::~RdcMetricFetcherImpl() {
8787
}
8888

8989
uint64_t RdcMetricFetcherImpl::now() {
90+
// WHY does clang-format like to randomly add space after tv and then randomly remove it?
91+
// clang-format off
9092
struct timeval tv {};
93+
// clang-format on
9194
gettimeofday(&tv, nullptr);
9295
return static_cast<uint64_t>(tv.tv_sec) * 1000 + tv.tv_usec / 1000;
9396
}
@@ -592,12 +595,46 @@ rdc_status_t RdcMetricFetcherImpl::fetch_gpu_field_(uint32_t gpu_index, rdc_fiel
592595
break;
593596
}
594597
case RDC_FI_GPU_COUNT: {
598+
uint32_t gpu_count = 0;
595599
uint32_t socket_count = 0;
600+
std::vector<amdsmi_socket_handle> socket_handles;
596601
value->status = amdsmi_get_socket_handles(&socket_count, nullptr);
597602
value->type = INTEGER;
598-
if (value->status == AMDSMI_STATUS_SUCCESS) {
599-
value->value.l_int = static_cast<int64_t>(socket_count);
603+
if (value->status != AMDSMI_STATUS_SUCCESS) {
604+
break;
605+
}
606+
socket_handles.resize(socket_count);
607+
value->status = amdsmi_get_socket_handles(&socket_count, socket_handles.data());
608+
if (value->status != AMDSMI_STATUS_SUCCESS) {
609+
break;
610+
}
611+
for (uint32_t i = 0; i < socket_count; i++) {
612+
uint32_t proc_count = 0;
613+
amdsmi_status_t status = AMDSMI_STATUS_UNKNOWN_ERROR;
614+
status = amdsmi_get_processor_handles(socket_handles[i], &proc_count, nullptr);
615+
if ((status != AMDSMI_STATUS_SUCCESS) || (proc_count < 1)) {
616+
continue;
617+
}
618+
// only need to check the first processor in socket.
619+
// sockets don't mix CPUs and GPUs.. I hope.
620+
proc_count = 1;
621+
amdsmi_processor_handle proc = nullptr;
622+
status = amdsmi_get_processor_handles(socket_handles[i], &proc_count, &proc);
623+
if ((status != AMDSMI_STATUS_SUCCESS) || (proc_count < 1)) {
624+
continue;
625+
}
626+
processor_type_t proc_type = AMDSMI_PROCESSOR_TYPE_UNKNOWN;
627+
status = amdsmi_get_processor_type(proc, &proc_type);
628+
if (status != AMDSMI_STATUS_SUCCESS) {
629+
continue;
630+
}
631+
// only count AMD GPUs
632+
// only count 1 GPU per socket
633+
if (proc_type == AMDSMI_PROCESSOR_TYPE_AMD_GPU) {
634+
gpu_count++;
635+
}
600636
}
637+
value->value.l_int = static_cast<int64_t>(gpu_count);
601638
} break;
602639
case RDC_FI_GPU_PARTITION_COUNT: {
603640
uint32_t partition_count = 0;

rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.cc

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -364,7 +364,7 @@ rdc_status_t RdcRocpBase::rocp_lookup(rdc_gpu_field_t gpu_field, rdc_field_value
364364
// RDC_FI_PROF_GPU_UTIL_PERCENT is mapped to GPU_UTIL
365365
// GPU_UTIL metric is available on more GPUs than ENGINE_ACTIVE.
366366
// ENGINE_ACTIVE = GPU_UTIL/100, so do the math ourselves
367-
data->dbl = read_dbl / 100.0F;
367+
data->dbl = read_dbl / 100.0;
368368
break;
369369
case RDC_FI_PROF_OCC_ELAPSED: {
370370
// RDC_FI_PROF_OCC_ELAPSED is mapped to GRBM_GUI_ACTIVE, the read happens earlier in this
@@ -389,9 +389,9 @@ rdc_status_t RdcRocpBase::rocp_lookup(rdc_gpu_field_t gpu_field, rdc_field_value
389389
const bool isMI200 = (target_version.find("gfx90a") != std::string::npos);
390390
// FLOPS/clock/CU
391391
if (isMI200) {
392-
data->dbl = divided_dbl / (1024.0F / static_cast<double>(agents[agent_index].simd_per_cu));
392+
data->dbl = divided_dbl / (1024.0 / static_cast<double>(agents[agent_index].simd_per_cu));
393393
} else { // Assume mi300
394-
data->dbl = divided_dbl / (2048.0F / static_cast<double>(agents[agent_index].simd_per_cu));
394+
data->dbl = divided_dbl / (2048.0 / static_cast<double>(agents[agent_index].simd_per_cu));
395395
}
396396
} break;
397397
case RDC_FI_PROF_EVAL_FLOPS_32_PERCENT:
@@ -401,7 +401,7 @@ rdc_status_t RdcRocpBase::rocp_lookup(rdc_gpu_field_t gpu_field, rdc_field_value
401401
return RDC_ST_BAD_PARAMETER;
402402
}
403403
// FLOPS/clock/CU
404-
data->dbl = divided_dbl / (256.0F / static_cast<double>(agents[agent_index].simd_per_cu));
404+
data->dbl = divided_dbl / (256.0 / static_cast<double>(agents[agent_index].simd_per_cu));
405405
break;
406406
case RDC_FI_PROF_KFD_ID: {
407407
// do not care what it is mapped to. read value from agents

0 commit comments

Comments
 (0)