Skip to content

Commit 7e93f50

Browse files
committed
amd comps: contentious components in same config
Allow the user to configure contentious component pairs (e.g., rocm & rocp_sdk, rocm_smi & amd_smi), but only allow one from each pair to be active at runtime. The ROCm version determines which components are active by default. This can be overridden by the PAPI_DISABLE_COMPONENTS environment variable. These changes have been tested using ROCm 7.0.2 on the Frontier supercomputer, which contains the AMD MI250X architecture.
1 parent 8c02a4e commit 7e93f50

File tree

6 files changed

+521
-337
lines changed

6 files changed

+521
-337
lines changed

src/components/amd_smi/linux-amd-smi.c

Lines changed: 32 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -57,11 +57,39 @@ static int _amd_smi_init_component(int cidx) {
5757
_amd_smi_vector.cmp_info.num_mpx_cntrs = -1;
5858
_amd_smi_lock = PAPI_NUM_LOCK + NUM_INNER_LOCK + cidx;
5959

60-
CHECK_SNPRINTF(_amd_smi_vector.cmp_info.disabled_reason, PAPI_MAX_STR_LEN,
61-
"Not initialized. Access an AMD SMI event to initialize.");
62-
_amd_smi_vector.cmp_info.disabled = PAPI_EDELAY_INIT;
60+
/* Manage contention between rocm_smi and amd_smi components. */
61+
int use_amd_smi = 0;
62+
#if defined(DEFAULT_TO_AMD_SMI)
63+
use_amd_smi = 1;
64+
#endif
65+
#if defined(DEFAULT_TO_ROCM_SMI)
66+
char *disabledComps = getenv("PAPI_DISABLE_COMPONENTS");
67+
if (disabledComps != NULL) {
68+
char *penv = strdup(disabledComps);
69+
char *p;
70+
for (p = strtok (penv, ",:"); p != NULL; p = strtok (NULL, ",:")) {
71+
if(!strcmp(p, "rocm_smi")) use_amd_smi = 1;
72+
}
73+
} else {
74+
SUBDBG("amd_smi: getenv(PAPI_DISABLE_COMPONENTS) failed.\n");
75+
}
76+
#endif
77+
78+
int papi_errno;
79+
if (use_amd_smi) {
80+
CHECK_SNPRINTF(_amd_smi_vector.cmp_info.disabled_reason, PAPI_MAX_STR_LEN,
81+
"Not initialized. Access an AMD SMI event to initialize.");
82+
papi_errno = PAPI_EDELAY_INIT;
83+
_amd_smi_vector.cmp_info.disabled = papi_errno;
84+
return papi_errno;
85+
} else {
86+
CHECK_SNPRINTF(_amd_smi_vector.cmp_info.disabled_reason, PAPI_MAX_STR_LEN,
87+
"Not active while rocm_smi component is active. Set 'export PAPI_DISABLE_COMPONENTS=rocm_smi' to override.");
88+
papi_errno = PAPI_ECOMBO;
89+
_amd_smi_vector.cmp_info.disabled = papi_errno;
90+
return papi_errno;
91+
}
6392

64-
return PAPI_EDELAY_INIT;
6593
}
6694

6795
static int evt_get_count(int *count) {

src/components/rocm/rocm.c

Lines changed: 46 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -133,25 +133,57 @@ rocm_init_component(int cid)
133133
_rocm_lock = PAPI_NUM_LOCK + NUM_INNER_LOCK + cid;
134134
SUBDBG("ENTER: cid: %d\n", cid);
135135

136-
int papi_errno = rocd_init_environment();
137-
if (papi_errno != PAPI_OK) {
138-
_rocm_vector.cmp_info.initialized = 1;
139-
_rocm_vector.cmp_info.disabled = papi_errno;
140-
const char *err_string;
141-
rocd_err_get_last(&err_string);
142-
int expect = snprintf(_rocm_vector.cmp_info.disabled_reason,
136+
/* Manage contention between rocm and rocp_sdk components. */
137+
int use_rocm = 0;
138+
#if defined(DEFAULT_TO_ROCM)
139+
use_rocm = 1;
140+
#endif
141+
#if defined(DEFAULT_TO_ROCP_SDK)
142+
char *disabledComps = getenv("PAPI_DISABLE_COMPONENTS");
143+
if (disabledComps != NULL) {
144+
char *penv = strdup(disabledComps);
145+
char *p;
146+
for (p = strtok (penv, ",:"); p != NULL; p = strtok (NULL, ",:")) {
147+
if(!strcmp(p, "rocp_sdk")) use_rocm = 1;
148+
}
149+
} else {
150+
SUBDBG("rocm: getenv(PAPI_DISABLE_COMPONENTS) failed.\n");
151+
}
152+
#endif
153+
154+
int papi_errno, expect;
155+
if (use_rocm) {
156+
papi_errno = rocd_init_environment();
157+
if (papi_errno != PAPI_OK) {
158+
_rocm_vector.cmp_info.initialized = 1;
159+
_rocm_vector.cmp_info.disabled = papi_errno;
160+
const char *err_string;
161+
rocd_err_get_last(&err_string);
162+
expect = snprintf(_rocm_vector.cmp_info.disabled_reason,
143163
PAPI_MAX_STR_LEN, "%s", err_string);
144-
if (expect > PAPI_MAX_STR_LEN) {
164+
if (expect < 0 || expect >= PAPI_MAX_STR_LEN) {
165+
SUBDBG("disabled_reason truncated");
166+
}
167+
goto fn_fail;
168+
}
169+
170+
expect = snprintf(_rocm_vector.cmp_info.disabled_reason, PAPI_MAX_STR_LEN, "%s",
171+
"Not initialized. Access component events to initialize it.");
172+
if (expect < 0 || expect >= PAPI_MAX_STR_LEN) {
145173
SUBDBG("disabled_reason truncated");
146174
}
147-
goto fn_fail;
175+
papi_errno = PAPI_EDELAY_INIT;
176+
_rocm_vector.cmp_info.disabled = papi_errno;
177+
} else {
178+
expect = snprintf(_rocm_vector.cmp_info.disabled_reason, PAPI_MAX_STR_LEN, "%s",
179+
"Not active while rocp_sdk component is active. Set 'export PAPI_DISABLE_COMPONENTS=rocp_sdk' to override.");
180+
if (expect < 0 || expect >= PAPI_MAX_STR_LEN) {
181+
SUBDBG("disabled_reason truncated");
182+
}
183+
papi_errno = PAPI_ECOMBO;
184+
_rocm_vector.cmp_info.disabled = papi_errno;
148185
}
149186

150-
sprintf(_rocm_vector.cmp_info.disabled_reason,
151-
"Not initialized. Access component events to initialize it.");
152-
papi_errno = PAPI_EDELAY_INIT;
153-
_rocm_vector.cmp_info.disabled = papi_errno;
154-
155187
fn_exit:
156188
SUBDBG("EXIT: %s\n", PAPI_strerror(papi_errno));
157189
return papi_errno;

src/components/rocm_smi/linux-rocm-smi.c

Lines changed: 38 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -68,11 +68,44 @@ _rocm_smi_init_component(int cidx)
6868
_rocm_smi_vector.cmp_info.num_cntrs = -1;
6969
_rocm_smi_lock = PAPI_NUM_LOCK + NUM_INNER_LOCK + cidx;
7070

71-
sprintf(_rocm_smi_vector.cmp_info.disabled_reason,
72-
"Not initialized. Access component events to initialize it.");
73-
_rocm_smi_vector.cmp_info.disabled = PAPI_EDELAY_INIT;
74-
75-
return PAPI_EDELAY_INIT;
71+
/* Manage contention between rocm_smi and amd_smi components. */
72+
int use_rocm_smi = 0;
73+
#if defined(DEFAULT_TO_ROCM_SMI)
74+
use_rocm_smi = 1;
75+
#endif
76+
#if defined(DEFAULT_TO_AMD_SMI)
77+
char *disabledComps = getenv("PAPI_DISABLE_COMPONENTS");
78+
if (disabledComps != NULL) {
79+
char *penv = strdup(disabledComps);
80+
char *p;
81+
for (p = strtok (penv, ",:"); p != NULL; p = strtok (NULL, ",:")) {
82+
if(!strcmp(p, "amd_smi")) use_rocm_smi = 1;
83+
}
84+
} else {
85+
SUBDBG("rocm_smi: getenv(PAPI_DISABLE_COMPONENTS) failed.\n");
86+
}
87+
#endif
88+
89+
int papi_errno, expect;
90+
if (use_rocm_smi) {
91+
expect = snprintf(_rocm_smi_vector.cmp_info.disabled_reason, PAPI_MAX_STR_LEN, "%s",
92+
"Not initialized. Access component events to initialize it.");
93+
if (expect < 0 || expect >= PAPI_MAX_STR_LEN) {
94+
SUBDBG("disabled_reason truncated");
95+
}
96+
papi_errno = PAPI_EDELAY_INIT;
97+
_rocm_smi_vector.cmp_info.disabled = papi_errno;
98+
return papi_errno;
99+
} else {
100+
expect = snprintf(_rocm_smi_vector.cmp_info.disabled_reason, PAPI_MAX_STR_LEN, "%s",
101+
"Not active while amd_smi component is active. Set 'export PAPI_DISABLE_COMPONENTS=amd_smi' to override.");
102+
if (expect < 0 || expect >= PAPI_MAX_STR_LEN) {
103+
SUBDBG("disabled_reason truncated");
104+
}
105+
papi_errno = PAPI_ECOMBO;
106+
_rocm_smi_vector.cmp_info.disabled = papi_errno;
107+
return papi_errno;
108+
}
76109
}
77110

78111
static int

src/components/rocp_sdk/rocp_sdk.c

Lines changed: 46 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -133,23 +133,56 @@ rocp_sdk_init_component(int cid)
133133
_rocp_sdk_vector.cmp_info.num_cntrs = -1;
134134
_rocp_sdk_lock = PAPI_NUM_LOCK + NUM_INNER_LOCK + cid;
135135

136-
// We set this env variable to silence some unnecessary ROCprofiler-SDK debug messages.
137-
// It is not critical, so if it fails to be set, we can safely ignore the error.
138-
(void)setenv("ROCPROFILER_LOG_LEVEL","fatal",0);
136+
/* Manage contention between rocm and rocp_sdk components. */
137+
int use_rocp_sdk = 0;
138+
#if defined(DEFAULT_TO_ROCP_SDK)
139+
use_rocp_sdk = 1;
140+
#endif
141+
#if defined(DEFAULT_TO_ROCM)
142+
char *disabledComps = getenv("PAPI_DISABLE_COMPONENTS");
143+
if (disabledComps != NULL) {
144+
char *penv = strdup(disabledComps);
145+
char *p;
146+
for (p = strtok (penv, ",:"); p != NULL; p = strtok (NULL, ",:")) {
147+
if(!strcmp(p, "rocm")) use_rocp_sdk = 1;
148+
}
149+
} else {
150+
SUBDBG("rocp_sdk: getenv(PAPI_DISABLE_COMPONENTS) failed.\n");
151+
}
152+
#endif
139153

140-
int papi_errno = rocprofiler_sdk_init_pre();
141-
if (papi_errno != PAPI_OK) {
142-
_rocp_sdk_vector.cmp_info.initialized = 1;
154+
int papi_errno, expect;
155+
if( use_rocp_sdk) {
156+
// We set this env variable to silence some unnecessary ROCprofiler-SDK debug messages.
157+
// It is not critical, so if it fails to be set, we can safely ignore the error.
158+
(void)setenv("ROCPROFILER_LOG_LEVEL","fatal",0);
159+
160+
papi_errno = rocprofiler_sdk_init_pre();
161+
if (papi_errno != PAPI_OK) {
162+
_rocp_sdk_vector.cmp_info.initialized = 1;
163+
_rocp_sdk_vector.cmp_info.disabled = papi_errno;
164+
const char *err_string;
165+
rocprofiler_sdk_err_get_last(&err_string);
166+
expect = snprintf(_rocp_sdk_vector.cmp_info.disabled_reason, PAPI_MAX_STR_LEN, "%s", err_string);
167+
if (expect < 0 || expect >= PAPI_MAX_STR_LEN) {
168+
SUBDBG("disabled_reason truncated");
169+
}
170+
return papi_errno;
171+
}
172+
173+
// This component needs to be fully initialized from the beginning,
174+
// because interleaving hip calls and PAPI calls leads to errors.
175+
return check_n_initialize();
176+
} else {
177+
expect = snprintf(_rocp_sdk_vector.cmp_info.disabled_reason, PAPI_MAX_STR_LEN, "%s",
178+
"Not active while rocm component is active. Set 'export PAPI_DISABLE_COMPONENTS=rocm' to override.");
179+
if (expect < 0 || expect >= PAPI_MAX_STR_LEN) {
180+
SUBDBG("disabled_reason truncated");
181+
}
182+
papi_errno = PAPI_ECOMBO;
143183
_rocp_sdk_vector.cmp_info.disabled = papi_errno;
144-
const char *err_string;
145-
rocprofiler_sdk_err_get_last(&err_string);
146-
snprintf(_rocp_sdk_vector.cmp_info.disabled_reason, PAPI_MAX_STR_LEN, "%s", err_string);
147184
return papi_errno;
148185
}
149-
150-
// This component needs to be fully initialized from the beginning,
151-
// because interleaving hip calls and PAPI calls leads to errors.
152-
return check_n_initialize();
153186
}
154187

155188
int

0 commit comments

Comments
 (0)