Skip to content

Commit e3d7a5d

Browse files
committed
amd comps: contentious components in same config
Allow the user to configure contentious component pairs (e.g., rocm & rocp_sdk, rocm_smi & amd_smi), but only allow one from each pair to be active at runtime. The ROCm version determines which components are active by default. This can be overridden by the PAPI_DISABLE_COMPONENTS environment variable. These changes have been tested using ROCm 7.0.2 on the Frontier supercomputer, which contains the AMD MI250X architecture.
1 parent 2e02a9c commit e3d7a5d

File tree

6 files changed

+517
-337
lines changed

6 files changed

+517
-337
lines changed

src/components/amd_smi/linux-amd-smi.c

Lines changed: 31 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -57,11 +57,38 @@ static int _amd_smi_init_component(int cidx) {
5757
_amd_smi_vector.cmp_info.num_mpx_cntrs = -1;
5858
_amd_smi_lock = PAPI_NUM_LOCK + NUM_INNER_LOCK + cidx;
5959

60-
CHECK_SNPRINTF(_amd_smi_vector.cmp_info.disabled_reason, PAPI_MAX_STR_LEN,
61-
"Not initialized. Access an AMD SMI event to initialize.");
62-
_amd_smi_vector.cmp_info.disabled = PAPI_EDELAY_INIT;
60+
/* Manage contention between rocm_smi and amd_smi components. */
61+
int use_amd_smi = 0;
62+
#if defined(DEFAULT_TO_AMD_SMI)
63+
use_amd_smi = 1;
64+
#endif
65+
#if defined(DEFAULT_TO_ROCM_SMI)
66+
char *penv = getenv("PAPI_DISABLE_COMPONENTS");
67+
if (penv != NULL) {
68+
char *p;
69+
for (p = strtok (penv, ",:"); p != NULL; p = strtok (NULL, ",:")) {
70+
if(!strcmp(p, "rocm_smi")) use_amd_smi = 1;
71+
}
72+
} else {
73+
SUBDBG("rocm: getenv(PAPI_DISABLE_COMPONENTS) failed.\n");
74+
}
75+
#endif
76+
77+
int papi_errno;
78+
if (use_amd_smi) {
79+
CHECK_SNPRINTF(_amd_smi_vector.cmp_info.disabled_reason, PAPI_MAX_STR_LEN,
80+
"Not initialized. Access an AMD SMI event to initialize.");
81+
papi_errno = PAPI_EDELAY_INIT;
82+
_amd_smi_vector.cmp_info.disabled = papi_errno;
83+
return papi_errno;
84+
} else {
85+
CHECK_SNPRINTF(_amd_smi_vector.cmp_info.disabled_reason, PAPI_MAX_STR_LEN,
86+
"Not active while rocm_smi component is active. Set PAPI_DISABLE_COMPONENTS=amd_smi to override.");
87+
papi_errno = PAPI_ECOMBO;
88+
_amd_smi_vector.cmp_info.disabled = papi_errno;
89+
return papi_errno;
90+
}
6391

64-
return PAPI_EDELAY_INIT;
6592
}
6693

6794
static int evt_get_count(int *count) {

src/components/rocm/rocm.c

Lines changed: 45 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -133,25 +133,56 @@ rocm_init_component(int cid)
133133
_rocm_lock = PAPI_NUM_LOCK + NUM_INNER_LOCK + cid;
134134
SUBDBG("ENTER: cid: %d\n", cid);
135135

136-
int papi_errno = rocd_init_environment();
137-
if (papi_errno != PAPI_OK) {
138-
_rocm_vector.cmp_info.initialized = 1;
139-
_rocm_vector.cmp_info.disabled = papi_errno;
140-
const char *err_string;
141-
rocd_err_get_last(&err_string);
142-
int expect = snprintf(_rocm_vector.cmp_info.disabled_reason,
136+
/* Manage contention between rocm and rocp_sdk components. */
137+
int use_rocm = 0;
138+
#if defined(DEFAULT_TO_ROCM)
139+
use_rocm = 1;
140+
#endif
141+
#if defined(DEFAULT_TO_ROCP_SDK)
142+
char *penv = getenv("PAPI_DISABLE_COMPONENTS");
143+
if (penv != NULL) {
144+
char *p;
145+
for (p = strtok (penv, ",:"); p != NULL; p = strtok (NULL, ",:")) {
146+
if(!strcmp(p, "rocp_sdk")) use_rocm = 1;
147+
}
148+
} else {
149+
SUBDBG("rocm: getenv(PAPI_DISABLE_COMPONENTS) failed.\n");
150+
}
151+
#endif
152+
153+
int papi_errno, expect;
154+
if (use_rocm) {
155+
papi_errno = rocd_init_environment();
156+
if (papi_errno != PAPI_OK) {
157+
_rocm_vector.cmp_info.initialized = 1;
158+
_rocm_vector.cmp_info.disabled = papi_errno;
159+
const char *err_string;
160+
rocd_err_get_last(&err_string);
161+
expect = snprintf(_rocm_vector.cmp_info.disabled_reason,
143162
PAPI_MAX_STR_LEN, "%s", err_string);
144-
if (expect > PAPI_MAX_STR_LEN) {
163+
if (expect < 0 || expect >= PAPI_MAX_STR_LEN) {
164+
SUBDBG("disabled_reason truncated");
165+
}
166+
goto fn_fail;
167+
}
168+
169+
expect = snprintf(_rocm_vector.cmp_info.disabled_reason, PAPI_MAX_STR_LEN, "%s",
170+
"Not initialized. Access component events to initialize it.");
171+
if (expect < 0 || expect >= PAPI_MAX_STR_LEN) {
145172
SUBDBG("disabled_reason truncated");
146173
}
147-
goto fn_fail;
174+
papi_errno = PAPI_EDELAY_INIT;
175+
_rocm_vector.cmp_info.disabled = papi_errno;
176+
} else {
177+
expect = snprintf(_rocm_vector.cmp_info.disabled_reason, PAPI_MAX_STR_LEN, "%s",
178+
"Not active while rocp_sdk component is active. Set PAPI_DISABLE_COMPONENTS=rocp_sdk to override.");
179+
if (expect < 0 || expect >= PAPI_MAX_STR_LEN) {
180+
SUBDBG("disabled_reason truncated");
181+
}
182+
papi_errno = PAPI_ECOMBO;
183+
_rocm_vector.cmp_info.disabled = papi_errno;
148184
}
149185

150-
sprintf(_rocm_vector.cmp_info.disabled_reason,
151-
"Not initialized. Access component events to initialize it.");
152-
papi_errno = PAPI_EDELAY_INIT;
153-
_rocm_vector.cmp_info.disabled = papi_errno;
154-
155186
fn_exit:
156187
SUBDBG("EXIT: %s\n", PAPI_strerror(papi_errno));
157188
return papi_errno;

src/components/rocm_smi/linux-rocm-smi.c

Lines changed: 37 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -68,11 +68,43 @@ _rocm_smi_init_component(int cidx)
6868
_rocm_smi_vector.cmp_info.num_cntrs = -1;
6969
_rocm_smi_lock = PAPI_NUM_LOCK + NUM_INNER_LOCK + cidx;
7070

71-
sprintf(_rocm_smi_vector.cmp_info.disabled_reason,
72-
"Not initialized. Access component events to initialize it.");
73-
_rocm_smi_vector.cmp_info.disabled = PAPI_EDELAY_INIT;
74-
75-
return PAPI_EDELAY_INIT;
71+
/* Manage contention between rocm_smi and amd_smi components. */
72+
int use_rocm_smi = 0;
73+
#if defined(DEFAULT_TO_ROCM_SMI)
74+
use_rocm_smi = 1;
75+
#endif
76+
#if defined(DEFAULT_TO_AMD_SMI)
77+
char *penv = getenv("PAPI_DISABLE_COMPONENTS");
78+
if (penv != NULL) {
79+
char *p;
80+
for (p = strtok (penv, ",:"); p != NULL; p = strtok (NULL, ",:")) {
81+
if(!strcmp(p, "amd_smi")) use_rocm_smi = 1;
82+
}
83+
} else {
84+
SUBDBG("rocm: getenv(PAPI_DISABLE_COMPONENTS) failed.\n");
85+
}
86+
#endif
87+
88+
int papi_errno, expect;
89+
if (use_rocm_smi) {
90+
expect = snprintf(_rocm_smi_vector.cmp_info.disabled_reason, PAPI_MAX_STR_LEN, "%s",
91+
"Not initialized. Access component events to initialize it.");
92+
if (expect < 0 || expect >= PAPI_MAX_STR_LEN) {
93+
SUBDBG("disabled_reason truncated");
94+
}
95+
papi_errno = PAPI_EDELAY_INIT;
96+
_rocm_smi_vector.cmp_info.disabled = papi_errno;
97+
return papi_errno;
98+
} else {
99+
expect = snprintf(_rocm_smi_vector.cmp_info.disabled_reason, PAPI_MAX_STR_LEN, "%s",
100+
"Not active while amd_smi component is active. Set PAPI_DISABLE_COMPONENTS=amd_smi to override.");
101+
if (expect < 0 || expect >= PAPI_MAX_STR_LEN) {
102+
SUBDBG("disabled_reason truncated");
103+
}
104+
papi_errno = PAPI_ECOMBO;
105+
_rocm_smi_vector.cmp_info.disabled = papi_errno;
106+
return papi_errno;
107+
}
76108
}
77109

78110
static int

src/components/rocp_sdk/rocp_sdk.c

Lines changed: 45 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -133,23 +133,55 @@ rocp_sdk_init_component(int cid)
133133
_rocp_sdk_vector.cmp_info.num_cntrs = -1;
134134
_rocp_sdk_lock = PAPI_NUM_LOCK + NUM_INNER_LOCK + cid;
135135

136-
// We set this env variable to silence some unnecessary ROCprofiler-SDK debug messages.
137-
// It is not critical, so if it fails to be set, we can safely ignore the error.
138-
(void)setenv("ROCPROFILER_LOG_LEVEL","fatal",0);
136+
/* Manage contention between rocm and rocp_sdk components. */
137+
int use_rocp_sdk = 0;
138+
#if defined(DEFAULT_TO_ROCP_SDK)
139+
use_rocp_sdk = 1;
140+
#endif
141+
#if defined(DEFAULT_TO_ROCM)
142+
char *penv = getenv("PAPI_DISABLE_COMPONENTS");
143+
if (penv != NULL) {
144+
char *p;
145+
for (p = strtok (penv, ",:"); p != NULL; p = strtok (NULL, ",:")) {
146+
if(!strcmp(p, "rocm")) use_rocp_sdk = 1;
147+
}
148+
} else {
149+
SUBDBG("rocm: getenv(PAPI_DISABLE_COMPONENTS) failed.\n");
150+
}
151+
#endif
139152

140-
int papi_errno = rocprofiler_sdk_init_pre();
141-
if (papi_errno != PAPI_OK) {
142-
_rocp_sdk_vector.cmp_info.initialized = 1;
153+
int papi_errno, expect;
154+
if( use_rocp_sdk) {
155+
// We set this env variable to silence some unnecessary ROCprofiler-SDK debug messages.
156+
// It is not critical, so if it fails to be set, we can safely ignore the error.
157+
(void)setenv("ROCPROFILER_LOG_LEVEL","fatal",0);
158+
159+
papi_errno = rocprofiler_sdk_init_pre();
160+
if (papi_errno != PAPI_OK) {
161+
_rocp_sdk_vector.cmp_info.initialized = 1;
162+
_rocp_sdk_vector.cmp_info.disabled = papi_errno;
163+
const char *err_string;
164+
rocprofiler_sdk_err_get_last(&err_string);
165+
expect = snprintf(_rocp_sdk_vector.cmp_info.disabled_reason, PAPI_MAX_STR_LEN, "%s", err_string);
166+
if (expect < 0 || expect >= PAPI_MAX_STR_LEN) {
167+
SUBDBG("disabled_reason truncated");
168+
}
169+
return papi_errno;
170+
}
171+
172+
// This component needs to be fully initialized from the beginning,
173+
// because interleaving hip calls and PAPI calls leads to errors.
174+
return check_n_initialize();
175+
} else {
176+
expect = snprintf(_rocp_sdk_vector.cmp_info.disabled_reason, PAPI_MAX_STR_LEN, "%s",
177+
"Not active while rocm component is active. Set PAPI_DISABLE_COMPONENTS=rocm to override.");
178+
if (expect < 0 || expect >= PAPI_MAX_STR_LEN) {
179+
SUBDBG("disabled_reason truncated");
180+
}
181+
papi_errno = PAPI_ECOMBO;
143182
_rocp_sdk_vector.cmp_info.disabled = papi_errno;
144-
const char *err_string;
145-
rocprofiler_sdk_err_get_last(&err_string);
146-
snprintf(_rocp_sdk_vector.cmp_info.disabled_reason, PAPI_MAX_STR_LEN, "%s", err_string);
147183
return papi_errno;
148184
}
149-
150-
// This component needs to be fully initialized from the beginning,
151-
// because interleaving hip calls and PAPI calls leads to errors.
152-
return check_n_initialize();
153185
}
154186

155187
int

0 commit comments

Comments
 (0)