Skip to content

Commit b3ba6a4

Browse files
committed
amd comps: contentious components in same config
Allow the user to configure contentious component pairs (e.g., rocm & rocp_sdk, rocm_smi & amd_smi), but only allow one from each pair to be active at runtime. The ROCm version determines which components are active by default. This can be overridden by the PAPI_DISABLE_COMPONENTS environment variable. These changes have been tested using ROCm 7.0.2 on the Frontier supercomputer, which contains the AMD MI250X architecture.
1 parent b140951 commit b3ba6a4

File tree

6 files changed

+643
-332
lines changed

6 files changed

+643
-332
lines changed

src/components/amd_smi/linux-amd-smi.c

Lines changed: 34 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -57,11 +57,40 @@ static int _amd_smi_init_component(int cidx) {
5757
_amd_smi_vector.cmp_info.num_mpx_cntrs = -1;
5858
_amd_smi_lock = PAPI_NUM_LOCK + NUM_INNER_LOCK + cidx;
5959

60-
CHECK_SNPRINTF(_amd_smi_vector.cmp_info.disabled_reason, PAPI_MAX_STR_LEN,
61-
"Not initialized. Access an AMD SMI event to initialize.");
62-
_amd_smi_vector.cmp_info.disabled = PAPI_EDELAY_INIT;
60+
/* Manage contension between rocm_smi and amd_smi components. */
61+
int use_amd_smi = 0;
62+
#if defined(DEFAULT_TO_AMD_SMI)
63+
use_amd_smi = 1;
64+
#endif
65+
#if defined(DEFAULT_TO_ROCM_SMI)
66+
char *disabledComps = getenv("PAPI_DISABLE_COMPONENTS");
67+
if (disabledComps != NULL) {
68+
char *penv = strdup(disabledComps);
69+
char *p;
70+
for (p = strtok (penv, ",:"); p != NULL; p = strtok (NULL, ",:")) {
71+
if(!strcmp(p, "rocm_smi")) use_amd_smi = 1;
72+
}
73+
free(penv);
74+
} else {
75+
SUBDBG("amd_smi: getenv(PAPI_DISABLE_COMPONENTS) was not set.\n");
76+
}
77+
#endif
78+
79+
int papi_errno;
80+
if (use_amd_smi) {
81+
CHECK_SNPRINTF(_amd_smi_vector.cmp_info.disabled_reason, PAPI_HUGE_STR_LEN,
82+
"Not initialized. Access an AMD SMI event to initialize.");
83+
papi_errno = PAPI_EDELAY_INIT;
84+
_amd_smi_vector.cmp_info.disabled = papi_errno;
85+
return papi_errno;
86+
} else {
87+
CHECK_SNPRINTF(_amd_smi_vector.cmp_info.disabled_reason, PAPI_HUGE_STR_LEN,
88+
"Not active while rocm_smi component is active. Set 'export PAPI_DISABLE_COMPONENTS=rocm_smi' to override.");
89+
papi_errno = PAPI_ECOMBO;
90+
_amd_smi_vector.cmp_info.disabled = papi_errno;
91+
return papi_errno;
92+
}
6393

64-
return PAPI_EDELAY_INIT;
6594
}
6695

6796
static int evt_get_count(int *count) {
@@ -93,8 +122,7 @@ static int _amd_smi_init_private(void) {
93122
if (!error_str || !error_str[0])
94123
error_str = "AMD SMI component initialization failed";
95124
CHECK_SNPRINTF(_amd_smi_vector.cmp_info.disabled_reason,
96-
sizeof _amd_smi_vector.cmp_info.disabled_reason, "%s",
97-
error_str);
125+
PAPI_HUGE_STR_LEN, "%s", error_str);
98126
goto fn_fail;
99127
}
100128

src/components/rocm/rocm.c

Lines changed: 51 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -133,25 +133,58 @@ rocm_init_component(int cid)
133133
_rocm_lock = PAPI_NUM_LOCK + NUM_INNER_LOCK + cid;
134134
SUBDBG("ENTER: cid: %d\n", cid);
135135

136-
int papi_errno = rocd_init_environment();
137-
if (papi_errno != PAPI_OK) {
138-
_rocm_vector.cmp_info.initialized = 1;
136+
/* Manage contension between rocm and rocp_sdk components. */
137+
int use_rocm = 0;
138+
#if defined(DEFAULT_TO_ROCM)
139+
use_rocm = 1;
140+
#endif
141+
#if defined(DEFAULT_TO_ROCP_SDK)
142+
char *disabledComps = getenv("PAPI_DISABLE_COMPONENTS");
143+
if (disabledComps != NULL) {
144+
char *penv = strdup(disabledComps);
145+
char *p;
146+
for (p = strtok (penv, ",:"); p != NULL; p = strtok (NULL, ",:")) {
147+
if(!strcmp(p, "rocp_sdk")) use_rocm = 1;
148+
}
149+
free(penv);
150+
} else {
151+
SUBDBG("rocm: getenv(PAPI_DISABLE_COMPONENTS) was not set.\n");
152+
}
153+
#endif
154+
155+
int papi_errno, expect;
156+
if (use_rocm) {
157+
papi_errno = rocd_init_environment();
158+
if (papi_errno != PAPI_OK) {
159+
_rocm_vector.cmp_info.initialized = 1;
160+
_rocm_vector.cmp_info.disabled = papi_errno;
161+
const char *err_string;
162+
rocd_err_get_last(&err_string);
163+
expect = snprintf(_rocm_vector.cmp_info.disabled_reason,
164+
PAPI_HUGE_STR_LEN, "%s", err_string);
165+
if (expect < 0 || expect >= PAPI_HUGE_STR_LEN) {
166+
SUBDBG("disabled_reason truncated");
167+
}
168+
goto fn_fail;
169+
}
170+
171+
expect = snprintf(_rocm_vector.cmp_info.disabled_reason, PAPI_HUGE_STR_LEN, "%s",
172+
"Not initialized. Access component events to initialize it.");
173+
if (expect < 0 || expect >= PAPI_HUGE_STR_LEN) {
174+
SUBDBG("disabled_reason truncated");
175+
}
176+
papi_errno = PAPI_EDELAY_INIT;
139177
_rocm_vector.cmp_info.disabled = papi_errno;
140-
const char *err_string;
141-
rocd_err_get_last(&err_string);
142-
int expect = snprintf(_rocm_vector.cmp_info.disabled_reason,
143-
PAPI_MAX_STR_LEN, "%s", err_string);
144-
if (expect > PAPI_MAX_STR_LEN) {
178+
} else {
179+
expect = snprintf(_rocm_vector.cmp_info.disabled_reason, PAPI_HUGE_STR_LEN, "%s",
180+
"Not active while rocp_sdk component is active. Set 'export PAPI_DISABLE_COMPONENTS=rocp_sdk' to override.");
181+
if (expect < 0 || expect >= PAPI_HUGE_STR_LEN) {
145182
SUBDBG("disabled_reason truncated");
146183
}
147-
goto fn_fail;
184+
papi_errno = PAPI_ECOMBO;
185+
_rocm_vector.cmp_info.disabled = papi_errno;
148186
}
149187

150-
sprintf(_rocm_vector.cmp_info.disabled_reason,
151-
"Not initialized. Access component events to initialize it.");
152-
papi_errno = PAPI_EDELAY_INIT;
153-
_rocm_vector.cmp_info.disabled = papi_errno;
154-
155188
fn_exit:
156189
SUBDBG("EXIT: %s\n", PAPI_strerror(papi_errno));
157190
return papi_errno;
@@ -209,8 +242,8 @@ rocm_init_private(void)
209242
const char *err_string;
210243
rocd_err_get_last(&err_string);
211244
int expect = snprintf(_rocm_vector.cmp_info.disabled_reason,
212-
PAPI_MAX_STR_LEN, "%s", err_string);
213-
if (expect > PAPI_MAX_STR_LEN) {
245+
PAPI_HUGE_STR_LEN, "%s", err_string);
246+
if (expect > PAPI_HUGE_STR_LEN) {
214247
SUBDBG("disabled_reason truncated");
215248
}
216249

@@ -222,8 +255,8 @@ rocm_init_private(void)
222255
_rocm_vector.cmp_info.num_native_events = count;
223256
_rocm_vector.cmp_info.num_cntrs = count;
224257
_rocm_vector.cmp_info.initialized = 1;
225-
int strLen = snprintf(_rocm_vector.cmp_info.disabled_reason, PAPI_MAX_STR_LEN, "%s", "");
226-
if (strLen < 0 || strLen >= PAPI_MAX_STR_LEN) {
258+
int strLen = snprintf(_rocm_vector.cmp_info.disabled_reason, PAPI_HUGE_STR_LEN, "%s", "");
259+
if (strLen < 0 || strLen >= PAPI_HUGE_STR_LEN) {
227260
SUBDBG("Failed to fully write disabled_reason.\n");
228261
}
229262

src/components/rocm_smi/linux-rocm-smi.c

Lines changed: 49 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -68,11 +68,45 @@ _rocm_smi_init_component(int cidx)
6868
_rocm_smi_vector.cmp_info.num_cntrs = -1;
6969
_rocm_smi_lock = PAPI_NUM_LOCK + NUM_INNER_LOCK + cidx;
7070

71-
sprintf(_rocm_smi_vector.cmp_info.disabled_reason,
72-
"Not initialized. Access component events to initialize it.");
73-
_rocm_smi_vector.cmp_info.disabled = PAPI_EDELAY_INIT;
74-
75-
return PAPI_EDELAY_INIT;
71+
/* Manage contension between rocm_smi and amd_smi components. */
72+
int use_rocm_smi = 0;
73+
#if defined(DEFAULT_TO_ROCM_SMI)
74+
use_rocm_smi = 1;
75+
#endif
76+
#if defined(DEFAULT_TO_AMD_SMI)
77+
char *disabledComps = getenv("PAPI_DISABLE_COMPONENTS");
78+
if (disabledComps != NULL) {
79+
char *penv = strdup(disabledComps);
80+
char *p;
81+
for (p = strtok (penv, ",:"); p != NULL; p = strtok (NULL, ",:")) {
82+
if(!strcmp(p, "amd_smi")) use_rocm_smi = 1;
83+
}
84+
free(penv);
85+
} else {
86+
SUBDBG("rocm_smi: getenv(PAPI_DISABLE_COMPONENTS) was not set.\n");
87+
}
88+
#endif
89+
90+
int papi_errno, expect;
91+
if (use_rocm_smi) {
92+
expect = snprintf(_rocm_smi_vector.cmp_info.disabled_reason, PAPI_HUGE_STR_LEN, "%s",
93+
"Not initialized. Access component events to initialize it.");
94+
if (expect < 0 || expect >= PAPI_HUGE_STR_LEN) {
95+
SUBDBG("disabled_reason truncated");
96+
}
97+
papi_errno = PAPI_EDELAY_INIT;
98+
_rocm_smi_vector.cmp_info.disabled = papi_errno;
99+
return papi_errno;
100+
} else {
101+
expect = snprintf(_rocm_smi_vector.cmp_info.disabled_reason, PAPI_HUGE_STR_LEN, "%s",
102+
"Not active while amd_smi component is active. Set 'export PAPI_DISABLE_COMPONENTS=amd_smi' to override.");
103+
if (expect < 0 || expect >= PAPI_HUGE_STR_LEN) {
104+
SUBDBG("disabled_reason truncated");
105+
}
106+
papi_errno = PAPI_ECOMBO;
107+
_rocm_smi_vector.cmp_info.disabled = papi_errno;
108+
return papi_errno;
109+
}
76110
}
77111

78112
static int
@@ -108,7 +142,10 @@ _rocm_smi_init_private(void)
108142
_rocm_smi_vector.cmp_info.disabled = papi_errno;
109143
const char *error_str;
110144
rocs_err_get_last(&error_str);
111-
sprintf(_rocm_smi_vector.cmp_info.disabled_reason, "%s", error_str);
145+
int expect = snprintf(_rocm_smi_vector.cmp_info.disabled_reason, PAPI_HUGE_STR_LEN, "%s", error_str);
146+
if (expect < 0 || expect >= PAPI_HUGE_STR_LEN) {
147+
SUBDBG("disabled_reason truncated");
148+
}
112149
goto fn_fail;
113150
}
114151

@@ -125,6 +162,12 @@ _rocm_smi_init_private(void)
125162

126163
fn_exit:
127164
_rocm_smi_vector.cmp_info.disabled = papi_errno;
165+
if(PAPI_OK == papi_errno) {
166+
int expect = snprintf(_rocm_smi_vector.cmp_info.disabled_reason, PAPI_HUGE_STR_LEN, "%s", "");
167+
if (expect < 0 || expect >= PAPI_HUGE_STR_LEN) {
168+
SUBDBG("disabled_reason truncated");
169+
}
170+
}
128171
PAPI_unlock(COMPONENT_LOCK);
129172
return papi_errno;
130173
fn_fail:

src/components/rocp_sdk/rocp_sdk.c

Lines changed: 51 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -133,23 +133,57 @@ rocp_sdk_init_component(int cid)
133133
_rocp_sdk_vector.cmp_info.num_cntrs = -1;
134134
_rocp_sdk_lock = PAPI_NUM_LOCK + NUM_INNER_LOCK + cid;
135135

136-
// We set this env variable to silence some unnecessary ROCprofiler-SDK debug messages.
137-
// It is not critical, so if it fails to be set, we can safely ignore the error.
138-
(void)setenv("ROCPROFILER_LOG_LEVEL","fatal",0);
136+
/* Manage contension between rocm and rocp_sdk components. */
137+
int use_rocp_sdk = 0;
138+
#if defined(DEFAULT_TO_ROCP_SDK)
139+
use_rocp_sdk = 1;
140+
#endif
141+
#if defined(DEFAULT_TO_ROCM)
142+
char *disabledComps = getenv("PAPI_DISABLE_COMPONENTS");
143+
if (disabledComps != NULL) {
144+
char *penv = strdup(disabledComps);
145+
char *p;
146+
for (p = strtok (penv, ",:"); p != NULL; p = strtok (NULL, ",:")) {
147+
if(!strcmp(p, "rocm")) use_rocp_sdk = 1;
148+
}
149+
free(penv);
150+
} else {
151+
SUBDBG("rocp_sdk: getenv(PAPI_DISABLE_COMPONENTS) was not set.\n");
152+
}
153+
#endif
139154

140-
int papi_errno = rocprofiler_sdk_init_pre();
141-
if (papi_errno != PAPI_OK) {
142-
_rocp_sdk_vector.cmp_info.initialized = 1;
155+
int papi_errno, expect;
156+
if( use_rocp_sdk) {
157+
// We set this env variable to silence some unnecessary ROCprofiler-SDK debug messages.
158+
// It is not critical, so if it fails to be set, we can safely ignore the error.
159+
(void)setenv("ROCPROFILER_LOG_LEVEL","fatal",0);
160+
161+
papi_errno = rocprofiler_sdk_init_pre();
162+
if (papi_errno != PAPI_OK) {
163+
_rocp_sdk_vector.cmp_info.initialized = 1;
164+
_rocp_sdk_vector.cmp_info.disabled = papi_errno;
165+
const char *err_string;
166+
rocprofiler_sdk_err_get_last(&err_string);
167+
expect = snprintf(_rocp_sdk_vector.cmp_info.disabled_reason, PAPI_HUGE_STR_LEN, "%s", err_string);
168+
if (expect < 0 || expect >= PAPI_HUGE_STR_LEN) {
169+
SUBDBG("disabled_reason truncated");
170+
}
171+
return papi_errno;
172+
}
173+
174+
// This component needs to be fully initialized from the beginning,
175+
// because interleaving hip calls and PAPI calls leads to errors.
176+
return check_n_initialize();
177+
} else {
178+
expect = snprintf(_rocp_sdk_vector.cmp_info.disabled_reason, PAPI_HUGE_STR_LEN, "%s",
179+
"Not active while rocm component is active. Set 'export PAPI_DISABLE_COMPONENTS=rocm' to override.");
180+
if (expect < 0 || expect >= PAPI_HUGE_STR_LEN) {
181+
SUBDBG("disabled_reason truncated");
182+
}
183+
papi_errno = PAPI_ECOMBO;
143184
_rocp_sdk_vector.cmp_info.disabled = papi_errno;
144-
const char *err_string;
145-
rocprofiler_sdk_err_get_last(&err_string);
146-
snprintf(_rocp_sdk_vector.cmp_info.disabled_reason, PAPI_MAX_STR_LEN, "%s", err_string);
147185
return papi_errno;
148186
}
149-
150-
// This component needs to be fully initialized from the beginning,
151-
// because interleaving hip calls and PAPI calls leads to errors.
152-
return check_n_initialize();
153187
}
154188

155189
int
@@ -205,7 +239,10 @@ rocp_sdk_init_private(void)
205239
_rocp_sdk_vector.cmp_info.disabled = papi_errno;
206240
const char *err_string;
207241
rocprofiler_sdk_err_get_last(&err_string);
208-
snprintf(_rocp_sdk_vector.cmp_info.disabled_reason, PAPI_MAX_STR_LEN, "%s", err_string);
242+
int expect = snprintf(_rocp_sdk_vector.cmp_info.disabled_reason, PAPI_HUGE_STR_LEN, "%s", err_string);
243+
if (expect < 0 || expect >= PAPI_HUGE_STR_LEN) {
244+
SUBDBG("disabled_reason truncated");
245+
}
209246
goto fn_fail;
210247
}
211248

0 commit comments

Comments
 (0)