Skip to content

Commit fb14952

Browse files
committed
amd comps: contentious components in same config
Allow the user to configure contentious component pairs (e.g., rocm & rocp_sdk, rocm_smi & amd_smi), but only allow one from each pair to be active at runtime. The ROCm version determines which components are active by default. This can be overridden by the PAPI_DISABLE_COMPONENTS environment variable. These changes have been tested using ROCm 7.0.2 on the Frontier supercomputer, which contains the AMD MI250X architecture.
1 parent 73f2741 commit fb14952

File tree

10 files changed

+666
-332
lines changed

10 files changed

+666
-332
lines changed

src/components/amd_smi/README.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,3 +107,10 @@ After changing `PAPI_AMDSMI_ROOT` or related library paths, rerun make clobber &
107107

108108
## Hardware and Software Support
109109
To see the `amd_smi` component's current supported hardware and software please visit the GitHub wiki page [Hardware and Software Support - AMD\_SMI Component](https://github.com/icl-utk-edu/papi/wiki/Hardware-and-Software-Support-%E2%80%90-AMD_SMI-Component).
110+
111+
## Known Limitations
112+
113+
* For AMD devices older than the AMD Instinct MI300A, PAPI should not be configured with both `rocm_smi` and `amd_smi`.
114+
If both components are configured, then `rocm_smi` will be active by default for ROCm < 6.4.0; `amd_smi` will be active by default for ROCm >= 6.4.0.
115+
Users can override this when running an application by setting `export PAPI_DISABLE_COMPONENTS=rocm_smi` when `rocm_smi` is active by default, or
116+
`export PAPI_DISABLE_COMPONENTS=amd_smi` when `amd_smi` is active by default.

src/components/amd_smi/linux-amd-smi.c

Lines changed: 34 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -57,11 +57,40 @@ static int _amd_smi_init_component(int cidx) {
5757
_amd_smi_vector.cmp_info.num_mpx_cntrs = -1;
5858
_amd_smi_lock = PAPI_NUM_LOCK + NUM_INNER_LOCK + cidx;
5959

60-
CHECK_SNPRINTF(_amd_smi_vector.cmp_info.disabled_reason, PAPI_MAX_STR_LEN,
61-
"Not initialized. Access an AMD SMI event to initialize.");
62-
_amd_smi_vector.cmp_info.disabled = PAPI_EDELAY_INIT;
60+
/* Manage contension between rocm_smi and amd_smi components. */
61+
int use_amd_smi = 0;
62+
#if defined(DEFAULT_TO_AMD_SMI)
63+
use_amd_smi = 1;
64+
#endif
65+
#if defined(DEFAULT_TO_ROCM_SMI)
66+
char *disabledComps = getenv("PAPI_DISABLE_COMPONENTS");
67+
if (disabledComps != NULL) {
68+
char *penv = strdup(disabledComps);
69+
char *p;
70+
for (p = strtok (penv, ",:"); p != NULL; p = strtok (NULL, ",:")) {
71+
if(!strcmp(p, "rocm_smi")) use_amd_smi = 1;
72+
}
73+
free(penv);
74+
} else {
75+
SUBDBG("amd_smi: getenv(PAPI_DISABLE_COMPONENTS) was not set.\n");
76+
}
77+
#endif
78+
79+
int papi_errno;
80+
if (use_amd_smi) {
81+
CHECK_SNPRINTF(_amd_smi_vector.cmp_info.disabled_reason, PAPI_HUGE_STR_LEN,
82+
"Not initialized. Access an AMD SMI event to initialize.");
83+
papi_errno = PAPI_EDELAY_INIT;
84+
_amd_smi_vector.cmp_info.disabled = papi_errno;
85+
return papi_errno;
86+
} else {
87+
CHECK_SNPRINTF(_amd_smi_vector.cmp_info.disabled_reason, PAPI_HUGE_STR_LEN,
88+
"Not active while rocm_smi component is active. Set 'export PAPI_DISABLE_COMPONENTS=rocm_smi' to override.");
89+
papi_errno = PAPI_ECOMBO;
90+
_amd_smi_vector.cmp_info.disabled = papi_errno;
91+
return papi_errno;
92+
}
6393

64-
return PAPI_EDELAY_INIT;
6594
}
6695

6796
static int evt_get_count(int *count) {
@@ -93,8 +122,7 @@ static int _amd_smi_init_private(void) {
93122
if (!error_str || !error_str[0])
94123
error_str = "AMD SMI component initialization failed";
95124
CHECK_SNPRINTF(_amd_smi_vector.cmp_info.disabled_reason,
96-
sizeof _amd_smi_vector.cmp_info.disabled_reason, "%s",
97-
error_str);
125+
PAPI_HUGE_STR_LEN, "%s", error_str);
98126
goto fn_fail;
99127
}
100128

src/components/rocm/README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,9 @@ setting the ROCP\_TOOL\_LIB to the PAPI library as follows:
7979
Please instead use the [`rocp_sdk`](https://github.com/icl-utk-edu/papi/blob/master/src/components/rocp_sdk/README.md) component.
8080

8181
* For AMD devices older than the AMD Instinct MI300A, PAPI should not be configured with both `rocm` and `rocp_sdk`.
82+
If both components are configured, then `rocm` will be active by default for ROCm < 6.3.2; `rocp_sdk` will be active by default for ROCm >= 6.3.2.
83+
Users can override this when running an application by setting `export PAPI_DISABLE_COMPONENTS=rocm` when `rocm` is active by default, or
84+
`export PAPI_DISABLE_COMPONENTS=rocp_sdk` when `rocp_sdk` is active by default.
8285

8386
* For ROCm >= 6.2.0, the environment variable `AQLPROFILE_READ_API` should be set to 0 for intercept mode and 1 (or unset) for sampling mode.
8487
Otherwise, counter values in intercept mode will return 0. See PAPI Issue #457 for more details.

src/components/rocm/rocm.c

Lines changed: 51 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -133,25 +133,58 @@ rocm_init_component(int cid)
133133
_rocm_lock = PAPI_NUM_LOCK + NUM_INNER_LOCK + cid;
134134
SUBDBG("ENTER: cid: %d\n", cid);
135135

136-
int papi_errno = rocd_init_environment();
137-
if (papi_errno != PAPI_OK) {
138-
_rocm_vector.cmp_info.initialized = 1;
136+
/* Manage contension between rocm and rocp_sdk components. */
137+
int use_rocm = 0;
138+
#if defined(DEFAULT_TO_ROCM)
139+
use_rocm = 1;
140+
#endif
141+
#if defined(DEFAULT_TO_ROCP_SDK)
142+
char *disabledComps = getenv("PAPI_DISABLE_COMPONENTS");
143+
if (disabledComps != NULL) {
144+
char *penv = strdup(disabledComps);
145+
char *p;
146+
for (p = strtok (penv, ",:"); p != NULL; p = strtok (NULL, ",:")) {
147+
if(!strcmp(p, "rocp_sdk")) use_rocm = 1;
148+
}
149+
free(penv);
150+
} else {
151+
SUBDBG("rocm: getenv(PAPI_DISABLE_COMPONENTS) was not set.\n");
152+
}
153+
#endif
154+
155+
int papi_errno, expect;
156+
if (use_rocm) {
157+
papi_errno = rocd_init_environment();
158+
if (papi_errno != PAPI_OK) {
159+
_rocm_vector.cmp_info.initialized = 1;
160+
_rocm_vector.cmp_info.disabled = papi_errno;
161+
const char *err_string;
162+
rocd_err_get_last(&err_string);
163+
expect = snprintf(_rocm_vector.cmp_info.disabled_reason,
164+
PAPI_HUGE_STR_LEN, "%s", err_string);
165+
if (expect < 0 || expect >= PAPI_HUGE_STR_LEN) {
166+
SUBDBG("disabled_reason truncated");
167+
}
168+
goto fn_fail;
169+
}
170+
171+
expect = snprintf(_rocm_vector.cmp_info.disabled_reason, PAPI_HUGE_STR_LEN, "%s",
172+
"Not initialized. Access component events to initialize it.");
173+
if (expect < 0 || expect >= PAPI_HUGE_STR_LEN) {
174+
SUBDBG("disabled_reason truncated");
175+
}
176+
papi_errno = PAPI_EDELAY_INIT;
139177
_rocm_vector.cmp_info.disabled = papi_errno;
140-
const char *err_string;
141-
rocd_err_get_last(&err_string);
142-
int expect = snprintf(_rocm_vector.cmp_info.disabled_reason,
143-
PAPI_MAX_STR_LEN, "%s", err_string);
144-
if (expect > PAPI_MAX_STR_LEN) {
178+
} else {
179+
expect = snprintf(_rocm_vector.cmp_info.disabled_reason, PAPI_HUGE_STR_LEN, "%s",
180+
"Not active while rocp_sdk component is active. Set 'export PAPI_DISABLE_COMPONENTS=rocp_sdk' to override.");
181+
if (expect < 0 || expect >= PAPI_HUGE_STR_LEN) {
145182
SUBDBG("disabled_reason truncated");
146183
}
147-
goto fn_fail;
184+
papi_errno = PAPI_ECOMBO;
185+
_rocm_vector.cmp_info.disabled = papi_errno;
148186
}
149187

150-
sprintf(_rocm_vector.cmp_info.disabled_reason,
151-
"Not initialized. Access component events to initialize it.");
152-
papi_errno = PAPI_EDELAY_INIT;
153-
_rocm_vector.cmp_info.disabled = papi_errno;
154-
155188
fn_exit:
156189
SUBDBG("EXIT: %s\n", PAPI_strerror(papi_errno));
157190
return papi_errno;
@@ -209,8 +242,8 @@ rocm_init_private(void)
209242
const char *err_string;
210243
rocd_err_get_last(&err_string);
211244
int expect = snprintf(_rocm_vector.cmp_info.disabled_reason,
212-
PAPI_MAX_STR_LEN, "%s", err_string);
213-
if (expect > PAPI_MAX_STR_LEN) {
245+
PAPI_HUGE_STR_LEN, "%s", err_string);
246+
if (expect > PAPI_HUGE_STR_LEN) {
214247
SUBDBG("disabled_reason truncated");
215248
}
216249

@@ -222,8 +255,8 @@ rocm_init_private(void)
222255
_rocm_vector.cmp_info.num_native_events = count;
223256
_rocm_vector.cmp_info.num_cntrs = count;
224257
_rocm_vector.cmp_info.initialized = 1;
225-
int strLen = snprintf(_rocm_vector.cmp_info.disabled_reason, PAPI_MAX_STR_LEN, "%s", "");
226-
if (strLen < 0 || strLen >= PAPI_MAX_STR_LEN) {
258+
int strLen = snprintf(_rocm_vector.cmp_info.disabled_reason, PAPI_HUGE_STR_LEN, "%s", "");
259+
if (strLen < 0 || strLen >= PAPI_HUGE_STR_LEN) {
227260
SUBDBG("Failed to fully write disabled_reason.\n");
228261
}
229262

src/components/rocm_smi/README.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,11 @@ In both cases, the directory specified by `PAPI_ROCMSMI_ROOT` **must contain** t
4848

4949
## Known Limitations
5050

51+
* For AMD devices older than the AMD Instinct MI300A, PAPI should not be configured with both `rocm_smi` and `amd_smi`.
52+
If both components are configured, then `rocm_smi` will be active by default for ROCm < 6.4.0; `amd_smi` will be active by default for ROCm >= 6.4.0.
53+
Users can override this when running an application by setting `export PAPI_DISABLE_COMPONENTS=rocm_smi` when `rocm_smi` is active by default, or
54+
`export PAPI_DISABLE_COMPONENTS=amd_smi` when `amd_smi` is active by default.
55+
5156
* Only sets of metrics and events that can be gathered in a single pass are supported.
5257

5358
* Although AMD metrics may be floating point, all values are recast and returned as long long integers.

src/components/rocm_smi/linux-rocm-smi.c

Lines changed: 49 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -68,11 +68,45 @@ _rocm_smi_init_component(int cidx)
6868
_rocm_smi_vector.cmp_info.num_cntrs = -1;
6969
_rocm_smi_lock = PAPI_NUM_LOCK + NUM_INNER_LOCK + cidx;
7070

71-
sprintf(_rocm_smi_vector.cmp_info.disabled_reason,
72-
"Not initialized. Access component events to initialize it.");
73-
_rocm_smi_vector.cmp_info.disabled = PAPI_EDELAY_INIT;
74-
75-
return PAPI_EDELAY_INIT;
71+
/* Manage contension between rocm_smi and amd_smi components. */
72+
int use_rocm_smi = 0;
73+
#if defined(DEFAULT_TO_ROCM_SMI)
74+
use_rocm_smi = 1;
75+
#endif
76+
#if defined(DEFAULT_TO_AMD_SMI)
77+
char *disabledComps = getenv("PAPI_DISABLE_COMPONENTS");
78+
if (disabledComps != NULL) {
79+
char *penv = strdup(disabledComps);
80+
char *p;
81+
for (p = strtok (penv, ",:"); p != NULL; p = strtok (NULL, ",:")) {
82+
if(!strcmp(p, "amd_smi")) use_rocm_smi = 1;
83+
}
84+
free(penv);
85+
} else {
86+
SUBDBG("rocm_smi: getenv(PAPI_DISABLE_COMPONENTS) was not set.\n");
87+
}
88+
#endif
89+
90+
int papi_errno, expect;
91+
if (use_rocm_smi) {
92+
expect = snprintf(_rocm_smi_vector.cmp_info.disabled_reason, PAPI_HUGE_STR_LEN, "%s",
93+
"Not initialized. Access component events to initialize it.");
94+
if (expect < 0 || expect >= PAPI_HUGE_STR_LEN) {
95+
SUBDBG("disabled_reason truncated");
96+
}
97+
papi_errno = PAPI_EDELAY_INIT;
98+
_rocm_smi_vector.cmp_info.disabled = papi_errno;
99+
return papi_errno;
100+
} else {
101+
expect = snprintf(_rocm_smi_vector.cmp_info.disabled_reason, PAPI_HUGE_STR_LEN, "%s",
102+
"Not active while amd_smi component is active. Set 'export PAPI_DISABLE_COMPONENTS=amd_smi' to override.");
103+
if (expect < 0 || expect >= PAPI_HUGE_STR_LEN) {
104+
SUBDBG("disabled_reason truncated");
105+
}
106+
papi_errno = PAPI_ECOMBO;
107+
_rocm_smi_vector.cmp_info.disabled = papi_errno;
108+
return papi_errno;
109+
}
76110
}
77111

78112
static int
@@ -108,7 +142,10 @@ _rocm_smi_init_private(void)
108142
_rocm_smi_vector.cmp_info.disabled = papi_errno;
109143
const char *error_str;
110144
rocs_err_get_last(&error_str);
111-
sprintf(_rocm_smi_vector.cmp_info.disabled_reason, "%s", error_str);
145+
int expect = snprintf(_rocm_smi_vector.cmp_info.disabled_reason, PAPI_HUGE_STR_LEN, "%s", error_str);
146+
if (expect < 0 || expect >= PAPI_HUGE_STR_LEN) {
147+
SUBDBG("disabled_reason truncated");
148+
}
112149
goto fn_fail;
113150
}
114151

@@ -125,6 +162,12 @@ _rocm_smi_init_private(void)
125162

126163
fn_exit:
127164
_rocm_smi_vector.cmp_info.disabled = papi_errno;
165+
if(PAPI_OK == papi_errno) {
166+
int expect = snprintf(_rocm_smi_vector.cmp_info.disabled_reason, PAPI_HUGE_STR_LEN, "%s", "");
167+
if (expect < 0 || expect >= PAPI_HUGE_STR_LEN) {
168+
SUBDBG("disabled_reason truncated");
169+
}
170+
}
128171
PAPI_unlock(COMPONENT_LOCK);
129172
return papi_errno;
130173
fn_fail:

src/components/rocp_sdk/README.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,5 +65,9 @@ To see the ROCP\_SDK component's current supported hardware and software please
6565

6666
## Known Limitations
6767

68+
* For AMD devices older than the AMD Instinct MI300A, PAPI should not be configured with both `rocm` and `rocp_sdk`.
69+
If both components are configured, then `rocm` will be active by default for ROCm < 6.3.2; `rocp_sdk` will be active by default for ROCm >= 6.3.2.
70+
Users can override this when running an application by setting `export PAPI_DISABLE_COMPONENTS=rocm` when `rocm` is active by default, or
71+
`export PAPI_DISABLE_COMPONENTS=rocp_sdk` when `rocp_sdk` is active by default.
6872
* In dispatch mode, PAPI may read zeros if reading takes place immediately after the return of a GPU kernel. This is not a PAPI bug. It may occur because calls such as hipDeviceSynchronize() do not guarantee that ROCprofiler has been called and all counter buffers have been flushed. Therefore, it is recommended that the user code adds a delay between the return of a kernel and calls to PAPI_read(), PAPI_stop(), etc.
6973
* If an application is linked against the static PAPI library libpapi.a, then the application must call PAPI_library_init() before calling any hip routines (e.g. hipInit(), hipGetDeviceCount(), hipLaunchKernelGGL(), etc). If the application is linked against the dynamic library libpapi.so, then the order of operations does not matter.

src/components/rocp_sdk/rocp_sdk.c

Lines changed: 51 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -133,23 +133,57 @@ rocp_sdk_init_component(int cid)
133133
_rocp_sdk_vector.cmp_info.num_cntrs = -1;
134134
_rocp_sdk_lock = PAPI_NUM_LOCK + NUM_INNER_LOCK + cid;
135135

136-
// We set this env variable to silence some unnecessary ROCprofiler-SDK debug messages.
137-
// It is not critical, so if it fails to be set, we can safely ignore the error.
138-
(void)setenv("ROCPROFILER_LOG_LEVEL","fatal",0);
136+
/* Manage contension between rocm and rocp_sdk components. */
137+
int use_rocp_sdk = 0;
138+
#if defined(DEFAULT_TO_ROCP_SDK)
139+
use_rocp_sdk = 1;
140+
#endif
141+
#if defined(DEFAULT_TO_ROCM)
142+
char *disabledComps = getenv("PAPI_DISABLE_COMPONENTS");
143+
if (disabledComps != NULL) {
144+
char *penv = strdup(disabledComps);
145+
char *p;
146+
for (p = strtok (penv, ",:"); p != NULL; p = strtok (NULL, ",:")) {
147+
if(!strcmp(p, "rocm")) use_rocp_sdk = 1;
148+
}
149+
free(penv);
150+
} else {
151+
SUBDBG("rocp_sdk: getenv(PAPI_DISABLE_COMPONENTS) was not set.\n");
152+
}
153+
#endif
139154

140-
int papi_errno = rocprofiler_sdk_init_pre();
141-
if (papi_errno != PAPI_OK) {
142-
_rocp_sdk_vector.cmp_info.initialized = 1;
155+
int papi_errno, expect;
156+
if( use_rocp_sdk) {
157+
// We set this env variable to silence some unnecessary ROCprofiler-SDK debug messages.
158+
// It is not critical, so if it fails to be set, we can safely ignore the error.
159+
(void)setenv("ROCPROFILER_LOG_LEVEL","fatal",0);
160+
161+
papi_errno = rocprofiler_sdk_init_pre();
162+
if (papi_errno != PAPI_OK) {
163+
_rocp_sdk_vector.cmp_info.initialized = 1;
164+
_rocp_sdk_vector.cmp_info.disabled = papi_errno;
165+
const char *err_string;
166+
rocprofiler_sdk_err_get_last(&err_string);
167+
expect = snprintf(_rocp_sdk_vector.cmp_info.disabled_reason, PAPI_HUGE_STR_LEN, "%s", err_string);
168+
if (expect < 0 || expect >= PAPI_HUGE_STR_LEN) {
169+
SUBDBG("disabled_reason truncated");
170+
}
171+
return papi_errno;
172+
}
173+
174+
// This component needs to be fully initialized from the beginning,
175+
// because interleaving hip calls and PAPI calls leads to errors.
176+
return check_n_initialize();
177+
} else {
178+
expect = snprintf(_rocp_sdk_vector.cmp_info.disabled_reason, PAPI_HUGE_STR_LEN, "%s",
179+
"Not active while rocm component is active. Set 'export PAPI_DISABLE_COMPONENTS=rocm' to override.");
180+
if (expect < 0 || expect >= PAPI_HUGE_STR_LEN) {
181+
SUBDBG("disabled_reason truncated");
182+
}
183+
papi_errno = PAPI_ECOMBO;
143184
_rocp_sdk_vector.cmp_info.disabled = papi_errno;
144-
const char *err_string;
145-
rocprofiler_sdk_err_get_last(&err_string);
146-
snprintf(_rocp_sdk_vector.cmp_info.disabled_reason, PAPI_MAX_STR_LEN, "%s", err_string);
147185
return papi_errno;
148186
}
149-
150-
// This component needs to be fully initialized from the beginning,
151-
// because interleaving hip calls and PAPI calls leads to errors.
152-
return check_n_initialize();
153187
}
154188

155189
int
@@ -205,7 +239,10 @@ rocp_sdk_init_private(void)
205239
_rocp_sdk_vector.cmp_info.disabled = papi_errno;
206240
const char *err_string;
207241
rocprofiler_sdk_err_get_last(&err_string);
208-
snprintf(_rocp_sdk_vector.cmp_info.disabled_reason, PAPI_MAX_STR_LEN, "%s", err_string);
242+
int expect = snprintf(_rocp_sdk_vector.cmp_info.disabled_reason, PAPI_HUGE_STR_LEN, "%s", err_string);
243+
if (expect < 0 || expect >= PAPI_HUGE_STR_LEN) {
244+
SUBDBG("disabled_reason truncated");
245+
}
209246
goto fn_fail;
210247
}
211248

0 commit comments

Comments
 (0)