Skip to content

Commit f5d3fd3

Browse files
bweltonBenjamin Welton
andauthored
Add support for device counter collection ioctl (ROCm#46) (#1222)
Add support for device counter collection ioctl. Adds support for the device counter collection IOCTL. This IOCTL allows for device wide counters to be collected even if the queue is not intercepted by rocprofiler-sdk (required for system profilers). A test is also included which checks this behavior by creating a queue that does not have profiling enabled on it and checks to see if SQ counters can be read from it. Note: this test will be skipped if the KFD version does not contain this IOCTL. Right now the check is "soft" in that if the IOCTL is present and there is an error with permissions, rocprofiler will continue but will print an error stating that system wide device profiling and collected counter values may be degraded. This is primarily to avoid breaking existing users (like PAPI) who may not need the IOCTL's capability and to give them time to update. Co-authored-by: Benjamin Welton <ben@amd.com>
1 parent 55c25ec commit f5d3fd3

File tree

12 files changed

+318
-60
lines changed

12 files changed

+318
-60
lines changed

source/include/rocprofiler-sdk/device_counting_service.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -106,9 +106,9 @@ rocprofiler_configure_device_counting_service(rocprofiler_context_id_t context_i
106106
* @param [in] context_id context id
107107
* @param [in] user_data User supplied data, included in records outputted to buffer.
108108
* @param [in] flags Flags to specify how the counter data should be collected (defaults to sync).
109-
* @param [in/out] output_records Output records collected via sampling (output is also written to
109+
* @param [in,out] output_records Output records collected via sampling (output is also written to
110110
* buffer). Must be allocated by caller.
111-
* @param [in/out] rec_count On entry, this is the maximum number of records rocprof can store in
111+
* @param [in,out] rec_count On entry, this is the maximum number of records rocprof can store in
112112
* output_records. On exit, contains the number of actual records.
113113
* @return ::rocprofiler_status_t
114114
* @retval ::ROCPROFILER_STATUS_ERROR_CONTEXT_INVALID Returned if the context does not exist or

source/include/rocprofiler-sdk/fwd.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
// MIT License
22
//
3-
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
3+
// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
44
//
55
// Permission is hereby granted, free of charge, to any person obtaining a copy
66
// of this software and associated documentation files (the "Software"), to deal
@@ -108,6 +108,7 @@ typedef enum // NOLINT(performance-enum-size)
108108
///< status code for more information.
109109
ROCPROFILER_STATUS_ERROR_EXCEEDS_HW_LIMIT, ///< Exceeds hardware limits for collection.
110110
ROCPROFILER_STATUS_ERROR_AGENT_ARCH_NOT_SUPPORTED, ///< Agent HW architecture not supported.
111+
ROCPROFILER_STATUS_ERROR_PERMISSION_DENIED, ///< Permission denied.
111112
ROCPROFILER_STATUS_LAST,
112113
} rocprofiler_status_t;
113114

source/lib/rocprofiler-sdk/counters/CMakeLists.txt

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,25 @@
11
set(ROCPROFILER_LIB_COUNTERS_SOURCES
2-
metrics.cpp dimensions.cpp evaluate_ast.cpp core.cpp id_decode.cpp
3-
dispatch_handlers.cpp controller.cpp device_counting.cpp)
2+
metrics.cpp
3+
dimensions.cpp
4+
evaluate_ast.cpp
5+
core.cpp
6+
id_decode.cpp
7+
dispatch_handlers.cpp
8+
controller.cpp
9+
device_counting.cpp
10+
ioctl.cpp)
411
set(ROCPROFILER_LIB_COUNTERS_HEADERS
5-
metrics.hpp dimensions.hpp evaluate_ast.hpp core.hpp id_decode.hpp
6-
dispatch_handlers.hpp controller.hpp device_counting.hpp)
12+
metrics.hpp
13+
dimensions.hpp
14+
evaluate_ast.hpp
15+
core.hpp
16+
id_decode.hpp
17+
dispatch_handlers.hpp
18+
controller.hpp
19+
device_counting.hpp
20+
ioctl.hpp)
721
target_sources(rocprofiler-sdk-object-library PRIVATE ${ROCPROFILER_LIB_COUNTERS_SOURCES}
822
${ROCPROFILER_LIB_COUNTERS_HEADERS})
9-
1023
add_subdirectory(xml)
1124
add_subdirectory(parser)
1225
add_subdirectory(yaml)

source/lib/rocprofiler-sdk/counters/controller.cpp

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
// MIT License
22
//
3-
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
3+
// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
44
//
55
// Permission is hereby granted, free of charge, to any person obtaining a copy
66
// of this software and associated documentation files (the "Software"), to deal
@@ -29,6 +29,7 @@
2929

3030
#include "lib/rocprofiler-sdk/buffer.hpp"
3131
#include "lib/rocprofiler-sdk/context/context.hpp"
32+
#include "lib/rocprofiler-sdk/counters/ioctl.hpp"
3233

3334
namespace rocprofiler
3435
{
@@ -97,6 +98,18 @@ CounterController::configure_agent_collection(rocprofiler_context_id_t context_i
9798
return ROCPROFILER_STATUS_ERROR_INVALID_ARGUMENT;
9899
}
99100

101+
if(counters::counter_collection_has_device_lock())
102+
{
103+
/**
104+
* Note: This should return if the lock fails to acquire in the future. However, this
105+
* is a change in the required permissions for rocprofiler and needs to be communicated
106+
* with partners before strict enforcement. If the required permissions are not obtained,
107+
* those profilers will function as they currently do (without any of the benefits of the
108+
* IOCTL).
109+
*/
110+
counters::counter_collection_device_lock(rocprofiler::agent::get_agent(agent_id), true);
111+
}
112+
100113
ctx.device_counter_collection->agent_data.emplace_back();
101114
ctx.device_counter_collection->agent_data.back().callback_data =
102115
rocprofiler_user_data_t{.ptr = user_data};

source/lib/rocprofiler-sdk/counters/controller.hpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11

22
// MIT License
33
//
4-
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
4+
// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
55
//
66
// Permission is hereby granted, free of charge, to any person obtaining a copy
77
// of this software and associated documentation files (the "Software"), to deal
@@ -26,6 +26,7 @@
2626
#include "lib/common/synchronized.hpp"
2727
#include "lib/rocprofiler-sdk/aql/packet_construct.hpp"
2828
#include "lib/rocprofiler-sdk/counters/evaluate_ast.hpp"
29+
#include "lib/rocprofiler-sdk/counters/ioctl.hpp"
2930
#include "lib/rocprofiler-sdk/counters/metrics.hpp"
3031

3132
#include <rocprofiler-sdk/agent.h>
Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
// MIT License
2+
//
3+
// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
4+
//
5+
// Permission is hereby granted, free of charge, to any person obtaining a copy
6+
// of this software and associated documentation files (the "Software"), to deal
7+
// in the Software without restriction, including without limitation the rights
8+
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
// copies of the Software, and to permit persons to whom the Software is
10+
// furnished to do so, subject to the following conditions:
11+
//
12+
// The above copyright notice and this permission notice shall be included in all
13+
// copies or substantial portions of the Software.
14+
//
15+
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
// SOFTWARE.
22+
23+
#include "lib/rocprofiler-sdk/counters/ioctl.hpp"
24+
#include "lib/rocprofiler-sdk/details/kfd_ioctl.h"
25+
#include "lib/rocprofiler-sdk/pc_sampling/ioctl/ioctl_adapter.hpp"
26+
27+
#include <sys/ioctl.h>
28+
#include <cerrno>
29+
30+
namespace rocprofiler
31+
{
32+
namespace counters
33+
{
34+
bool
35+
counter_collection_has_device_lock()
36+
{
37+
kfd_ioctl_profiler_args args = {};
38+
args.op = KFD_IOC_PROFILER_VERSION;
39+
int ret = ioctl(pc_sampling::ioctl::get_kfd_fd(), AMDKFD_IOC_PROFILER, &args);
40+
if(ret == 0)
41+
{
42+
return true;
43+
}
44+
return false;
45+
}
46+
47+
/**
 * @brief Request the KFD profiler PMC lock for an agent.
 *
 * Issues AMDKFD_IOC_PROFILER with operation KFD_IOC_PROFILER_PMC and
 * `pmc.lock = 1` for the agent's `gpu_id`. Every failure is logged as a warning
 * and translated into a status code; the function never aborts on an ioctl
 * failure.
 *
 * @param [in] agent      Agent whose device should be locked. Must be non-null
 *                        (enforced by CHECK).
 * @param [in] all_queues When true, `pmc.perfcount_enable` is set to 1,
 *                        otherwise to 0.
 *
 * @return ::rocprofiler_status_t
 * @retval ::ROCPROFILER_STATUS_SUCCESS The ioctl returned 0.
 * @retval ::ROCPROFILER_STATUS_ERROR_OUT_OF_RESOURCES The ioctl failed with EBUSY.
 * @retval ::ROCPROFILER_STATUS_ERROR_PERMISSION_DENIED The ioctl failed with EPERM.
 * @retval ::ROCPROFILER_STATUS_ERROR_INCOMPATIBLE_ABI The ioctl failed with EINVAL.
 * @retval ::ROCPROFILER_STATUS_ERROR Any other failure.
 */
rocprofiler_status_t
counter_collection_device_lock(const rocprofiler_agent_t* agent, bool all_queues)
{
    CHECK(agent);

    kfd_ioctl_profiler_args args = {};
    args.op                      = KFD_IOC_PROFILER_PMC;
    args.pmc.gpu_id              = agent->gpu_id;
    args.pmc.lock                = 1;
    args.pmc.perfcount_enable    = all_queues ? 1 : 0;

    const int ret = ioctl(pc_sampling::ioctl::get_kfd_fd(), AMDKFD_IOC_PROFILER, &args);
    if(ret == 0) return ROCPROFILER_STATUS_SUCCESS;

    // ioctl(2) signals failure by returning -1 and storing the cause in errno.
    // Switching on the raw return value (as was done previously) made every
    // failure look like -EPERM, because EPERM == 1. Read errno immediately,
    // before any other call can overwrite it. A return value other than -1 is
    // interpreted as a negated error code so that a wrapper returning -errno
    // directly would still be classified correctly.
    const int err = (ret == -1) ? errno : -ret;

    switch(err)
    {
        case EBUSY:
            ROCP_WARNING << fmt::format(
                "Device {} has a profiler attached to it. PMC Counters may be inaccurate.",
                agent->id.handle);
            return ROCPROFILER_STATUS_ERROR_OUT_OF_RESOURCES;
        case EPERM:
            ROCP_WARNING << fmt::format(
                "Device {} could not be locked for profiling due to lack of permissions "
                "(capability SYS_PERFMON). PMC Counters may be inaccurate and System Counter "
                "Collection will be degraded.",
                agent->id.handle);
            return ROCPROFILER_STATUS_ERROR_PERMISSION_DENIED;
        case EINVAL:
            ROCP_WARNING << fmt::format(
                "Driver/Kernel version does not support locking device {}. PMC Counters may be "
                "inaccurate and System Counter Collection will be degraded.",
                agent->id.handle);
            return ROCPROFILER_STATUS_ERROR_INCOMPATIBLE_ABI;
        default:
            ROCP_WARNING << fmt::format(
                "Failed to lock device {}. PMC Counters may be inaccurate and System Counter "
                "Collection will be degraded.",
                agent->id.handle);
            return ROCPROFILER_STATUS_ERROR;
    }
}
91+
92+
// Not required now but may be useful in the future.
93+
// rocprofiler_status_t
94+
// counter_collection_device_unlock(const rocprofiler_agent_t* agent) {
95+
// CHECK(agent);
96+
// kfd_ioctl_profiler_args args = {};
97+
// args.op = KFD_IOC_PROFILER_PMC;
98+
// args.pmc.gpu_id = agent->gpu_id;
99+
// args.pmc.lock = 0;
100+
// args.pmc.perfcount_enable = 0;
101+
102+
// int ret = ioctl(pc_sampling::ioctl::get_kfd_fd(), AMDKFD_IOC_PROFILER, &args);
103+
// if (ret != 0) {
104+
// switch (ret) {
105+
// case -EBUSY:
106+
// case -EPERM:
107+
// ROCP_WARNING << fmt::format("Could not unlock the device {}", agent->id.handle);
108+
// return ROCPROFILER_STATUS_ERROR;
109+
// case -EINVAL:
110+
// return ROCPROFILER_STATUS_ERROR_INCOMPATIBLE_ABI;
111+
// default:
112+
// ROCP_WARNING << fmt::format("Could not unlock the device {}", agent->id.handle);
113+
// return ROCPROFILER_STATUS_ERROR;
114+
// }
115+
// }
116+
117+
// return ROCPROFILER_STATUS_SUCCESS;
118+
// }
119+
} // namespace counters
120+
} // namespace rocprofiler
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
// MIT License
2+
//
3+
// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
4+
//
5+
// Permission is hereby granted, free of charge, to any person obtaining a copy
6+
// of this software and associated documentation files (the "Software"), to deal
7+
// in the Software without restriction, including without limitation the rights
8+
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
// copies of the Software, and to permit persons to whom the Software is
10+
// furnished to do so, subject to the following conditions:
11+
//
12+
// The above copyright notice and this permission notice shall be included in all
13+
// copies or substantial portions of the Software.
14+
//
15+
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
// SOFTWARE.
22+
#pragma once
23+
24+
#include <rocprofiler-sdk/rocprofiler.h>
25+
26+
namespace rocprofiler
27+
{
28+
namespace counters
29+
{
30+
/// Queries the KFD profiler IOCTL with a KFD_IOC_PROFILER_VERSION request and
/// returns true when that ioctl call returns 0, false otherwise.
bool
31+
counter_collection_has_device_lock();
32+
33+
/// Issues the KFD profiler PMC lock request (lock = 1) for the given agent's
/// gpu_id; `all_queues` selects the value written to `pmc.perfcount_enable`
/// (1 when true, 0 when false). `agent` must be non-null. Returns
/// ROCPROFILER_STATUS_SUCCESS when the ioctl returns 0; failures are logged as
/// warnings and reported through an error status.
rocprofiler_status_t
34+
counter_collection_device_lock(const rocprofiler_agent_t* agent, bool all_queues);
35+
36+
} // namespace counters
37+
} // namespace rocprofiler

0 commit comments

Comments
 (0)