Skip to content

Commit 4cdb295

Browse files
authored
Add support for device counter collection ioctl (#46)
Add support for device counter collection ioctl Adds support for the device counter collection IOCTL. This IOCTL allows for device-wide counters to be collected even if the queue is not intercepted by rocprofiler-sdk (required for system profilers). A test is also included which checks this behavior by creating a queue that does not have profiling enabled on it and checks to see if SQ counters can be read from it. Note: this test will be skipped if the KFD version does not contain this IOCTL. Right now the check is "soft" in that if the IOCTL is present and there is an error with permissions, rocprofiler will continue but will print an error stating that system-wide device profiling and collected counter values may be degraded. This is primarily to avoid breaking existing users (like PAPI) who may not need the IOCTL's capability and to give them time to update. Co-authored-by: Benjamin Welton <ben@amd.com> [ROCm/rocprofiler-sdk commit: c574881]
1 parent 1878bc3 commit 4cdb295

File tree

12 files changed

+306
-62
lines changed

12 files changed

+306
-62
lines changed

source/include/rocprofiler-sdk/device_counting_service.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -106,9 +106,9 @@ rocprofiler_configure_device_counting_service(rocprofiler_context_id_t context_i
106106
* @param [in] context_id context id
107107
* @param [in] user_data User supplied data, included in records outputted to buffer.
108108
* @param [in] flags Flags to specify how the counter data should be collected (defaults to sync).
109-
* @param [in/out] output_records Output records collected via sampling (output is also written to
109+
* @param [out] output_records Output records collected via sampling (output is also written to
110110
* buffer). Must be allocated by caller.
111-
* @param [in/out] rec_count On entry, this is the maximum number of records rocprof can store in
111+
* @param [in,out] rec_count On entry, this is the maximum number of records rocprof can store in
112112
* output_records. On exit, contains the number of actual records.
113113
* @return ::rocprofiler_status_t
114114
* @retval ::ROCPROFILER_STATUS_ERROR_CONTEXT_INVALID Returned if the context does not exist or

source/include/rocprofiler-sdk/fwd.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
// MIT License
22
//
3-
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
3+
// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
44
//
55
// Permission is hereby granted, free of charge, to any person obtaining a copy
66
// of this software and associated documentation files (the "Software"), to deal
@@ -106,6 +106,7 @@ typedef enum // NOLINT(performance-enum-size)
106106
///< status code for more information.
107107
ROCPROFILER_STATUS_ERROR_EXCEEDS_HW_LIMIT, ///< Exceeds hardware limits for collection.
108108
ROCPROFILER_STATUS_ERROR_AGENT_ARCH_NOT_SUPPORTED, ///< Agent HW architecture not supported.
109+
ROCPROFILER_STATUS_ERROR_PERMISSION_DENIED, ///< Permission denied.
109110
ROCPROFILER_STATUS_LAST,
110111
} rocprofiler_status_t;
111112

source/lib/rocprofiler-sdk/counters/CMakeLists.txt

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,8 @@ set(ROCPROFILER_LIB_COUNTERS_SOURCES
77
dispatch_handlers.cpp
88
sample_processing.cpp
99
controller.cpp
10-
device_counting.cpp)
10+
device_counting.cpp
11+
ioctl.cpp)
1112
set(ROCPROFILER_LIB_COUNTERS_HEADERS
1213
metrics.hpp
1314
dimensions.hpp
@@ -18,10 +19,10 @@ set(ROCPROFILER_LIB_COUNTERS_HEADERS
1819
sample_processing.hpp
1920
controller.hpp
2021
device_counting.hpp
21-
sample_consumer.hpp)
22+
sample_consumer.hpp
23+
ioctl.hpp)
2224
target_sources(rocprofiler-sdk-object-library PRIVATE ${ROCPROFILER_LIB_COUNTERS_SOURCES}
2325
${ROCPROFILER_LIB_COUNTERS_HEADERS})
24-
2526
add_subdirectory(xml)
2627
add_subdirectory(parser)
2728
add_subdirectory(yaml)

source/lib/rocprofiler-sdk/counters/controller.cpp

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
// MIT License
22
//
3-
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
3+
// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
44
//
55
// Permission is hereby granted, free of charge, to any person obtaining a copy
66
// of this software and associated documentation files (the "Software"), to deal
@@ -29,6 +29,7 @@
2929

3030
#include "lib/rocprofiler-sdk/buffer.hpp"
3131
#include "lib/rocprofiler-sdk/context/context.hpp"
32+
#include "lib/rocprofiler-sdk/counters/ioctl.hpp"
3233

3334
namespace rocprofiler
3435
{
@@ -97,6 +98,18 @@ CounterController::configure_agent_collection(rocprofiler_context_id_t context_i
9798
return ROCPROFILER_STATUS_ERROR_INVALID_ARGUMENT;
9899
}
99100

101+
if(counters::counter_collection_has_device_lock())
102+
{
103+
/**
104+
* Note: This should return if the lock fails to acquire in the future. However, this
105+
* is a change in the required permissions for rocprofiler and needs to be communicated
106+
* with partners before strict enforcement. If the required permissions are not obtained,
107+
* those profilers will function as they currently do (without any of the benefits of the
108+
* IOCTL).
109+
*/
110+
counters::counter_collection_device_lock(rocprofiler::agent::get_agent(agent_id), true);
111+
}
112+
100113
ctx.device_counter_collection->agent_data.emplace_back();
101114
ctx.device_counter_collection->agent_data.back().callback_data =
102115
rocprofiler_user_data_t{.ptr = user_data};

source/lib/rocprofiler-sdk/counters/controller.hpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11

22
// MIT License
33
//
4-
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
4+
// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
55
//
66
// Permission is hereby granted, free of charge, to any person obtaining a copy
77
// of this software and associated documentation files (the "Software"), to deal
@@ -26,6 +26,7 @@
2626
#include "lib/common/synchronized.hpp"
2727
#include "lib/rocprofiler-sdk/aql/packet_construct.hpp"
2828
#include "lib/rocprofiler-sdk/counters/evaluate_ast.hpp"
29+
#include "lib/rocprofiler-sdk/counters/ioctl.hpp"
2930
#include "lib/rocprofiler-sdk/counters/metrics.hpp"
3031

3132
#include <rocprofiler-sdk/agent.h>
Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
// MIT License
2+
//
3+
// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
4+
//
5+
// Permission is hereby granted, free of charge, to any person obtaining a copy
6+
// of this software and associated documentation files (the "Software"), to deal
7+
// in the Software without restriction, including without limitation the rights
8+
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
// copies of the Software, and to permit persons to whom the Software is
10+
// furnished to do so, subject to the following conditions:
11+
//
12+
// The above copyright notice and this permission notice shall be included in all
13+
// copies or substantial portions of the Software.
14+
//
15+
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
// SOFTWARE.
22+
23+
#include "lib/rocprofiler-sdk/counters/ioctl.hpp"
24+
#include "lib/rocprofiler-sdk/details/kfd_ioctl.h"
25+
#include "lib/rocprofiler-sdk/pc_sampling/ioctl/ioctl_adapter.hpp"
26+
27+
#include <sys/ioctl.h>
28+
#include <cerrno>
29+
30+
namespace rocprofiler
31+
{
32+
namespace counters
33+
{
34+
bool
35+
counter_collection_has_device_lock()
36+
{
37+
kfd_ioctl_profiler_args args = {};
38+
args.op = KFD_IOC_PROFILER_VERSION;
39+
int ret = ioctl(pc_sampling::ioctl::get_kfd_fd(), AMDKFD_IOC_PROFILER, &args);
40+
if(ret == 0)
41+
{
42+
return true;
43+
}
44+
return false;
45+
}
46+
47+
/**
 * @brief Request the KFD performance-counter (PMC) lock for an agent.
 *
 * Sends an AMDKFD_IOC_PROFILER request with op KFD_IOC_PROFILER_PMC and lock = 1
 * for the agent's gpu_id. A warning is logged for every failure.
 *
 * @param [in] agent Agent to lock. Must not be null (enforced with CHECK).
 * @param [in] all_queues When true the request is sent with perfcount_enable = 1,
 * otherwise with perfcount_enable = 0.
 * @return ::rocprofiler_status_t
 * @retval ::ROCPROFILER_STATUS_SUCCESS The ioctl call returned 0.
 * @retval ::ROCPROFILER_STATUS_ERROR_OUT_OF_RESOURCES The ioctl failed with EBUSY.
 * @retval ::ROCPROFILER_STATUS_ERROR_PERMISSION_DENIED The ioctl failed with EPERM.
 * @retval ::ROCPROFILER_STATUS_ERROR_INCOMPATIBLE_ABI The ioctl failed with EINVAL.
 * @retval ::ROCPROFILER_STATUS_ERROR The ioctl failed with any other error code.
 */
rocprofiler_status_t
counter_collection_device_lock(const rocprofiler_agent_t* agent, bool all_queues)
{
    CHECK(agent);
    kfd_ioctl_profiler_args args = {};
    args.op                      = KFD_IOC_PROFILER_PMC;
    args.pmc.gpu_id              = agent->gpu_id;
    args.pmc.lock                = 1;
    args.pmc.perfcount_enable    = all_queues ? 1 : 0;

    if(ioctl(pc_sampling::ioctl::get_kfd_fd(), AMDKFD_IOC_PROFILER, &args) == 0)
    {
        return ROCPROFILER_STATUS_SUCCESS;
    }

    // ioctl(2) signals failure by returning -1 and storing the (positive) error code in
    // errno; the return value itself never carries -EBUSY/-EPERM/-EINVAL. Capture errno
    // immediately, before logging or formatting has a chance to overwrite it.
    const int err = errno;
    switch(err)
    {
        case EBUSY:
            ROCP_WARNING << fmt::format(
                "Device {} has a profiler attached to it. PMC Counters may be inaccurate.",
                agent->id.handle);
            return ROCPROFILER_STATUS_ERROR_OUT_OF_RESOURCES;
        case EPERM:
            // The format string has a "{}" placeholder, so the device handle must be passed.
            ROCP_WARNING << fmt::format(
                "Device {} could not be locked for profiling due to lack of permissions "
                "(capability SYS_PERFMON). PMC Counters may be inaccurate and System Counter "
                "Collection will be degraded.",
                agent->id.handle);
            return ROCPROFILER_STATUS_ERROR_PERMISSION_DENIED;
        case EINVAL:
            ROCP_WARNING << fmt::format(
                "Driver/Kernel version does not support locking device {}. PMC Counters may be "
                "inaccurate and System Counter Collection will be degraded.",
                agent->id.handle);
            return ROCPROFILER_STATUS_ERROR_INCOMPATIBLE_ABI;
        default:
            ROCP_WARNING << fmt::format(
                "Failed to lock device {}. PMC Counters may be inaccurate and System Counter "
                "Collection will be degraded.",
                agent->id.handle);
            return ROCPROFILER_STATUS_ERROR;
    }
}
90+
91+
// Not required now but may be useful in the future.
92+
// rocprofiler_status_t
93+
// counter_collection_device_unlock(const rocprofiler_agent_t* agent) {
94+
// CHECK(agent);
95+
// kfd_ioctl_profiler_args args = {};
96+
// args.op = KFD_IOC_PROFILER_PMC;
97+
// args.pmc.gpu_id = agent->gpu_id;
98+
// args.pmc.lock = 0;
99+
// args.pmc.perfcount_enable = 0;
100+
101+
// int ret = ioctl(pc_sampling::ioctl::get_kfd_fd(), AMDKFD_IOC_PROFILER, &args);
102+
// if (ret != 0) {
103+
// switch (ret) {
104+
// case -EBUSY:
105+
// case -EPERM:
106+
// ROCP_WARNING << fmt::format("Could not unlock the device {}", agent->id.handle);
107+
// return ROCPROFILER_STATUS_ERROR;
108+
// case -EINVAL:
109+
// return ROCPROFILER_STATUS_ERROR_INCOMPATIBLE_ABI;
110+
// default:
111+
// ROCP_WARNING << fmt::format("Could not unlock the device {}", agent->id.handle);
112+
// return ROCPROFILER_STATUS_ERROR;
113+
// }
114+
// }
115+
116+
// return ROCPROFILER_STATUS_SUCCESS;
117+
// }
118+
} // namespace counters
119+
} // namespace rocprofiler
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
// MIT License
2+
//
3+
// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
4+
//
5+
// Permission is hereby granted, free of charge, to any person obtaining a copy
6+
// of this software and associated documentation files (the "Software"), to deal
7+
// in the Software without restriction, including without limitation the rights
8+
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
// copies of the Software, and to permit persons to whom the Software is
10+
// furnished to do so, subject to the following conditions:
11+
//
12+
// The above copyright notice and this permission notice shall be included in all
13+
// copies or substantial portions of the Software.
14+
//
15+
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
// SOFTWARE.
22+
#pragma once
23+
24+
#include <rocprofiler-sdk/rocprofiler.h>
25+
26+
namespace rocprofiler
27+
{
28+
namespace counters
29+
{
30+
/// Reports whether the KFD profiler ioctl is usable: returns true when the
/// KFD_IOC_PROFILER_VERSION query made by the implementation succeeds, false otherwise.
bool
31+
counter_collection_has_device_lock();
32+
33+
/// Requests the KFD PMC lock for @p agent (must be non-null). @p all_queues selects whether
/// the request is sent with perfcount_enable set (true) or cleared (false). Returns
/// ROCPROFILER_STATUS_SUCCESS when the ioctl reports success; otherwise logs a warning and
/// returns an error status.
rocprofiler_status_t
34+
counter_collection_device_lock(const rocprofiler_agent_t* agent, bool all_queues);
35+
36+
} // namespace counters
37+
} // namespace rocprofiler

0 commit comments

Comments
 (0)