Skip to content

Commit 55c25ec

Browse files
ApoKalipse-VbweltonBenjamin Welton
authored
[AFAR VII] rocprofiler_sample_device_counting_service return data as part of API call (ROCm#57) (#1220)
--------- Co-authored-by: Welton, Benjamin <Benjamin.Welton@amd.com> Co-authored-by: Benjamin Welton <bewelton@amd.com> Co-authored-by: Benjamin Welton <ben@amd.com>
1 parent aba2bc3 commit 55c25ec

File tree

8 files changed

+141
-54
lines changed

8 files changed

+141
-54
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,8 @@ Full documentation for ROCprofiler-SDK is available at [rocm.docs.amd.com/projec
119119
- Changed naming of "dispatch profiling service" to a more descriptive "dispatch counting service". To convert existing tool or user code to the new names, the following sed can be used: `-type f -exec sed -i -e 's/dispatch_profile_counting_service/dispatch_counting_service/g' -e 's/dispatch_profile.h/dispatch_counting_service.h/g' -e 's/rocprofiler_profile_counting_dispatch_callback_t/rocprofiler_dispatch_counting_service_callback_t/g' -e 's/rocprofiler_profile_counting_dispatch_data_t/rocprofiler_dispatch_counting_service_data_t/g' -e 's/rocprofiler_profile_counting_dispatch_record_t/rocprofiler_dispatch_counting_service_record_t/g' {} +`
120120
- `FETCH_SIZE` metric on gfx94x now uses `TCC_BUBBLE` for 128B reads.
121121
- PMC dispatch-based counter collection serialization is now per-device instead of being global across all devices.
122+
- Added the ability for `rocprofiler_sample_device_counting_service` to return the sampled counter records directly to the caller (in addition to writing them to the buffer).
123+
122124

123125
### Resolved issues
124126

samples/counter_collection/client.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -358,7 +358,7 @@ tool_init(rocprofiler_client_finalize_t, void* user_data)
358358
// below to select the profile config to use when a kernel dispatch is
359359
// received.
360360
get_profile_cache().emplace(
361-
agent.id.handle, build_profile_for_agent(agent.id, std::set<std::string>{"SQ_WAVES"}));
361+
agent.id.handle, build_profile_for_agent(agent.id, std::set<std::string>{"TCC_HIT"}));
362362
}
363363

364364
auto client_thread = rocprofiler_callback_thread_t{};

samples/counter_collection/device_counting.cpp

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -289,8 +289,11 @@ tool_init(rocprofiler_client_finalize_t, void* user_data)
289289
rocprofiler_start_context(get_client_ctx());
290290
while(exit_toggle().load() == false)
291291
{
292-
rocprofiler_sample_device_counting_service(
293-
get_client_ctx(), {.value = count}, ROCPROFILER_COUNTER_FLAG_NONE);
292+
rocprofiler_sample_device_counting_service(get_client_ctx(),
293+
{.value = count},
294+
ROCPROFILER_COUNTER_FLAG_NONE,
295+
nullptr,
296+
nullptr);
294297
count++;
295298
std::this_thread::sleep_for(std::chrono::milliseconds(50));
296299
}

source/include/rocprofiler-sdk/device_counting_service.h

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -106,18 +106,28 @@ rocprofiler_configure_device_counting_service(rocprofiler_context_id_t context_i
106106
* @param [in] context_id context id
107107
* @param [in] user_data User supplied data, included in records outputted to buffer.
108108
* @param [in] flags Flags to specify how the counter data should be collected (defaults to sync).
109+
* @param [out] output_records Output records collected via sampling (output is also written to
110+
* buffer). Must be allocated by caller.
111+
* @param [in,out] rec_count On entry, this is the maximum number of records rocprof can store in
112+
* output_records. On exit, contains the number of actual records.
109113
* @return ::rocprofiler_status_t
110114
* @retval ::ROCPROFILER_STATUS_ERROR_CONTEXT_INVALID Returned if the context does not exist or
111115
* the context is not configured for agent profiling.
112116
* @retval ::ROCPROFILER_STATUS_ERROR_CONTEXT_ERROR Returned if another operation is in progress (
113117
* start/stop ctx or another read).
114118
* @retval ::ROCPROFILER_STATUS_ERROR Returned if HSA has not been initialized yet.
119+
* @retval ::ROCPROFILER_STATUS_ERROR_OUT_OF_RESOURCES Returned if output_records is set but size is
120+
* too small to store results
115121
* @retval ::ROCPROFILER_STATUS_SUCCESS Returned if read request was successful.
122+
* @retval ::ROCPROFILER_STATUS_ERROR_INVALID_ARGUMENT Returned if ASYNC is being used while
123+
* output_records is not null.
116124
*/
117125
rocprofiler_status_t
118-
rocprofiler_sample_device_counting_service(rocprofiler_context_id_t context_id,
119-
rocprofiler_user_data_t user_data,
120-
rocprofiler_counter_flag_t flags) ROCPROFILER_API;
126+
rocprofiler_sample_device_counting_service(rocprofiler_context_id_t context_id,
127+
rocprofiler_user_data_t user_data,
128+
rocprofiler_counter_flag_t flags,
129+
rocprofiler_record_counter_t* output_records,
130+
size_t* rec_count) ROCPROFILER_API;
121131

122132
/** @} */
123133

source/lib/rocprofiler-sdk/counters/device_counting.cpp

Lines changed: 28 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,10 @@ agent_async_handler(hsa_signal_value_t /*signal_v*/, void* data)
156156
{
157157
val.user_data = callback_data.user_data;
158158
val.agent_id = prof_config->agent->id;
159+
if(callback_data.cached_counters)
160+
{
161+
callback_data.cached_counters->push_back(val);
162+
}
159163
buf->emplace(
160164
ROCPROFILER_BUFFER_CATEGORY_COUNTERS, ROCPROFILER_COUNTER_RECORD_VALUE, val);
161165
}
@@ -253,9 +257,10 @@ init_callback_data(rocprofiler::counters::agent_callback_data& callback_data,
253257
* and trigger the async handler manually.
254258
*/
255259
rocprofiler_status_t
256-
read_agent_ctx(const context::context* ctx,
257-
rocprofiler_user_data_t user_data,
258-
rocprofiler_counter_flag_t flags)
260+
read_agent_ctx(const context::context* ctx,
261+
rocprofiler_user_data_t user_data,
262+
rocprofiler_counter_flag_t flags,
263+
std::vector<rocprofiler_record_counter_t>* out_counters)
259264
{
260265
rocprofiler_status_t status = ROCPROFILER_STATUS_SUCCESS;
261266
if(!ctx->device_counter_collection)
@@ -282,6 +287,18 @@ read_agent_ctx(const context::context* ctx,
282287

283288
for(auto& callback_data : agent_ctx.agent_data)
284289
{
290+
auto wait_if_sync = [&]() {
291+
if((flags & ROCPROFILER_COUNTER_FLAG_ASYNC) == 0)
292+
{
293+
// Wait for any in-progress samples to complete before returning
294+
hsa::get_core_table()->hsa_signal_wait_relaxed_fn(callback_data.completion,
295+
HSA_SIGNAL_CONDITION_EQ,
296+
1,
297+
UINT64_MAX,
298+
HSA_WAIT_STATE_ACTIVE);
299+
}
300+
};
301+
285302
if(!callback_data.profile || !callback_data.set_profile) continue;
286303
const auto* agent = agent::get_agent_cache(callback_data.profile->agent);
287304

@@ -295,23 +312,19 @@ read_agent_ctx(const context::context* ctx,
295312
// No AQL packet, nothing to do here.
296313
if(!callback_data.packet) continue;
297314

315+
wait_if_sync();
316+
317+
if((flags & ROCPROFILER_COUNTER_FLAG_ASYNC) == 0)
318+
callback_data.cached_counters = out_counters;
319+
298320
// If we have no hardware counters but a packet, the caller is expecting
299321
// non-hardware based counter values to be returned. We can skip packet injection
300322
// and trigger the async handler directly
301323
if(callback_data.profile->reqired_hw_counters.empty())
302324
{
303325
callback_data.user_data = user_data;
304326
hsa::get_core_table()->hsa_signal_store_relaxed_fn(callback_data.completion, -1);
305-
// Wait for the barrier/read packet to complete
306-
if(flags != ROCPROFILER_COUNTER_FLAG_ASYNC)
307-
{
308-
// Wait for any inprogress samples to complete before returning
309-
hsa::get_core_table()->hsa_signal_wait_relaxed_fn(callback_data.completion,
310-
HSA_SIGNAL_CONDITION_EQ,
311-
1,
312-
UINT64_MAX,
313-
HSA_WAIT_STATE_ACTIVE);
314-
}
327+
wait_if_sync();
315328
continue;
316329
}
317330

@@ -334,17 +347,8 @@ read_agent_ctx(const context::context* ctx,
334347
hsa::get_core_table()->hsa_signal_store_relaxed_fn(callback_data.completion, 0);
335348
callback_data.user_data = user_data;
336349
submitPacket(agent->profile_queue(), &barrier.barrier_and);
337-
338-
// Wait for the barrier/read packet to complete
339-
if(flags != ROCPROFILER_COUNTER_FLAG_ASYNC)
340-
{
341-
// Wait for any inprogress samples to complete before returning
342-
hsa::get_core_table()->hsa_signal_wait_relaxed_fn(callback_data.completion,
343-
HSA_SIGNAL_CONDITION_EQ,
344-
1,
345-
UINT64_MAX,
346-
HSA_WAIT_STATE_ACTIVE);
347-
}
350+
wait_if_sync();
351+
if((flags & ROCPROFILER_COUNTER_FLAG_ASYNC) == 0) callback_data.cached_counters = nullptr;
348352
}
349353

350354
agent_ctx.status.exchange(rocprofiler::context::device_counting_service::state::ENABLED);

source/lib/rocprofiler-sdk/counters/device_counting.hpp

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
#include <rocprofiler-sdk/fwd.h>
2828
#include <rocprofiler-sdk/hsa.h>
2929
#include <rocprofiler-sdk/rocprofiler.h>
30+
#include <cstddef>
3031

3132
namespace rocprofiler
3233
{
@@ -55,11 +56,12 @@ struct agent_callback_data
5556
rocprofiler_user_data_t user_data = {.value = 0};
5657
rocprofiler_user_data_t callback_data = {.value = 0};
5758

58-
std::shared_ptr<rocprofiler::counters::profile_config> profile = {};
59-
rocprofiler_agent_id_t agent_id = {.handle = 0};
60-
rocprofiler_device_counting_service_callback_t cb = nullptr;
61-
rocprofiler_buffer_id_t buffer = {.handle = 0};
62-
bool set_profile = false;
59+
std::shared_ptr<rocprofiler::counters::profile_config> profile = {};
60+
rocprofiler_agent_id_t agent_id = {.handle = 0};
61+
rocprofiler_device_counting_service_callback_t cb = nullptr;
62+
rocprofiler_buffer_id_t buffer = {.handle = 0};
63+
bool set_profile = false;
64+
std::vector<rocprofiler_record_counter_t>* cached_counters = nullptr;
6365

6466
agent_callback_data() = default;
6567
agent_callback_data(agent_callback_data&& rhs) noexcept
@@ -115,9 +117,10 @@ stop_agent_ctx(const context::context* ctx);
115117
// read calls are not allowed in ASYNC mode and will result in
116118
// this call waiting for the previous sample to complete.
117119
rocprofiler_status_t
118-
read_agent_ctx(const context::context* ctx,
119-
rocprofiler_user_data_t user_data,
120-
rocprofiler_counter_flag_t flags);
120+
read_agent_ctx(const context::context* ctx,
121+
rocprofiler_user_data_t user_data,
122+
rocprofiler_counter_flag_t flags,
123+
std::vector<rocprofiler_record_counter_t>* out_counters);
121124

122125
uint64_t
123126
submitPacket(hsa_queue_t* queue, const void* packet);

source/lib/rocprofiler-sdk/counters/tests/device_counting.cpp

Lines changed: 51 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -108,10 +108,10 @@ test_init()
108108
hsa::get_queue_controller()->init(get_api_table(), get_ext_table());
109109
}
110110

111-
std::vector<rocprofiler_record_counter_t>&
111+
common::Synchronized<std::vector<rocprofiler_record_counter_t>>&
112112
global_recs()
113113
{
114-
static std::vector<rocprofiler_record_counter_t> recs;
114+
static common::Synchronized<std::vector<rocprofiler_record_counter_t>> recs;
115115
return recs;
116116
}
117117

@@ -146,7 +146,7 @@ check_output_created(rocprofiler_context_id_t,
146146
}
147147
found_value = record->user_data.value;
148148
// ROCP_ERROR << fmt::format("Found counter value: {}", record->counter_value);
149-
global_recs().push_back(*record);
149+
global_recs().wlock([&](auto& data) { data.push_back(*record); });
150150
}
151151
}
152152

@@ -319,6 +319,7 @@ class device_counting_service_test : public ::testing::Test
319319
size_t track_metric = 0;
320320
for(auto& metric : metrics)
321321
{
322+
std::vector<rocprofiler_record_counter_t> output_records(10000);
322323
// global_recs().clear();
323324
track_metric++;
324325
ROCP_ERROR << "Testing metric " << metric.name();
@@ -402,9 +403,23 @@ class device_counting_service_test : public ::testing::Test
402403
HSA_WAIT_STATE_BLOCKED);
403404

404405
// Sample the counting service.
405-
ROCPROFILER_CALL(
406-
rocprofiler_sample_device_counting_service(ctx, {.value = track_metric}, flags),
407-
"Could not sample");
406+
407+
if(flags == ROCPROFILER_COUNTER_FLAG_ASYNC)
408+
{
409+
ROCPROFILER_CALL(rocprofiler_sample_device_counting_service(
410+
ctx, {.value = track_metric}, flags, nullptr, nullptr),
411+
"Could not sample");
412+
}
413+
else
414+
{
415+
global_recs().wlock([&](auto& _data) { _data.clear(); });
416+
size_t out_count = output_records.size();
417+
ROCPROFILER_CALL(
418+
rocprofiler_sample_device_counting_service(
419+
ctx, {.value = track_metric}, flags, output_records.data(), &out_count),
420+
"Could not sample");
421+
output_records.resize(out_count);
422+
}
408423
ROCPROFILER_CALL(rocprofiler_stop_context(ctx), "Could not stop context");
409424
rocprofiler_flush_buffer(opt_buff_id);
410425

@@ -417,6 +432,27 @@ class device_counting_service_test : public ::testing::Test
417432
{
418433
ROCP_FATAL << "Failed to get data for " << metric.name();
419434
}
435+
else if(flags != ROCPROFILER_COUNTER_FLAG_ASYNC)
436+
{
437+
auto recs_local = global_recs().rlock([](const auto& data) { return data; });
438+
439+
if(recs_local.size() != output_records.size())
440+
{
441+
ROCP_FATAL << "Output size does not match: " << recs_local.size() << " "
442+
<< output_records.size();
443+
}
444+
if(!std::equal(recs_local.begin(),
445+
recs_local.end(),
446+
output_records.begin(),
447+
[](const auto& a, const auto& b) {
448+
return a.id == b.id && a.counter_value == b.counter_value &&
449+
a.dispatch_id == b.dispatch_id &&
450+
a.agent_id.handle == b.agent_id.handle;
451+
}))
452+
{
453+
ROCP_FATAL << "Output does not match between buffer and callback";
454+
}
455+
}
420456
}
421457
hsa_signal_destroy(completion_signal);
422458
hsa_signal_destroy(found_data);
@@ -599,9 +635,10 @@ TEST_F(device_counting_service_test, async_counters) { test_run(ROCPROFILER_COUN
599635
TEST_F(device_counting_service_test, sync_grbm_verify)
600636
{
601637
test_run(ROCPROFILER_COUNTER_FLAG_NONE, {"GRBM_COUNT"}, 50000);
602-
ROCP_ERROR << global_recs().size();
638+
auto local_recs = global_recs().rlock([](const auto& data) { return data; });
639+
ROCP_ERROR << local_recs.size();
603640

604-
for(const auto& val : global_recs())
641+
for(const auto& val : local_recs)
605642
{
606643
rocprofiler_counter_id_t id;
607644
rocprofiler_query_record_counter_id(val.id, &id);
@@ -615,9 +652,10 @@ TEST_F(device_counting_service_test, sync_grbm_verify)
615652
TEST_F(device_counting_service_test, sync_gpu_util_verify)
616653
{
617654
test_run(ROCPROFILER_COUNTER_FLAG_NONE, {"GPU_UTIL"}, 50000);
618-
ROCP_ERROR << global_recs().size();
655+
auto local_recs = global_recs().rlock([](const auto& data) { return data; });
656+
ROCP_ERROR << local_recs.size();
619657

620-
for(const auto& val : global_recs())
658+
for(const auto& val : local_recs)
621659
{
622660
rocprofiler_counter_id_t id;
623661
rocprofiler_query_record_counter_id(val.id, &id);
@@ -631,9 +669,10 @@ TEST_F(device_counting_service_test, sync_gpu_util_verify)
631669
TEST_F(device_counting_service_test, sync_sq_waves_verify)
632670
{
633671
test_run(ROCPROFILER_COUNTER_FLAG_NONE, {"SQ_WAVES_sum"}, 50000);
634-
ROCP_ERROR << global_recs().size();
672+
auto local_recs = global_recs().rlock([](const auto& data) { return data; });
673+
ROCP_ERROR << local_recs.size();
635674

636-
for(const auto& val : global_recs())
675+
for(const auto& val : local_recs)
637676
{
638677
rocprofiler_counter_id_t id;
639678
rocprofiler_query_record_counter_id(val.id, &id);

source/lib/rocprofiler-sdk/device_counting_service.cpp

Lines changed: 30 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@
2727
#include "lib/rocprofiler-sdk/counters/device_counting.hpp"
2828
#include "rocprofiler-sdk/fwd.h"
2929

30+
#include <string.h>
31+
3032
extern "C" {
3133
rocprofiler_status_t
3234
rocprofiler_configure_device_counting_service(rocprofiler_context_id_t context_id,
@@ -40,11 +42,35 @@ rocprofiler_configure_device_counting_service(rocprofiler_context_id_t context_i
4042
}
4143

4244
rocprofiler_status_t
43-
rocprofiler_sample_device_counting_service(rocprofiler_context_id_t context_id,
44-
rocprofiler_user_data_t user_data,
45-
rocprofiler_counter_flag_t flags)
45+
rocprofiler_sample_device_counting_service(rocprofiler_context_id_t context_id,
46+
rocprofiler_user_data_t user_data,
47+
rocprofiler_counter_flag_t flags,
48+
rocprofiler_record_counter_t* output_records,
49+
size_t* rec_count)
4650
{
51+
if(output_records != nullptr)
52+
{
53+
if((flags & ROCPROFILER_COUNTER_FLAG_ASYNC) != 0)
54+
return ROCPROFILER_STATUS_ERROR_INVALID_ARGUMENT;
55+
CHECK(rec_count);
56+
auto recs = std::vector<rocprofiler_record_counter_t>{};
57+
auto status = rocprofiler::counters::read_agent_ctx(
58+
rocprofiler::context::get_registered_context(context_id), user_data, flags, &recs);
59+
if(status == ROCPROFILER_STATUS_SUCCESS)
60+
{
61+
if(recs.size() > *rec_count)
62+
{
63+
*rec_count = recs.size();
64+
return ROCPROFILER_STATUS_ERROR_OUT_OF_RESOURCES;
65+
}
66+
*rec_count = recs.size();
67+
std::memcpy(
68+
output_records, recs.data(), sizeof(rocprofiler_record_counter_t) * recs.size());
69+
}
70+
return status;
71+
}
72+
4773
return rocprofiler::counters::read_agent_ctx(
48-
rocprofiler::context::get_registered_context(context_id), user_data, flags);
74+
rocprofiler::context::get_registered_context(context_id), user_data, flags, nullptr);
4975
}
5076
}

0 commit comments

Comments
 (0)