Skip to content

Commit efcb2fb

Browse files
committed
Add SPM AQL packet construction and HSA queue integration
1 parent 63509bc commit efcb2fb

File tree

6 files changed

+397
-6
lines changed

6 files changed

+397
-6
lines changed

projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/aql/packet_construct.cpp

Lines changed: 153 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,13 @@ namespace rocprofiler
4242
{
4343
namespace aql
4444
{
45+
// Bundles a counter metric with two lists of aqlprofile PMC event descriptors.
// In this file spm_can_collect() appends one identical entry per block instance
// to both `instances` and `events`.
struct AQLProfileMetric
46+
{
47+
counters::Metric metric;
48+
std::vector<aqlprofile_pmc_event_t> instances;
49+
std::vector<aqlprofile_pmc_event_t> events;
50+
};
51+
4552
CounterPacketConstruct::CounterPacketConstruct(rocprofiler_agent_id_t agent,
4653
const std::vector<counters::Metric>& metrics)
4754
: _agent(agent)
@@ -50,7 +57,7 @@ CounterPacketConstruct::CounterPacketConstruct(rocprofiler_agent_id_t
5057
// for the counter.
5158
for(const auto& x : metrics)
5259
{
53-
auto query_info = get_query_info(_agent, x);
60+
auto query_info = get_query_info(_agent, x.block(), x.name());
5461
_metrics.emplace_back().metric = x;
5562
uint64_t event_id = 0;
5663
if(!x.event().empty()) event_id = std::stoul(x.event(), nullptr);
@@ -278,5 +285,150 @@ CounterPacketConstruct::can_collect()
278285
}
279286
return ROCPROFILER_STATUS_SUCCESS;
280287
}
288+
289+
/** @brief Constructs the packet using the contained input parameters.
290+
* Writes into ID map and spm descriptor used to decode SPM data
291+
*/
292+
std::unique_ptr<hsa::SPMPacket>
293+
spm_construct_packet(const rocprofiler_agent_id_t agent_id,
294+
const std::vector<counters::Metric>& metrics,
295+
double sample_freq,
296+
uint64_t buffer_size,
297+
uint64_t timeout)
298+
{
299+
auto events = std::vector<aqlprofile_pmc_event_t>{};
300+
auto params = std::vector<aqlprofile_spm_parameter_t>{};
301+
auto id_map = std::vector<spm::spm_counter_instance_t>{};
302+
303+
const auto* agent = CHECK_NOTNULL(rocprofiler::agent::get_agent(agent_id));
304+
const auto* aql_cache = CHECK_NOTNULL(rocprofiler::agent::get_agent_cache(agent));
305+
auto pool = std::make_shared<hsa::SPMMemoryPool>(
306+
*aql_cache, *hsa::get_amd_ext_table(), hsa::get_core_table()->hsa_memory_copy_fn);
307+
const auto* aql_agent = rocprofiler::agent::get_aql_agent(agent->id);
308+
309+
const double sclk_freq = agent->max_engine_clk_fcompute * 1E9; // GHz
310+
const size_t sclk_period = static_cast<size_t>(std::roundf(sclk_freq / ((sample_freq) *1E9)));
311+
312+
params.push_back({AQLPROFILE_SPM_PARAMETER_TYPE_BUFFER_SIZE, buffer_size * 1024});
313+
params.push_back({AQLPROFILE_SPM_PARAMETER_TYPE_SAMPLE_INTERVAL, sclk_period});
314+
params.push_back({AQLPROFILE_SPM_PARAMETER_TYPE_TIMEOUT, timeout});
315+
316+
for(const auto& metric : metrics)
317+
{
318+
auto query_info = get_query_info(agent_id, metric.block(), metric.name());
319+
320+
for(unsigned block_index = 0; block_index < query_info.instance_count; ++block_index)
321+
{
322+
auto event = aqlprofile_pmc_event_t{
323+
.block_index = block_index,
324+
.event_id =
325+
static_cast<uint32_t>(std::stoul(metric.event().c_str(), nullptr) & 0xFFFFFFFF),
326+
.flags = aqlprofile_pmc_event_flags_t{metric.flags()},
327+
.block_name = static_cast<hsa_ven_amd_aqlprofile_block_name_t>(query_info.id)};
328+
329+
events.push_back(event);
330+
id_map.push_back({rocprofiler_counter_id_t{.handle = metric.id()}, block_index});
331+
}
332+
}
333+
334+
aqlprofile_spm_profile_t profile{.aql_agent = *aql_agent,
335+
.hsa_agent = pool->gpu_agent,
336+
.events = events.data(),
337+
.event_count = events.size(),
338+
.parameters = params.data(),
339+
.parameter_count = params.size(),
340+
.reserved = 0,
341+
.alloc_cb = &(hsa::SPMMemoryPool::Alloc),
342+
.dealloc_cb = &(hsa::SPMMemoryPool::Free),
343+
.memcpy_cb = &(hsa::SPMMemoryPool::Copy),
344+
.userdata = pool.get()};
345+
346+
auto pkt = std::make_unique<hsa::SPMPacket>(*aql_agent, profile);
347+
ROCP_FATAL_IF(!pkt->valid()) << "SPM Packet creation failed";
348+
349+
pool->delete_packets_fn = pkt->sym.delete_packets_fn;
350+
pool->handle = pkt->handle;
351+
pkt->pool = std::move(pool);
352+
353+
pkt->spm_desc.size =
354+
sizeof(spm::spm_desc_v0_t) + id_map.size() * sizeof(id_map[0]) + pkt->aql_desc.size;
355+
356+
pkt->container_desc_data = std::make_shared<std::vector<char>>(pkt->spm_desc.size);
357+
pkt->spm_desc.data = pkt->container_desc_data->data();
358+
359+
auto* desc = static_cast<spm::spm_desc_v0_t*>(pkt->spm_desc.data);
360+
361+
*desc = spm::spm_desc_v0_t{};
362+
desc->aql_desc_size = pkt->aql_desc.size;
363+
desc->num_events = id_map.size();
364+
365+
std::memcpy(desc->aqlprofile_desc(), pkt->aql_desc.data, pkt->aql_desc.size);
366+
std::memcpy(desc->events(), id_map.data(), id_map.size() * sizeof(id_map[0]));
367+
368+
pkt->clear();
369+
return pkt;
370+
}
371+
372+
// Following the PMC check for now
373+
// ToDO: change this to SPM
374+
rocprofiler_status_t
375+
spm_can_collect(const rocprofiler_agent_id_t agent_id, const std::vector<counters::Metric>& metrics)
376+
{
377+
// Verify that the counters fit within harrdware limits
378+
auto counter_count =
379+
std::map<std::pair<hsa_ven_amd_aqlprofile_block_name_t, uint32_t>, int64_t>{};
380+
auto max_allowed =
381+
std::map<std::pair<hsa_ven_amd_aqlprofile_block_name_t, uint32_t>, int64_t>{};
382+
auto _metrics = std::vector<AQLProfileMetric>{};
383+
384+
for(const auto& metric : metrics)
385+
{
386+
auto query_info = get_query_info(agent_id, metric.block(), metric.name());
387+
_metrics.emplace_back().metric = metric;
388+
389+
auto event_id =
390+
static_cast<uint32_t>(std::stoul(metric.event().c_str(), nullptr) & 0xFFFFFFFF);
391+
392+
for(unsigned block_index = 0; block_index < query_info.instance_count; ++block_index)
393+
{
394+
_metrics.back().instances.push_back(
395+
{.block_index = block_index,
396+
.event_id = event_id,
397+
.flags = aqlprofile_pmc_event_flags_t{metric.flags()},
398+
.block_name = static_cast<hsa_ven_amd_aqlprofile_block_name_t>(query_info.id)});
399+
400+
_metrics.back().events.push_back(
401+
{.block_index = block_index,
402+
.event_id = event_id,
403+
.flags = aqlprofile_pmc_event_flags_t{metric.flags()},
404+
.block_name = static_cast<hsa_ven_amd_aqlprofile_block_name_t>(query_info.id)});
405+
}
406+
}
407+
408+
for(auto& metric : _metrics)
409+
{
410+
for(auto& instance : metric.events)
411+
{
412+
auto block_pair = std::make_pair(instance.block_name, instance.block_index);
413+
auto [iter, inserted] = counter_count.emplace(block_pair, 0);
414+
iter->second++;
415+
if(inserted)
416+
{
417+
max_allowed.emplace(block_pair, get_block_counters(agent_id, instance));
418+
}
419+
}
420+
}
421+
422+
// Check if the block count > max count
423+
for(auto& [block_name, count] : counter_count)
424+
{
425+
if(auto* max = CHECK_NOTNULL(common::get_val(max_allowed, block_name)); count > *max)
426+
{
427+
return ROCPROFILER_STATUS_ERROR_EXCEEDS_HW_LIMIT;
428+
}
429+
}
430+
return ROCPROFILER_STATUS_SUCCESS;
431+
}
432+
281433
} // namespace aql
282434
} // namespace rocprofiler

projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/aql/packet_construct.hpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
#include "lib/rocprofiler-sdk/aql/helpers.hpp"
2727
#include "lib/rocprofiler-sdk/counters/metrics.hpp"
2828
#include "lib/rocprofiler-sdk/hsa/agent_cache.hpp"
29+
#include "lib/rocprofiler-sdk/spm/decode.hpp"
2930
#include "lib/rocprofiler-sdk/thread_trace/core.hpp"
3031

3132
#include <rocprofiler-sdk/fwd.h>
@@ -126,5 +127,16 @@ class ThreadTraceAQLPacketFactory
126127
hsa::TraceMemoryPool tracepool;
127128
};
128129

130+
/**
 * @brief Builds the SPM AQL packet for an agent and a set of counters.
 *
 * @param agent_id    Agent the packet is built for.
 * @param metrics     Counters to collect; each is expanded to one event per block instance.
 * @param sample_freq Sampling frequency used to derive the SPM sample interval.
 * @param buffer_size SPM buffer size; multiplied by 1024 by the implementation
 *                    (presumably KiB -- confirm against the caller).
 * @param timeout     Forwarded as the SPM timeout parameter.
 * @return The constructed packet; the implementation aborts if creation fails.
 */
std::unique_ptr<hsa::SPMPacket>
131+
spm_construct_packet(const rocprofiler_agent_id_t agent_id,
132+
const std::vector<counters::Metric>& metrics,
133+
double sample_freq,
134+
uint64_t buffer_size,
135+
uint64_t timeout);
136+
137+
/**
 * @brief Checks whether the requested counters fit within hardware limits.
 *
 * @param agent_id Agent the counters would be collected on.
 * @param metrics  Counters to validate.
 * @return ROCPROFILER_STATUS_SUCCESS, or ROCPROFILER_STATUS_ERROR_EXCEEDS_HW_LIMIT
 *         when a block instance is asked for more events than it has counters.
 */
rocprofiler_status_t
138+
spm_can_collect(const rocprofiler_agent_id_t agent_id,
139+
const std::vector<counters::Metric>& metrics);
140+
129141
} // namespace aql
130142
} // namespace rocprofiler

projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/aql_packet.cpp

Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@
2121
// THE SOFTWARE.
2222

2323
#include "lib/rocprofiler-sdk/hsa/aql_packet.hpp"
24+
#include "lib/rocprofiler-sdk/hsa/agent_cache.hpp"
25+
2426
#include <fmt/core.h>
2527
#include <cstdlib>
2628
#include <iostream>
@@ -242,5 +244,136 @@ CodeobjMarkerAQLPacket::CodeobjMarkerAQLPacket(const TraceMemoryPool& _tracepool
242244
clear();
243245
}
244246

247+
/**
 * @brief Captures the HSA entry points and memory pools used by the SPM callbacks.
 *
 * @param agent   Agent cache providing the HSA agent and its CPU/GPU/kernarg pools.
 * @param ext     AMD extension table supplying the allocate/free/fill/allow-access calls.
 * @param copy_fn Memory copy routine used by Copy().
 */
SPMMemoryPool::SPMMemoryPool(const AgentCache& agent, const AmdExtTable& ext, copy_fn_t copy_fn)
{
    // Agent and the pools it exposes.
    this->gpu_agent     = agent.get_hsa_agent();
    this->cpu_pool_     = agent.cpu_pool();
    this->gpu_pool_     = agent.gpu_pool();
    this->kernarg_pool_ = agent.kernarg_pool();

    // HSA function pointers used by Alloc(), Free() and Copy().
    this->allocate_fn     = ext.hsa_amd_memory_pool_allocate_fn;
    this->free_fn         = ext.hsa_amd_memory_pool_free_fn;
    this->fill_fn         = ext.hsa_amd_memory_fill_fn;
    this->allow_access_fn = ext.hsa_amd_agents_allow_access_fn;
    this->api_copy_fn     = copy_fn;
}
260+
261+
void
262+
SPMMemoryPool::Free(void* ptr, void* data)
263+
{
264+
if(ptr == nullptr) return;
265+
auto* pool = reinterpret_cast<SPMMemoryPool*>(data);
266+
267+
ROCP_FATAL_IF(!pool || !pool->free_fn) << "Unable to deallocate from HSA memory pool";
268+
pool->free_fn(ptr);
269+
}
270+
271+
/**
 * @brief Copy callback: forwards to the pool's memory copy function.
 *
 * @param dst  Destination address.
 * @param src  Source address.
 * @param size Number of bytes to copy; zero is a successful no-op.
 * @param data The SPMMemoryPool passed as callback userdata.
 * @return HSA_STATUS_SUCCESS for an empty copy, otherwise the copy function's status.
 */
hsa_status_t
SPMMemoryPool::Copy(void* dst, const void* src, size_t size, void* data)
{
    if(size != 0)
    {
        auto* self = static_cast<SPMMemoryPool*>(data);
        ROCP_FATAL_IF(self == nullptr || !self->api_copy_fn) << "Unable to copy HSA memory";

        return self->api_copy_fn(dst, src, size);
    }

    return HSA_STATUS_SUCCESS;
}
280+
281+
/**
 * @brief Allocation callback: allocates, shares with the GPU agent and zero-fills a buffer.
 *
 * Host-accessible requests are served from the CPU pool, all others from the kernarg
 * pool. If granting access or filling fails, the buffer is freed again and *ptr is
 * reset so no allocation is leaked.
 *
 * @param ptr   Receives the allocated address; set to nullptr for a zero-size request.
 * @param size  Requested size in bytes.
 * @param flags Buffer flags; only host_access is consulted.
 * @param data  The SPMMemoryPool passed as callback userdata.
 * @return HSA_STATUS_SUCCESS on success (including size == 0), HSA_STATUS_ERROR for
 *         invalid arguments or an incomplete pool, otherwise the failing HSA status.
 */
hsa_status_t
SPMMemoryPool::Alloc(void** ptr, size_t size, aqlprofile_buffer_desc_flags_t flags, void* data)
{
    if(size == 0)
    {
        if(ptr != nullptr) *ptr = nullptr;
        return HSA_STATUS_SUCCESS;
    }
    if(ptr == nullptr || data == nullptr) return HSA_STATUS_ERROR;

    auto& pool = *static_cast<SPMMemoryPool*>(data);
    // Every function used below (including fill_fn) must be present.
    if(!pool.allocate_fn || !pool.free_fn || !pool.allow_access_fn || !pool.fill_fn)
        return HSA_STATUS_ERROR;

    const auto& source_pool = flags.host_access ? pool.cpu_pool_ : pool.kernarg_pool_;

    auto status = pool.allocate_fn(source_pool, size, hsa_amd_memory_pool_executable_flag, ptr);
    if(status != HSA_STATUS_SUCCESS) return status;

    status = pool.allow_access_fn(1, &pool.gpu_agent, nullptr, *ptr);

    // The fill count is in 32-bit words.
    // NOTE(review): a size that is not a multiple of 4 leaves its trailing bytes unfilled.
    if(status == HSA_STATUS_SUCCESS) status = pool.fill_fn(*ptr, 0u, size / sizeof(uint32_t));

    if(status != HSA_STATUS_SUCCESS)
    {
        // Do not hand back (or leak) a buffer that could not be fully prepared.
        pool.free_fn(*ptr);
        *ptr = nullptr;
    }

    return status;
}
308+
309+
/**
 * @brief Creates the SPM start/stop packets and queries the decode parameters.
 *
 * The packet is only marked valid (and non-empty) when the aqlprofile symbols are
 * available, packet creation succeeds, and both descriptor queries succeed. On any
 * failure the constructor returns early and valid() stays false.
 *
 * @param aql_agent aqlprofile agent handle the packet belongs to.
 * @param profile   SPM profile (events, parameters, memory callbacks) to build from.
 */
SPMPacket::SPMPacket(aqlprofile_agent_handle_t aql_agent, aqlprofile_spm_profile_t profile)
: agent(aql_agent)
, sym()
{
    if(!sym.valid()) return;

    auto status = sym.create_packets_fn(&handle, &aql_desc, &packets, profile, 0);

    // Any failure (not only an invalid agent) leaves handle/aql_desc/packets unusable.
    if(status != HSA_STATUS_SUCCESS) return;

    packets.start_packet.header            = VENDOR_BIT | BARRIER_BIT;
    packets.stop_packet.header             = VENDOR_BIT | BARRIER_BIT;
    packets.start_packet.completion_signal = hsa_signal_t{.handle = 0};
    packets.stop_packet.completion_signal  = hsa_signal_t{.handle = 0};

    // Segment size and XCC count are stored in the SPM descriptor.
    status = sym.spm_query_fn(aql_desc, AQLPROFILE_SPM_DECODE_QUERY_SEG_SIZE, &spm_desc.seg_size);
    if(status != HSA_STATUS_SUCCESS) return;

    status = sym.spm_query_fn(aql_desc, AQLPROFILE_SPM_DECODE_QUERY_NUM_XCC, &spm_desc.buffer_num);
    if(status != HSA_STATUS_SUCCESS) return;

    is_valid = true;
    empty    = false;
}
331+
332+
void
333+
SPMPacket::populate_before()
334+
{
335+
hsa_barrier_and_packet_t barrier{};
336+
barrier.header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE;
337+
barrier.header |= BARRIER_BIT;
338+
339+
before_krn_barrier_pkt.push_back(barrier);
340+
before_krn_barrier_pkt.push_back(barrier);
341+
before_krn_pkt.push_back(packets.start_packet);
342+
};
343+
344+
void
345+
SPMPacket::populate_after()
346+
{
347+
after_krn_pkt.push_back(packets.stop_packet);
348+
};
349+
350+
void
351+
SPMPacket::kfd_start()
352+
{
353+
ROCP_FATAL_IF(!handle.handle) << "Attempt at starting SPM with unitialized packet!";
354+
355+
if(running.exchange(true))
356+
{
357+
ROCP_ERROR << "Double call to KFD start!";
358+
return;
359+
}
360+
361+
auto status = sym.spm_start_fn(this->handle, spm::aql_data_callback, this);
362+
ROCP_FATAL_IF(status != HSA_STATUS_SUCCESS) << "Unable to acquire KFD thread";
363+
}
364+
365+
void
366+
SPMPacket::kfd_stop()
367+
{
368+
if(running.exchange(false))
369+
sym.spm_stop_fn(this->handle);
370+
else
371+
ROCP_WARNING << "Double call to KFD stop!";
372+
}
373+
374+
/**
 * @brief Stops a still-running SPM collection before the packet is destroyed.
 */
SPMPacket::~SPMPacket()
{
    // Clear the flag first; nothing to do if collection was not running.
    const bool was_running = running.exchange(false);
    if(!was_running) return;

    if(sym.valid()) sym.spm_stop_fn(this->handle);
}
245378
} // namespace hsa
246379
} // namespace rocprofiler

0 commit comments

Comments
 (0)