Skip to content

Commit 926ec4a

Browse files
Baraldi, Giovanni, ApoKalipse-V, and Copilot
authored
Adding timestamp marker into SQTT buffer for gfx9 GPUs (#200)
* Adding TS Marker * Apply suggestions from code review Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Add option to enable/disable RT --------- Co-authored-by: Giovanni Baraldi <gbaraldi@amd.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
1 parent 51c5343 commit 926ec4a

File tree

7 files changed

+166
-24
lines changed

7 files changed

+166
-24
lines changed

src/core/aql_profile.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -787,7 +787,7 @@ PUBLIC_API hsa_status_t hsa_ven_amd_aqlprofile_att_marker(
787787
pm4_builder::CmdBuffer commands;
788788

789789
// Generate start commands
790-
auto status = sqtt_builder->InsertMarker(&commands, data, channel);
790+
auto status = sqtt_builder->InsertCodeobjMarker(&commands, data, channel);
791791
if (status != HSA_STATUS_SUCCESS) return status;
792792
aql_profile::descriptor_t& cmdbuffer = profile->command_buffer;
793793

src/core/include/aqlprofile-sdk/aql_profile_v2.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -248,12 +248,20 @@ typedef enum {
248248
hsa_status_t aqlprofile_get_pmc_info(const aqlprofile_pmc_profile_t* profile,
249249
aqlprofile_pmc_info_type_t attribute, void* value);
250250

251+
/**
 * Values accepted by the AQLPROFILE_ATT_PARAMETER_NAME_RT_TIMESTAMP parameter.
 * In the parameter handling shown in this commit (threadtrace.cpp), only
 * AQLPROFILE_ATT_PARAMETER_RT_TIMESTAMP_DISABLE turns the timestamp marker
 * off; any other value, including DEFAULT, leaves it enabled.
 */
typedef enum aqlprofile_att_parameter_rt_timestamp_t
252+
{
253+
// Zero value; treated the same as ENABLE by the handling in this commit.
AQLPROFILE_ATT_PARAMETER_RT_TIMESTAMP_DEFAULT = 0,
254+
// Explicitly request the timestamp marker.
AQLPROFILE_ATT_PARAMETER_RT_TIMESTAMP_ENABLE,
255+
// Suppress the timestamp marker.
AQLPROFILE_ATT_PARAMETER_RT_TIMESTAMP_DISABLE
256+
} aqlprofile_att_parameter_rt_timestamp_t;
257+
251258
// Extended ATT parameter names. Per the comment on the first enumerator, the
// numbering continues after HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_ATT_BUFFER_SIZE.
typedef enum aqlprofile_att_parameter_name_ext_t
252259
{
253260
/**
254261
* HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_ATT_BUFFER_SIZE + 1
255262
*/
256263
AQLPROFILE_ATT_PARAMETER_NAME_BUFFER_SIZE_HIGH = 11,
264+
// No explicit value, so this enumerator is 12 (previous enumerator + 1).
AQLPROFILE_ATT_PARAMETER_NAME_RT_TIMESTAMP, // one of aqlprofile_att_parameter_rt_timestamp_t
257265
} aqlprofile_att_parameter_name_ext_t;
258266

259267
// Profile parameter object

src/core/threadtrace.cpp

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ typedef union {
7676

7777
inline att_header_packet_t getHeaderPacket(int SE, int CU, int SIMD) {
7878
att_header_packet_t header{.raw = 0};
79-
header.legacy_version = 0x11; // The thread trace viewer only sees gfx9 for 0x11
79+
header.legacy_version = 0x11;
8080
header.gfx9_version2 = 4;
8181
header.SEID = SE;
8282
header.DCU = CU;
@@ -126,7 +126,6 @@ hsa_status_t _internal_aqlprofile_att_iterate_data(aqlprofile_handle_t handle,
126126
size_t wptr_mask = sqttbuilder->GetWritePtrMask();
127127
size_t sample_size = (control_ptr[se_index].wptr & wptr_mask) * sqttbuilder->GetWritePtrBlk();
128128

129-
// GFX11 hardware bug workaround
130129
if (pm4_factory->GetGpuId() == aql_profile::GFX11_GPU_ID) {
131130
sample_size = sample_size - reinterpret_cast<uint64_t>(sample_ptr);
132131
sample_size &= (1ull << 29) - 1;
@@ -187,7 +186,8 @@ hsa_status_t _internal_aqlprofile_att_create_packets(
187186
trace_config.vmIdMask = 0;
188187
trace_config.simd_sel = 0xF;
189188
trace_config.perfMASK = ~0u;
190-
trace_config.se_mask = 0x11111111;
189+
trace_config.se_mask = 0x11;
190+
trace_config.enable_rt_timestamp = true;
191191

192192
const size_t se_number_total = pm4_factory->GetShaderEnginesNumber();
193193
uint64_t buffer_size = DEFAULT_TRACE_BUFFER_SIZE;
@@ -216,6 +216,9 @@ hsa_status_t _internal_aqlprofile_att_create_packets(
216216
case AQLPROFILE_ATT_PARAMETER_NAME_BUFFER_SIZE_HIGH:
217217
buffer_size = (buffer_size & UINT32_MAX) | (uint64_t(p->value) << 32); // High 32 bits
218218
break;
219+
case AQLPROFILE_ATT_PARAMETER_NAME_RT_TIMESTAMP:
220+
trace_config.enable_rt_timestamp = p->value != static_cast<uint32_t>(AQLPROFILE_ATT_PARAMETER_RT_TIMESTAMP_DISABLE);
221+
break;
219222
case HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_PERFCOUNTER_MASK:
220223
trace_config.perfMASK = p->value;
221224
break;
@@ -275,7 +278,7 @@ hsa_status_t _internal_aqlprofile_att_codeobj_marker(
275278
hsa_ext_amd_aql_pm4_packet_t* packet, aqlprofile_handle_t* handle,
276279
aqlprofile_att_codeobj_data_t data, aqlprofile_memory_alloc_callback_t alloc_cb,
277280
aqlprofile_memory_dealloc_callback_t dealloc_cb, void* userdata) {
278-
static auto* mut = new std::shared_mutex{};
281+
static auto mut = new std::shared_mutex{};
279282
static auto* factory_cache = new std::map<uint64_t, aql_profile::Pm4Factory*>{};
280283

281284
auto _slk = std::shared_lock{*mut};
@@ -295,23 +298,23 @@ hsa_status_t _internal_aqlprofile_att_codeobj_marker(
295298
pm4_builder::CmdBuffer commands;
296299

297300
if (!data.isUnload) {
298-
sqttbuilder->InsertMarker(&commands, uint32_t(data.addr), ATT_MARKER_ADDR_LO_CHANNEL);
299-
sqttbuilder->InsertMarker(&commands, data.addr >> 32, ATT_MARKER_ADDR_HI_CHANNEL);
300-
sqttbuilder->InsertMarker(&commands, uint32_t(data.size), ATT_MARKER_SIZE_LO_CHANNEL);
301-
sqttbuilder->InsertMarker(&commands, data.size >> 32, ATT_MARKER_SIZE_HI_CHANNEL);
301+
sqttbuilder->InsertCodeobjMarker(&commands, uint32_t(data.addr), ATT_MARKER_ADDR_LO_CHANNEL);
302+
sqttbuilder->InsertCodeobjMarker(&commands, data.addr >> 32, ATT_MARKER_ADDR_HI_CHANNEL);
303+
sqttbuilder->InsertCodeobjMarker(&commands, uint32_t(data.size), ATT_MARKER_SIZE_LO_CHANNEL);
304+
sqttbuilder->InsertCodeobjMarker(&commands, data.size >> 32, ATT_MARKER_SIZE_HI_CHANNEL);
302305
}
303306

304307
aqlprofile_att_header_marker_t header{};
305308
header.bFromStart = data.fromStart;
306309
header.isUnload = data.isUnload;
307310

308311
if (data.id >= (1 << 30)) {
309-
sqttbuilder->InsertMarker(&commands, uint32_t(data.id), ATT_MARKER_ID_LO_CHANNEL);
310-
sqttbuilder->InsertMarker(&commands, data.id >> 32, ATT_MARKER_ID_HI_CHANNEL);
312+
sqttbuilder->InsertCodeobjMarker(&commands, uint32_t(data.id), ATT_MARKER_ID_LO_CHANNEL);
313+
sqttbuilder->InsertCodeobjMarker(&commands, data.id >> 32, ATT_MARKER_ID_HI_CHANNEL);
311314
} else
312315
header.legacy_id = data.id;
313316

314-
sqttbuilder->InsertMarker(&commands, header.raw, ATT_MARKER_HEADER_CHANNEL);
317+
sqttbuilder->InsertCodeobjMarker(&commands, header.raw, ATT_MARKER_HEADER_CHANNEL);
315318

316319
auto memorymgr = std::make_shared<CodeobjMemoryManager>(data.agent, alloc_cb, dealloc_cb,
317320
commands.Size(), userdata);

src/pm4/cmd_builder.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -211,6 +211,13 @@ class CmdBuilder {
211211
/// @param cmdBuf command buffer to be appended with launch command
212212
virtual void BuildPrimeL2(CmdBuffer* cmdBuf, uint64_t addr) = 0;
213213

214+
/// @brief Generates RT (timestamp) packets into thread trace buffer (gfx9 only).
/// The default implementation here is a no-op; Gfx9CmdBuilder overrides it.
215+
/// @param cmdBuf command buffer to be appended with the GPU clock packets
216+
/// @param dst where gpu clock data is r/w. Must persist during packet dispatch
217+
/// @param reg userdata register address
218+
/// @param header SQTT packet header, written to the userdata register before the clock data
219+
virtual void BuildGPUClockPacket(CmdBuffer* cmdBuf, uint64_t* dst, const Register& reg, uint32_t header) {};
220+
214221
/// @brief Release resources used by CmdBuilder
215222
virtual ~CmdBuilder(){};
216223

src/pm4/gfx9_cmd_builder.h

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -446,6 +446,83 @@ class Gfx9CmdBuilder : public CmdBuilder {
446446
uint32_t size, bool wait) {
447447
BuildCopyRegDataPacket(cmd, get_addr(reg), dst_addr, size, wait);
448448
}
449+
450+
/// @brief Builds a 6-dword COPY_DATA packet whose source is the GPU clock
/// counter and whose destination is memory at `dst` (64 bits of data, with
/// write confirmation requested).
/// @param dst destination address for the 64-bit clock value
/// @return the six packet dwords: header, control, two zero dwords, dst lo, dst hi
std::array<uint32_t, 6> ClockRetrievePacket(uint64_t* dst)
451+
{
452+
auto addr = reinterpret_cast<uint64_t>(dst);
453+
454+
uint32_t header = MakePacket3Header(PACKET3_COPY_DATA, 6 * sizeof(uint32_t));
455+
456+
// Control dword: GPU_CLOCK_COUNT -> MEMORY, 64-bit count, wait for confirmation.
uint32_t dword2 =
457+
PACKET3_COPY_DATA__SRC_SEL(PACKET3_COPY_DATA__SRC_SEL__GPU_CLOCK_COUNT) |
458+
PACKET3_COPY_DATA__SRC_CACHE_POLICY(PACKET3_COPY_DATA__SRC_CACHE_POLICY__STREAM) |
459+
PACKET3_COPY_DATA__DST_SEL(PACKET3_COPY_DATA__DST_SEL__MEMORY) |
460+
PACKET3_COPY_DATA__DST_CACHE_POLICY(PACKET3_COPY_DATA__DST_CACHE_POLICY__STREAM) |
461+
PACKET3_COPY_DATA__WR_CONFIRM(PACKET3_COPY_DATA__WR_CONFIRM__WAIT_FOR_CONFIRMATION) |
462+
PACKET3_COPY_DATA__COUNT_SEL(PACKET3_COPY_DATA__COUNT_SEL__64_BITS_OF_DATA);
463+
464+
// NOTE(review): the low address field takes addr >> 3, which presumably
// means dst must be 8-byte aligned -- confirm against the packet definition.
uint32_t dword5 = PACKET3_COPY_DATA__DST_64B_ADDR_LO(addr >> 3);
465+
uint32_t dword6 = PACKET3_COPY_DATA__DST_ADDR_HI(High32(addr));
466+
467+
// Source-address dwords are left at zero; the source here is the clock counter.
return {header, dword2, 0, 0, dword5, dword6};
468+
}
469+
470+
/// @brief Builds a 6-dword COPY_DATA packet that copies 32 bits of the GPU
/// clock counter directly into a memory-mapped register (write confirmation
/// requested).
/// @param userdata_addr destination register address, placed in the fifth dword
/// @return the six packet dwords
std::array<uint32_t, 6> UserdataLoPacket(uint32_t userdata_addr)
471+
{
472+
uint32_t header = MakePacket3Header(PACKET3_COPY_DATA, 6 * sizeof(uint32_t));
473+
474+
// Control dword: GPU_CLOCK_COUNT -> MEM_MAPPED_REGISTER, 32-bit count.
uint32_t dword2 =
475+
PACKET3_COPY_DATA__SRC_SEL(PACKET3_COPY_DATA__SRC_SEL__GPU_CLOCK_COUNT) |
476+
PACKET3_COPY_DATA__SRC_CACHE_POLICY(PACKET3_COPY_DATA__SRC_CACHE_POLICY__STREAM) |
477+
PACKET3_COPY_DATA__DST_SEL(PACKET3_COPY_DATA__DST_SEL__MEM_MAPPED_REGISTER) |
478+
PACKET3_COPY_DATA__DST_CACHE_POLICY(PACKET3_COPY_DATA__DST_CACHE_POLICY__STREAM) |
479+
PACKET3_COPY_DATA__WR_CONFIRM(PACKET3_COPY_DATA__WR_CONFIRM__WAIT_FOR_CONFIRMATION) |
480+
PACKET3_COPY_DATA__COUNT_SEL(PACKET3_COPY_DATA__COUNT_SEL__32_BITS_OF_DATA);
481+
482+
return {header, dword2, 0, 0, userdata_addr, 0};
483+
}
484+
485+
/// @brief Builds a 6-dword COPY_DATA packet that copies one 32-bit word from
/// memory at `addr` into a memory-mapped register, without waiting for write
/// confirmation.
/// @param userdata_addr destination register address, placed in the fifth dword
/// @param addr source memory address of the 32-bit word
/// @return the six packet dwords
std::array<uint32_t, 6> TraceDataMem32Packet(uint32_t userdata_addr, uint32_t* addr)
486+
{
487+
uint32_t header = MakePacket3Header(PACKET3_COPY_DATA, 6 * sizeof(uint32_t));
488+
// Control dword: MEMORY -> MEM_MAPPED_REGISTER, 32-bit count, no confirmation wait.
uint32_t dword2 = PACKET3_COPY_DATA__SRC_SEL(PACKET3_COPY_DATA__SRC_SEL__MEMORY) |
489+
PACKET3_COPY_DATA__SRC_CACHE_POLICY(PACKET3_COPY_DATA__SRC_CACHE_POLICY__STREAM) |
490+
PACKET3_COPY_DATA__DST_SEL(PACKET3_COPY_DATA__DST_SEL__MEM_MAPPED_REGISTER) |
491+
PACKET3_COPY_DATA__DST_CACHE_POLICY(PACKET3_COPY_DATA__DST_CACHE_POLICY__STREAM) |
492+
PACKET3_COPY_DATA__WR_CONFIRM(PACKET3_COPY_DATA__WR_CONFIRM__DO_NOT_WAIT_FOR_CONFIRMATION) |
493+
PACKET3_COPY_DATA__COUNT_SEL(PACKET3_COPY_DATA__COUNT_SEL__32_BITS_OF_DATA);
494+
// NOTE(review): the low source address is passed as PtrLow32(addr) >> 2,
// which presumably means addr must be 4-byte aligned -- confirm.
uint32_t dword3 = PACKET3_COPY_DATA__SRC_32B_ADDR_LO(PtrLow32(addr) >> 2);
495+
uint32_t dword4 = PACKET3_COPY_DATA__SRC_MEMTC_ADDR_HI(PtrHigh32(addr));
496+
497+
return {header, dword2, dword3, dword4, userdata_addr, 0};
498+
};
499+
500+
/// @brief Appends the command sequence for a timestamp marker: writes `header`
/// to the userdata register, captures the 64-bit GPU clock into `dst`, copies
/// the two 32-bit words at `dst` into the same register, then copies a fresh
/// 32-bit clock reading into it.
/// @param cmdBuf command buffer the packets are appended to
/// @param dst memory that receives the 64-bit clock and is then read back by
///            the following packets
/// @param userdata_addr register that receives the header and clock words
/// @param header first value written to the userdata register
void BuildGPUClockPacket(CmdBuffer* cmdBuf, uint64_t* dst, const Register& userdata_addr, uint32_t header) override
501+
{
502+
uint32_t addr = get_addr(userdata_addr);
503+
504+
// The marker header goes to the userdata register first.
BuildWriteUConfigRegPacket(cmdBuf, addr, header);
505+
// Copy to dst
506+
{
507+
auto copy_data = ClockRetrievePacket(dst);
508+
APPEND_COMMAND_WRAPPER(cmdBuf, copy_data);
509+
}
510+
// Copy low-bits to userdata
// (first 32-bit word at dst; the low half assuming little-endian layout)
511+
{
512+
auto copy_data = TraceDataMem32Packet(addr, (uint32_t*)dst);
513+
APPEND_COMMAND_WRAPPER(cmdBuf, copy_data);
514+
}
515+
// Copy hi-bits to userdata
// (second 32-bit word at dst)
516+
{
517+
auto copy_data = TraceDataMem32Packet(addr, (uint32_t*)dst + 1);
518+
APPEND_COMMAND_WRAPPER(cmdBuf, copy_data);
519+
}
520+
// Send instant clock
// (32 bits of the clock counter copied straight to the register, not via dst)
521+
{
522+
auto copy_data = UserdataLoPacket(addr);
523+
APPEND_COMMAND_WRAPPER(cmdBuf, copy_data);
524+
}
525+
}
449526
};
450527

451528
} // namespace pm4_builder

src/pm4/sqtt_builder.h

Lines changed: 57 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,9 @@ class CmdBuffer;
3838
class CmdBuilder;
3939

4040
// Opcode stored in att_decoder_packet_header_t::opcode by InsertCodeobjMarker.
constexpr size_t ATT_CODEOBJ_OPCODE = 4;
41+
// Opcode stored in att_decoder_packet_header_t::opcode by InsertTimestampMarker.
constexpr size_t ATT_TIMESTAMP_OPCODE = 5;
4142

42-
union att_decoder_codeobj_header_t {
43+
union att_decoder_packet_header_t {
4344
struct {
4445
unsigned int opcode : 8;
4546
unsigned int type : 4;
@@ -102,11 +103,14 @@ class XCC_Packet_Lock {
102103
// Thread traces status register indices to determine
103104
// status of thread trace run
104105

105-
struct TraceControl {
106-
uint32_t status;
107-
uint32_t cntr;
108-
uint32_t wptr;
109-
uint32_t _reserved;
106+
// One entry of the thread trace control buffer. All fields are
// zero-initialized by their default member initializers.
struct TraceControl
107+
{
108+
uint32_t status{0};
109+
uint32_t cntr{0};
110+
// Masked and scaled when the trace data is iterated to compute the sample size.
uint32_t wptr{0};
111+
uint32_t _reserved{0};
112+
// Destination passed to InsertTimestampMarker when the trace begins (gfx9).
uint64_t gpu_clock_cnt_start{0};
113+
// Destination passed to InsertTimestampMarker when the trace ends (gfx9).
uint64_t gpu_clock_cnt_end{0};
110114
};
111115

112116
// Encapsulates the various Api and structures that are used to enable
@@ -126,7 +130,9 @@ class SqttBuilder {
126130
virtual void End(CmdBuffer* cmd_buffer, TraceConfig* config) = 0;
127131
// Builds Pm4 command stream to program hardware registers that
128132
// inserts "data" into the SQTT buffer as USERDATA_2 (data_lo) and USERDATA_3 (data_hi)
129-
virtual hsa_status_t InsertMarker(CmdBuffer* cmd_buffer, uint32_t data, unsigned channel) = 0;
133+
virtual hsa_status_t InsertCodeobjMarker(CmdBuffer* cmd_buffer, uint32_t data, unsigned channel) = 0;
134+
135+
// Inserts a timestamp marker, using "addr" as the destination for the GPU
// clock value. The default does nothing; GpuSqttBuilder overrides it.
virtual void InsertTimestampMarker(CmdBuffer* cmd_buffer, uint64_t* addr) {};
130136

131137
// Returns TT_CONTROL_UTC_ERR_MASK
132138
virtual size_t GetUTCErrorMask() const = 0;
@@ -326,8 +332,6 @@ class GpuSqttBuilder : public SqttBuilder, protected Primitives {
326332
Primitives::sqtt_mode_on_value());
327333
base_addr += base_step;
328334
}
329-
// Reset the GRBM to broadcast mode
330-
SetGRBMToBroadcast(cmd_buffer);
331335
} else {
332336
SetGRBMToBroadcast(cmd_buffer);
333337
builder.BuildWritePConfigRegPacket(cmd_buffer, Primitives::SQ_THREAD_TRACE_STATUS_ADDR, 0);
@@ -401,16 +405,46 @@ class GpuSqttBuilder : public SqttBuilder, protected Primitives {
401405

402406
builder.BuildWriteUConfigRegPacket(cmd_buffer, userdata_channel, header.u32All);
403407
builder.BuildWriteUConfigRegPacket(cmd_buffer, userdata_channel, 524801);
408+
409+
if (Primitives::GFXIP_LEVEL == 9 && config->enable_rt_timestamp)
410+
{
411+
for (size_t xcc = 0; xcc < GetXCCNumber(); xcc++)
412+
{
413+
bool some_se_enabled = false;
414+
for (int se = 0; se < se_number_xcc; se++) some_se_enabled |=config->target_cu_per_se.at(se + xcc*se_number_xcc) >= 0;
415+
if (!some_se_enabled) continue;
416+
417+
XCC_Packet_Lock<Builder> lock(builder, cmd_buffer, GetXCCNumber(), xcc);
418+
auto& control = reinterpret_cast<TraceControl*>(config->control_buffer_ptr)[xcc];
419+
InsertTimestampMarker(cmd_buffer, &control.gpu_clock_cnt_start);
420+
}
421+
}
404422
}
405423

406424
void End(CmdBuffer* cmd_buffer, TraceConfig* config) override {
407425
SetGRBMToBroadcast(cmd_buffer);
408426
// Issue a CSPartialFlush cmd including cache flush
409427
builder.BuildWriteWaitIdlePacket(cmd_buffer);
410428

411-
if (Primitives::GFXIP_LEVEL == 9) {
429+
if (Primitives::GFXIP_LEVEL == 9)
430+
{
412431
const uint32_t se_number_xcc = se_number_total / std::max(1u, GetXCCNumber());
413432

433+
if (config->enable_rt_timestamp)
434+
{
435+
for (size_t xcc = 0; xcc < GetXCCNumber(); xcc++)
436+
{
437+
bool some_se_enabled = false;
438+
for (int se = 0; se < se_number_xcc; se++) some_se_enabled |=config->target_cu_per_se.at(se + xcc*se_number_xcc) >= 0;
439+
if (!some_se_enabled) continue;
440+
441+
XCC_Packet_Lock<Builder> lock(builder, cmd_buffer, GetXCCNumber(), xcc);
442+
auto& control = reinterpret_cast<TraceControl*>(config->control_buffer_ptr)[xcc];
443+
InsertTimestampMarker(cmd_buffer, &control.gpu_clock_cnt_end);
444+
}
445+
builder.BuildWriteWaitIdlePacket(cmd_buffer);
446+
}
447+
414448
// Program the thread trace mode register to disable thread trace
415449
builder.BuildWriteUConfigRegPacket(cmd_buffer, Primitives::SQ_THREAD_TRACE_MODE_ADDR,
416450
Primitives::sqtt_mode_off_value());
@@ -527,9 +561,9 @@ class GpuSqttBuilder : public SqttBuilder, protected Primitives {
527561
return uint64_t(buffer_per_se) & ~((1 << Primitives::TT_BUFF_ALIGN_SHIFT) - 1);
528562
}
529563

530-
virtual hsa_status_t InsertMarker(CmdBuffer* cmd_buffer, uint32_t data,
564+
virtual hsa_status_t InsertCodeobjMarker(CmdBuffer* cmd_buffer, uint32_t data,
531565
unsigned channel) override {
532-
att_decoder_codeobj_header_t header{};
566+
att_decoder_packet_header_t header{};
533567
header.opcode = ATT_CODEOBJ_OPCODE;
534568
header.type = channel;
535569
header.reserved = 0;
@@ -540,6 +574,17 @@ class GpuSqttBuilder : public SqttBuilder, protected Primitives {
540574
builder.BuildWriteUConfigRegPacket(cmd_buffer, userdata_channel, data);
541575
return HSA_STATUS_SUCCESS;
542576
}
577+
578+
// Emits a timestamp marker: builds a packet header with opcode
// ATT_TIMESTAMP_OPCODE, sets GRBM to broadcast, then asks the command builder
// for the GPU clock packet sequence targeting SQ_THREAD_TRACE_USERDATA_3.
// "addr" is forwarded as the clock destination.
virtual void InsertTimestampMarker(CmdBuffer* cmd_buffer, uint64_t* addr) override
579+
{
580+
att_decoder_packet_header_t header{};
581+
header.opcode = ATT_TIMESTAMP_OPCODE;
582+
header.type = 0;
583+
header.reserved = 0;
584+
585+
SetGRBMToBroadcast(cmd_buffer);
586+
// With the base CmdBuilder implementation this call is a no-op; only
// Gfx9CmdBuilder overrides it in this commit.
builder.BuildGPUClockPacket(cmd_buffer, addr, Primitives::SQ_THREAD_TRACE_USERDATA_3, header.u32All);
587+
}
543588

544589
template <typename T>
545590
void WriteConfigPacket(CmdBuffer* cmdbuf, const T& reg, uint32_t value) {

src/pm4/trace_config.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,8 @@ struct TraceConfig {
6666
std::unordered_map<int, int> target_cu_per_se{};
6767
std::unordered_map<int, uint64_t> se_base_addresses{};
6868

69+
bool enable_rt_timestamp{false};
70+
6971
int GetTargetCU(int SE) const { return target_cu_per_se.at(SE); };
7072
uint64_t GetSEmask() const { return se_mask; };
7173
uint64_t GetSEBaseAddr(int SE) const { return se_base_addresses.at(SE); }

0 commit comments

Comments
 (0)