Skip to content

Commit 171a5f5

Browse files
bing-ma; Baraldi, Giovanni; ApoKalipse-V
authored
[aqlprofile] Enable SPM support for MI200/MI300 (#1768)
* [SPM] Enable legacy SPM aqlprofile API
* [SPM] Enable SPM aqlprofile_v2 API
* [NPI][SPM] Fix crash from ctrl test
* Adding decode v1 (#189)
  Co-authored-by: Giovanni baraldi <gbaraldi@amd.com>
* Fix various issues on MI200
  1. RLC_SPM_PERFMON_SEGMENT_SIZE_CORE1 support
  2. ActiveCU patch for SPM delay table
* [SPM] Fix wrong SPM counter values on MI3xx
* Add mode and query blocks (#196)
  Co-authored-by: Giovanni baraldi <gbaraldi@amd.com>
* [aqlprofile][spm] Use existing SpmBlockId enum info for delay table size
* [aqlprofile][spm] Remove obsolete logic
* Update projects/aqlprofile/src/core/include/aqlprofile-sdk/aql_profile_v2.h
---------
Co-authored-by: Baraldi, Giovanni <Giovanni.Baraldi@amd.com>
Co-authored-by: Giovanni baraldi <gbaraldi@amd.com>
1 parent 9efd330 commit 171a5f5

File tree

18 files changed

+1621
-61
lines changed

18 files changed

+1621
-61
lines changed

projects/aqlprofile/gfxip/gfx9/gfx9_block_info.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,7 @@ enum SpmGlobalBlockId {
9292
SPM_GLOBAL_BLOCK_NAME_TCA = 5,
9393
SPM_GLOBAL_BLOCK_NAME_IA = 6,
9494
SPM_GLOBAL_BLOCK_NAME_TCS = 7,
95+
SPM_GLOBAL_BLOCK_NAME_LAST = SPM_GLOBAL_BLOCK_NAME_TCS,
9596
};
9697

9798
enum SpmSeBlockId {
@@ -106,6 +107,7 @@ enum SpmSeBlockId {
106107
SPM_SE_BLOCK_NAME_SPI = 8,
107108
SPM_SE_BLOCK_NAME_SQG = 9,
108109
SPM_SE_BLOCK_NAME_VGT = 10,
110+
SPM_SE_BLOCK_NAME_LAST = SPM_SE_BLOCK_NAME_VGT,
109111
};
110112

111113
// Number of block instances

projects/aqlprofile/gfxip/gfx9/gfx9_primitives.h

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -125,12 +125,8 @@ class gfx9_cntx_prim {
125125
REG_32B_ADDR(GC, 0, regRLC_SPM_PERFMON_RING_SIZE);
126126
static constexpr Register RLC_SPM_PERFMON_SEGMENT_SIZE__ADDR =
127127
REG_32B_ADDR(GC, 0, regRLC_SPM_PERFMON_SEGMENT_SIZE);
128-
#if defined(regRLC_SPM_PERFMON_SEGMENT_SIZE_CORE1)
129128
static constexpr Register RLC_SPM_PERFMON_SEGMENT_SIZE_CORE1__ADDR =
130129
REG_32B_ADDR(GC, 0, regRLC_SPM_PERFMON_SEGMENT_SIZE_CORE1);
131-
#else
132-
static constexpr Register RLC_SPM_PERFMON_SEGMENT_SIZE_CORE1__ADDR = Register(0xDCAF);
133-
#endif
134130
static constexpr Register RLC_SPM_GLOBAL_MUXSEL_ADDR__ADDR =
135131
REG_32B_ADDR(GC, 0, regRLC_SPM_GLOBAL_MUXSEL_ADDR);
136132
static constexpr Register RLC_SPM_GLOBAL_MUXSEL_DATA__ADDR =
@@ -514,8 +510,10 @@ class gfx9_cntx_prim {
514510
}
515511

516512
static uint32_t rlc_spm_perfmon_cntl_value(const uint32_t& sampling_rate) {
513+
const uint32_t ring_mode = 3; // Stall and send Interrupt
517514
uint32_t rlc_spm_perfmon_cntl =
518-
SET_REG_FIELD_BITS(RLC_SPM_PERFMON_CNTL, PERFMON_SAMPLE_INTERVAL, sampling_rate);
515+
SET_REG_FIELD_BITS(RLC_SPM_PERFMON_CNTL, PERFMON_SAMPLE_INTERVAL, sampling_rate) |
516+
SET_REG_FIELD_BITS(RLC_SPM_PERFMON_CNTL, PERFMON_RING_MODE, ring_mode);
519517
return rlc_spm_perfmon_cntl;
520518
}
521519
static uint32_t rlc_spm_perfmon_segment_size_value(const uint32_t& global_count,
@@ -535,16 +533,13 @@ class gfx9_cntx_prim {
535533
static uint32_t rlc_spm_perfmon_segment_size_core1_value(const uint32_t& se_count) {
536534
const uint32_t se_nlines = se_count;
537535
const uint32_t segment_size = 4 * se_nlines;
538-
uint32_t rlc_spm_perfmon_segment_size_core1{0};
539-
#if defined(RLC_SPM_PERFMON_SEGMENT_SIZE_CORE1__PERFMON_SEGMENT_SIZE_CORE1__SHIFT)
540-
rlc_spm_perfmon_segment_size_core1 =
536+
uint32_t rlc_spm_perfmon_segment_size_core1 =
541537
SET_REG_FIELD_BITS(RLC_SPM_PERFMON_SEGMENT_SIZE_CORE1, PERFMON_SEGMENT_SIZE_CORE1,
542538
segment_size) |
543539
SET_REG_FIELD_BITS(RLC_SPM_PERFMON_SEGMENT_SIZE_CORE1, SE4_NUM_LINE, se_nlines) |
544540
SET_REG_FIELD_BITS(RLC_SPM_PERFMON_SEGMENT_SIZE_CORE1, SE5_NUM_LINE, se_nlines) |
545541
SET_REG_FIELD_BITS(RLC_SPM_PERFMON_SEGMENT_SIZE_CORE1, SE6_NUM_LINE, se_nlines) |
546542
SET_REG_FIELD_BITS(RLC_SPM_PERFMON_SEGMENT_SIZE_CORE1, SE7_NUM_LINE, se_nlines);
547-
#endif
548543
return rlc_spm_perfmon_segment_size_core1;
549544
}
550545

projects/aqlprofile/src/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ set ( LIB_SRC
77
${LIB_DIR}/core/counters.cpp
88
${LIB_DIR}/core/threadtrace.cpp
99
${LIB_DIR}/core/spm_data.cpp
10+
${LIB_DIR}/core/spm_decode.cpp
11+
${LIB_DIR}/core/spm_v2.cpp
1012
${LIB_DIR}/core/populate_aql.cpp
1113
${LIB_DIR}/core/memorymanager.cpp
1214
${LIB_DIR}/core/pm4_factory.cpp

projects/aqlprofile/src/core/gfx908_factory.cpp

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,59 @@ namespace aql_profile {
3030

3131
const GpuBlockInfo* Mi100Factory::block_table_[AQLPROFILE_BLOCKS_NUMBER] = {};
3232

33+
static const uint32_t CpgBlockDelayValue[] = {0x32};
34+
static const uint32_t CpcBlockDelayValue[] = {0x30};
35+
static const uint32_t CpfBlockDelayValue[] = {0x30};
36+
static const uint32_t GdsBlockDelayValue[] = {0x34};
37+
static const uint32_t TccBlockDelayValue[] = {
38+
0x08, 0x0c, 0x0c, 0x0e, 0x14, 0x10, 0x1e, 0x22, 0x0a, 0x0e, 0x0c, 0x10, 0x14, 0x12, 0x22, 0x28,
39+
0x14, 0x16, 0x18, 0x18, 0x20, 0x1c, 0x28, 0x2e, 0x14, 0x16, 0x18, 0x18, 0x20, 0x1c, 0x2a, 0x30};
40+
static const uint32_t TcaBlockDelayValue[] = {0x18, 0x1c, 0x24, 0x24};
41+
42+
static const uint32_t SxBlockDelayValue[] = {0x00, 0x01, 0x0a, 0x12, 0x00, 0x02, 0x0a, 0x12};
43+
static const uint32_t TaBlockDelayValue[] = {
44+
0x20, 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0x02,
45+
0x20, 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0x02,
46+
0x20, 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0x02,
47+
0x26, 0x24, 0x22, 0x20, 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08,
48+
0x19, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01,
49+
0x22, 0x20, 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04,
50+
0x22, 0x20, 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04,
51+
0x26, 0x24, 0x22, 0x20, 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08};
52+
static const uint32_t SpiBlockDelayValue[] = {0x11, 0x1b, 0x20, 0x28, 0x15, 0x1b, 0x22, 0x2a};
53+
static const uint32_t SqBlockDelayValue[] = {0x12, 0x1c, 0x20, 0x2c, 0x16, 0x1c, 0x24, 0x2c};
54+
55+
void Mi100Factory::InitSpmBlockDelayTable() {
56+
cu_block_delay_table_size = sizeof(TaBlockDelayValue) / sizeof(TaBlockDelayValue[0]);
57+
const uint32_t** p;
58+
// Global Blocks
59+
p = spm_block_delay_global;
60+
*p++ = CpgBlockDelayValue; // CPG = 0
61+
*p++ = CpcBlockDelayValue; // CPC = 1
62+
*p++ = CpfBlockDelayValue; // CPF = 2
63+
*p++ = GdsBlockDelayValue; // GDS = 3
64+
*p++ = TccBlockDelayValue; // TCC = 4
65+
*p++ = TcaBlockDelayValue; // TCA = 5
66+
*p++ = NULL; // IA = 6
67+
*p++ = NULL; // TCS = 7
68+
// SE Blocks
69+
p = spm_block_delay_se;
70+
*p++ = NULL; // CB = 0
71+
*p++ = NULL; // DB = 1
72+
*p++ = NULL; // PA = 2
73+
*p++ = SxBlockDelayValue; // SSX = 3
74+
*p++ = NULL; // SC = 4
75+
*p++ = TaBlockDelayValue; // TA = 5
76+
*p++ = TaBlockDelayValue; // TD = 6 - Same as TA
77+
*p++ = TaBlockDelayValue; // TCP = 7 - Same as TA
78+
*p++ = SpiBlockDelayValue; // SPI = 8
79+
*p++ = SqBlockDelayValue; // SQG = 9
80+
*p++ = NULL; // VGT = 10
81+
}
82+
3383
Mi100Factory::Mi100Factory(const AgentInfo* agent_info)
3484
: Gfx9Factory(block_table_, sizeof(block_table_), agent_info) {
85+
InitSpmBlockDelayTable();
3586
for (unsigned i = 0; i < AQLPROFILE_BLOCKS_NUMBER; ++i) {
3687
const GpuBlockInfo* base_table_ptr = Gfx9Factory::block_table_[i];
3788
if (base_table_ptr == NULL) continue;
@@ -43,12 +94,14 @@ Mi100Factory::Mi100Factory(const AgentInfo* agent_info)
4394
block_table_[i] = block_info;
4495

4596
// overwrite block info for any update from gfx9 to mi100
97+
InitSpmBlockDelay(block_info);
4698
switch (block_info->id) {
4799
case SqCounterBlockId:
48100
block_info->event_id_max = 303;
49101
break;
50102
case TcpCounterBlockId:
51103
block_info->event_id_max = 87;
104+
assert(agent_info->se_num * block_info->instance_count == cu_block_delay_table_size);
52105
break;
53106
case TccCounterBlockId:
54107
block_info->instance_count = 32;

projects/aqlprofile/src/core/gfx90a_factory.cpp

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,15 +35,69 @@ class Mi200Factory : public Gfx9Factory {
3535

3636
virtual int GetAccumLowID() const override { return 1; };
3737
virtual int GetAccumHiID() const override { return 185; };
38+
virtual uint32_t GetSpmSampleDelayMax() { return 0x3e; };
39+
40+
private:
41+
void InitSpmBlockDelayTable();
3842

3943
protected:
4044
static const GpuBlockInfo* block_table_[AQLPROFILE_BLOCKS_NUMBER];
4145
};
4246

4347
const GpuBlockInfo* Mi200Factory::block_table_[AQLPROFILE_BLOCKS_NUMBER] = {};
4448

49+
static const uint32_t CpgBlockDelayValue[] = {0x38};
50+
static const uint32_t CpcBlockDelayValue[] = {0x36};
51+
static const uint32_t CpfBlockDelayValue[] = {0x3a};
52+
static const uint32_t GdsBlockDelayValue[] = {0x3a};
53+
static const uint32_t TccBlockDelayValue[] = {
54+
0x11, 0x1b, 0x11, 0x23, 0x14, 0x1a, 0x13, 0x29, 0x15, 0x20, 0x12, 0x29, 0x19, 0x1c, 0x15, 0x2c,
55+
0x1d, 0x26, 0x1a, 0x2d, 0x20, 0x23, 0x1d, 0x34, 0x20, 0x2a, 0x1e, 0x32, 0x24, 0x28, 0x22, 0x38};
56+
static const uint32_t TcaBlockDelayValue[] = {0x20, 0x20, 0x28, 0x2c};
57+
static const uint32_t SxBlockDelayValue[] = {0x02, 0x08, 0x0c, 0x16, 0x00, 0x0c, 0x11, 0x1e};
58+
static const uint32_t TaBlockDelayValue[] = {
59+
0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0x04, 0x02, 0x00, 0, 0, // se0
60+
0x26, 0x24, 0x22, 0x20, 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0, 0, // se1
61+
0x20, 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08, 0x06, 0, 0, // se2
62+
0x2c, 0x2a, 0x28, 0x26, 0x24, 0x22, 0x20, 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0, 0, // se3
63+
0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0, 0, // se4
64+
0x2a, 0x28, 0x26, 0x24, 0x22, 0x20, 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0, 0, // se5
65+
0x24, 0x22, 0x20, 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0, 0, // se6
66+
0x30, 0x2e, 0x2c, 0x2a, 0x28, 0x26, 0x24, 0x22, 0x20, 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0, 0}; // se7
67+
static const uint32_t SpiBlockDelayValue[] = {0x20, 0x20, 0x26, 0x2e, 0x26, 0x26, 0x27, 0x32};
68+
static const uint32_t SqBlockDelayValue[] = {0x1a, 0x22, 0x28, 0x32, 0x1f, 0x24, 0x2c, 0x34};
69+
70+
void Mi200Factory::InitSpmBlockDelayTable() {
71+
cu_block_delay_table_size = sizeof(TaBlockDelayValue) / sizeof(TaBlockDelayValue[0]);
72+
const uint32_t** p;
73+
// Global Blocks
74+
p = spm_block_delay_global;
75+
*p++ = CpgBlockDelayValue; // CPG = 0
76+
*p++ = CpcBlockDelayValue; // CPC = 1
77+
*p++ = CpfBlockDelayValue; // CPF = 2
78+
*p++ = GdsBlockDelayValue; // GDS = 3
79+
*p++ = TccBlockDelayValue; // TCC = 4
80+
*p++ = TcaBlockDelayValue; // TCA = 5
81+
*p++ = NULL; // IA = 6
82+
*p++ = NULL; // TCS = 7
83+
// SE Blocks
84+
p = spm_block_delay_se;
85+
*p++ = NULL; // CB = 0
86+
*p++ = NULL; // DB = 1
87+
*p++ = NULL; // PA = 2
88+
*p++ = SxBlockDelayValue; // SSX = 3
89+
*p++ = NULL; // SC = 4
90+
*p++ = TaBlockDelayValue; // TA = 5
91+
*p++ = TaBlockDelayValue; // TD = 6 - Same as TA
92+
*p++ = TaBlockDelayValue; // TCP = 7 - Same as TA
93+
*p++ = SpiBlockDelayValue; // SPI = 8
94+
*p++ = SqBlockDelayValue; // SQG = 9
95+
*p++ = NULL; // VGT = 10
96+
}
97+
4598
Mi200Factory::Mi200Factory(const AgentInfo* agent_info)
4699
: Gfx9Factory(block_table_, sizeof(block_table_), agent_info) {
100+
InitSpmBlockDelayTable();
47101
for (unsigned i = 0; i < AQLPROFILE_BLOCKS_NUMBER; ++i) {
48102
const GpuBlockInfo* base_table_ptr = Gfx9Factory::block_table_[i];
49103
if (base_table_ptr == NULL) continue;
@@ -54,12 +108,14 @@ Mi200Factory::Mi200Factory(const AgentInfo* agent_info)
54108
block_info = new GpuBlockInfo(*base_table_ptr);
55109
block_table_[i] = block_info;
56110
// overwrite block info for any update from gfx9 to mi100
111+
InitSpmBlockDelay(block_info);
57112
switch (block_info->id) {
58113
case SqCounterBlockId:
59114
block_info->event_id_max = 303;
60115
break;
61116
case TcpCounterBlockId:
62117
block_info->event_id_max = 87;
118+
assert(agent_info->se_num * block_info->instance_count == cu_block_delay_table_size);
63119
break;
64120
case TccCounterBlockId:
65121
block_info->instance_count = 32;

projects/aqlprofile/src/core/gfx940_factory.cpp

Lines changed: 112 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,9 @@ namespace aql_profile {
3030

3131
class Mi300Factory : public Mi100Factory {
3232
public:
33-
explicit Mi300Factory(const AgentInfo* agent_info) : Mi100Factory(agent_info) {
33+
explicit Mi300Factory(const AgentInfo* agent_info, gpu_id_t gpu_id = MI300_GPU_ID)
34+
: Mi100Factory(agent_info) {
35+
InitSpmBlockDelayTable(gpu_id);
3436
for (unsigned blockname_id = 0; blockname_id < AQLPROFILE_BLOCKS_NUMBER;
3537
++blockname_id) {
3638
const GpuBlockInfo* base_table_ptr = Gfx9Factory::block_table_[blockname_id];
@@ -44,12 +46,14 @@ class Mi300Factory : public Mi100Factory {
4446
block_info = new GpuBlockInfo(*base_table_ptr);
4547
block_table_[blockname_id] = block_info;
4648
// overwrite block info for any update from gfx9 to mi300
49+
InitSpmBlockDelay(block_info);
4750
switch (block_info->id) {
4851
case SqCounterBlockId:
4952
block_info->event_id_max = 373;
5053
break;
5154
case TcpCounterBlockId:
5255
block_info->event_id_max = 84;
56+
assert(agent_info->se_num * block_info->instance_count == cu_block_delay_table_size);
5357
break;
5458
case TccCounterBlockId:
5559
block_info->instance_count = 16;
@@ -82,8 +86,113 @@ class Mi300Factory : public Mi100Factory {
8286

8387
virtual int GetAccumLowID() const override { return 1; };
8488
virtual int GetAccumHiID() const override { return 184; };
89+
virtual uint32_t GetSpmSampleDelayMax() { return 0x27; };
90+
91+
private:
92+
void InitSpmBlockDelayTable(gpu_id_t gpu_id);
8593
};
8694

95+
namespace gfx940 {
96+
static const uint32_t CpgBlockDelayValue[] = {0x21};
97+
static const uint32_t CpcBlockDelayValue[] = {0x1f};
98+
static const uint32_t CpfBlockDelayValue[] = {0x23};
99+
static const uint32_t GdsBlockDelayValue[] = {0x23};
100+
static const uint32_t TccBlockDelayValue[] = {0x0f, 0x0f, 0x0c, 0x0e, 0x0e, 0x13, 0x13, 0x19,
101+
0x13, 0x13, 0x12, 0x13, 0x13, 0x17, 0x17, 0x1d};
102+
static const uint32_t TcaBlockDelayValue[] = {0x14, 0x18};
103+
static const uint32_t SxBlockDelayValue[] = {0x00, 0x03, 0x07, 0x03};
104+
static const uint32_t TaBlockDelayValue[] = {
105+
0x17, 0x15, 0x13, 0x11, 0x0f, 0x0d, 0x0b, 0x09, 0x07, 0x05, 0, 0, 0, 0, 0, 0, // se0
106+
0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08, 0x06, 0, 0, 0, 0, 0, 0, // se1
107+
0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0, 0, 0, 0, 0, 0, // se2
108+
0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08, 0, 0, 0, 0, 0, 0}; // se3
109+
static const uint32_t SpiBlockDelayValue[] = {0x10, 0x19, 0x1d, 0x13};
110+
static const uint32_t SqBlockDelayValue[] = {0x10, 0x1d, 0x21, 0x12};
111+
} // namespace gfx940
112+
113+
namespace gfx950 {
114+
static const uint32_t CpgBlockDelayValue[] = {0x33};
115+
static const uint32_t CpcBlockDelayValue[] = {0x31};
116+
static const uint32_t CpfBlockDelayValue[] = {0x33};
117+
static const uint32_t GdsBlockDelayValue[] = {0x2f};
118+
static const uint32_t TccBlockDelayValue[] = {0x21, 0x23, 0x27, 0x22, 0x23, 0x25, 0x27, 0x29,
119+
0x24, 0x25, 0x29, 0x25, 0x27, 0x27, 0x29, 0x2b};
120+
static const uint32_t TcaBlockDelayValue[] = {0x2b, 0x2d};
121+
static const uint32_t SxBlockDelayValue[] = {0x00, 0x04, 0x07, 0x01};
122+
static const uint32_t TaBlockDelayValue[] = {
123+
0x29, 0x25, 0x21, 0x1d, 0x19, 0x15, 0x11, 0x0d, 0x09, 0, 0, 0, 0, 0, 0, 0, // se0
124+
0x2a, 0x26, 0x22, 0x1e, 0x1a, 0x16, 0x12, 0x0e, 0x0a, 0, 0, 0, 0, 0, 0, 0, // se1
125+
0x2b, 0x28, 0x24, 0x20, 0x1c, 0x18, 0x14, 0x10, 0x0c, 0, 0, 0, 0, 0, 0, 0, // se2
126+
0x2a, 0x26, 0x22, 0x1e, 0x1a, 0x16, 0x12, 0x0e, 0x0a, 0, 0, 0, 0, 0, 0, 0}; // se3
127+
static const uint32_t TdBlockDelayValue[] = {
128+
0x29, 0x25, 0x21, 0x1d, 0x19, 0x15, 0x11, 0x0d, 0x09, 0, 0, 0, 0, 0, 0, 0, // se0
129+
0x2a, 0x26, 0x22, 0x1e, 0x1a, 0x16, 0x12, 0x0e, 0x0a, 0, 0, 0, 0, 0, 0, 0, // se1
130+
0x2b, 0x28, 0x24, 0x20, 0x1c, 0x18, 0x14, 0x10, 0x0c, 0, 0, 0, 0, 0, 0, 0, // se2
131+
0x2a, 0x26, 0x22, 0x1e, 0x1a, 0x16, 0x12, 0x0e, 0x0a, 0, 0, 0, 0, 0, 0, 0}; // se3
132+
static const uint32_t TcpBlockDelayValue[] = {
133+
0x29, 0x25, 0x21, 0x1d, 0x19, 0x15, 0x11, 0x0d, 0x09, 0, 0, 0, 0, 0, 0, 0, // se0
134+
0x2a, 0x26, 0x22, 0x1e, 0x1a, 0x16, 0x12, 0x0e, 0x0a, 0, 0, 0, 0, 0, 0, 0, // se1
135+
0x2a, 0x28, 0x24, 0x20, 0x1c, 0x18, 0x14, 0x10, 0x0c, 0, 0, 0, 0, 0, 0, 0, // se2
136+
0x2a, 0x27, 0x23, 0x1f, 0x1b, 0x17, 0x13, 0x0f, 0x0b, 0, 0, 0, 0, 0, 0, 0}; // se3
137+
static const uint32_t SpiBlockDelayValue[] = {0x25, 0x2d, 0x2f, 0x2b};
138+
static const uint32_t SqBlockDelayValue[] = {0x25, 0x2d, 0x2f, 0x2b};
139+
} // namespace gfx950
140+
141+
void Mi300Factory::InitSpmBlockDelayTable(gpu_id_t gpu_id) {
142+
const uint32_t** p;
143+
if (gpu_id == MI300_GPU_ID) {
144+
cu_block_delay_table_size = sizeof(gfx940::TaBlockDelayValue) / sizeof(gfx940::TaBlockDelayValue[0]);
145+
// Global Blocks
146+
p = spm_block_delay_global;
147+
*p++ = gfx940::CpgBlockDelayValue; // CPG = 0
148+
*p++ = gfx940::CpcBlockDelayValue; // CPC = 1
149+
*p++ = gfx940::CpfBlockDelayValue; // CPF = 2
150+
*p++ = gfx940::GdsBlockDelayValue; // GDS = 3
151+
*p++ = gfx940::TccBlockDelayValue; // TCC = 4
152+
*p++ = gfx940::TcaBlockDelayValue; // TCA = 5
153+
*p++ = NULL; // IA = 6
154+
*p++ = NULL; // TCS = 7
155+
// SE Blocks
156+
p = spm_block_delay_se;
157+
*p++ = NULL; // CB = 0
158+
*p++ = NULL; // DB = 1
159+
*p++ = NULL; // PA = 2
160+
*p++ = gfx940::SxBlockDelayValue; // SSX = 3
161+
*p++ = NULL; // SC = 4
162+
*p++ = gfx940::TaBlockDelayValue; // TA = 5
163+
*p++ = gfx940::TaBlockDelayValue; // TD = 6 - Same as TA
164+
*p++ = gfx940::TaBlockDelayValue; // TCP = 7 - Same as TA
165+
*p++ = gfx940::SpiBlockDelayValue; // SPI = 8
166+
*p++ = gfx940::SqBlockDelayValue; // SQG = 9
167+
*p++ = NULL; // VGT = 10
168+
} else if (gpu_id == MI350_GPU_ID) {
169+
cu_block_delay_table_size = sizeof(gfx950::TaBlockDelayValue) / sizeof(gfx950::TaBlockDelayValue[0]);
170+
// Global Blocks
171+
p = spm_block_delay_global;
172+
*p++ = gfx950::CpgBlockDelayValue; // CPG = 0
173+
*p++ = gfx950::CpcBlockDelayValue; // CPC = 1
174+
*p++ = gfx950::CpfBlockDelayValue; // CPF = 2
175+
*p++ = gfx950::GdsBlockDelayValue; // GDS = 3
176+
*p++ = gfx950::TccBlockDelayValue; // TCC = 4
177+
*p++ = gfx950::TcaBlockDelayValue; // TCA = 5
178+
*p++ = NULL; // IA = 6
179+
*p++ = NULL; // TCS = 7
180+
// SE Blocks
181+
p = spm_block_delay_se;
182+
*p++ = NULL; // CB = 0
183+
*p++ = NULL; // DB = 1
184+
*p++ = NULL; // PA = 2
185+
*p++ = gfx950::SxBlockDelayValue; // SSX = 3
186+
*p++ = NULL; // SC = 4
187+
*p++ = gfx950::TaBlockDelayValue; // TA = 5
188+
*p++ = gfx950::TdBlockDelayValue; // TD = 6
189+
*p++ = gfx950::TcpBlockDelayValue; // TCP = 7
190+
*p++ = gfx950::SpiBlockDelayValue; // SPI = 8
191+
*p++ = gfx950::SqBlockDelayValue; // SQG = 9
192+
*p++ = NULL; // VGT = 10
193+
}
194+
}
195+
87196
Pm4Factory* Pm4Factory::Mi300Create(const AgentInfo* agent_info) {
88197
auto p = new Mi300Factory(agent_info);
89198
if (p == NULL) throw aql_profile_exc_msg("Mi300Factory allocation failed");
@@ -93,10 +202,11 @@ Pm4Factory* Pm4Factory::Mi300Create(const AgentInfo* agent_info) {
93202
class Mi350Factory : public Mi300Factory {
94203
public:
95204
// MI350 is a copy of Mi300
96-
explicit Mi350Factory(const AgentInfo* agent_info) : Mi300Factory(agent_info) {}
205+
explicit Mi350Factory(const AgentInfo* agent_info) : Mi300Factory(agent_info, MI350_GPU_ID) {}
97206

98207
virtual int GetAccumLowID() const override { return 1; };
99208
virtual int GetAccumHiID() const override { return 200; };
209+
virtual uint32_t GetSpmSampleDelayMax() { return 0x33; };
100210
};
101211

102212
Pm4Factory* Pm4Factory::Mi350Create(const AgentInfo* agent_info) {

0 commit comments

Comments
 (0)