@@ -30,7 +30,9 @@ namespace aql_profile {
3030
3131class Mi300Factory : public Mi100Factory {
3232 public:
33- explicit Mi300Factory (const AgentInfo* agent_info) : Mi100Factory(agent_info) {
33+ explicit Mi300Factory (const AgentInfo* agent_info, gpu_id_t gpu_id = MI300_GPU_ID)
34+ : Mi100Factory(agent_info) {
35+ InitSpmBlockDelayTable (gpu_id);
3436 for (unsigned blockname_id = 0 ; blockname_id < AQLPROFILE_BLOCKS_NUMBER;
3537 ++blockname_id) {
3638 const GpuBlockInfo* base_table_ptr = Gfx9Factory::block_table_[blockname_id];
@@ -44,12 +46,14 @@ class Mi300Factory : public Mi100Factory {
4446 block_info = new GpuBlockInfo (*base_table_ptr);
4547 block_table_[blockname_id] = block_info;
4648 // overwrite block info for any update from gfx9 to mi300
49+ InitSpmBlockDelay (block_info);
4750 switch (block_info->id ) {
4851 case SqCounterBlockId:
4952 block_info->event_id_max = 373 ;
5053 break ;
5154 case TcpCounterBlockId:
5255 block_info->event_id_max = 84 ;
56+ assert (agent_info->se_num * block_info->instance_count == cu_block_delay_table_size);
5357 break ;
5458 case TccCounterBlockId:
5559 block_info->instance_count = 16 ;
@@ -82,8 +86,113 @@ class Mi300Factory : public Mi100Factory {
8286
8387 virtual int GetAccumLowID () const override { return 1 ; };
8488 virtual int GetAccumHiID () const override { return 184 ; };
89+ virtual uint32_t GetSpmSampleDelayMax () { return 0x27 ; };
90+
91+ private:
92+ void InitSpmBlockDelayTable (gpu_id_t gpu_id);
8593};
8694
// SPM block-delay value arrays used when Mi300Factory::InitSpmBlockDelayTable is
// called with MI300_GPU_ID. Each array holds one entry per block instance.
// NOTE(review): units of the delay values are not visible here - confirm against HW docs.
namespace gfx940 {

// Single-instance blocks.
constexpr uint32_t CpgBlockDelayValue[] = {0x21};
constexpr uint32_t CpcBlockDelayValue[] = {0x1f};
constexpr uint32_t CpfBlockDelayValue[] = {0x23};
constexpr uint32_t GdsBlockDelayValue[] = {0x23};

// 16 TCC entries.
constexpr uint32_t TccBlockDelayValue[] = {
    0x0f, 0x0f, 0x0c, 0x0e, 0x0e, 0x13, 0x13, 0x19,
    0x13, 0x13, 0x12, 0x13, 0x13, 0x17, 0x17, 0x1d,
};

constexpr uint32_t TcaBlockDelayValue[] = {0x14, 0x18};
constexpr uint32_t SxBlockDelayValue[] = {0x00, 0x03, 0x07, 0x03};

// Four rows (se0..se3) of 16 entries; trailing entries of every row are zero.
// InitSpmBlockDelayTable also installs this array for the TD and TCP slots.
constexpr uint32_t TaBlockDelayValue[] = {
    // se0
    0x17, 0x15, 0x13, 0x11, 0x0f, 0x0d, 0x0b, 0x09, 0x07, 0x05, 0, 0, 0, 0, 0, 0,
    // se1
    0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08, 0x06, 0, 0, 0, 0, 0, 0,
    // se2
    0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0, 0, 0, 0, 0, 0,
    // se3
    0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08, 0, 0, 0, 0, 0, 0,
};

constexpr uint32_t SpiBlockDelayValue[] = {0x10, 0x19, 0x1d, 0x13};
constexpr uint32_t SqBlockDelayValue[] = {0x10, 0x1d, 0x21, 0x12};

}  // namespace gfx940
112+
// SPM block-delay value arrays used when Mi300Factory::InitSpmBlockDelayTable is
// called with MI350_GPU_ID. Each array holds one entry per block instance.
// NOTE(review): units of the delay values are not visible here - confirm against HW docs.
namespace gfx950 {

// Single-instance blocks.
constexpr uint32_t CpgBlockDelayValue[] = {0x33};
constexpr uint32_t CpcBlockDelayValue[] = {0x31};
constexpr uint32_t CpfBlockDelayValue[] = {0x33};
constexpr uint32_t GdsBlockDelayValue[] = {0x2f};

// 16 TCC entries.
constexpr uint32_t TccBlockDelayValue[] = {
    0x21, 0x23, 0x27, 0x22, 0x23, 0x25, 0x27, 0x29,
    0x24, 0x25, 0x29, 0x25, 0x27, 0x27, 0x29, 0x2b,
};

constexpr uint32_t TcaBlockDelayValue[] = {0x2b, 0x2d};
constexpr uint32_t SxBlockDelayValue[] = {0x00, 0x04, 0x07, 0x01};

// TA / TD / TCP: four rows (se0..se3) of 16 entries each; trailing entries of
// every row are zero. Unlike gfx940, each of the three blocks has its own array.
constexpr uint32_t TaBlockDelayValue[] = {
    // se0
    0x29, 0x25, 0x21, 0x1d, 0x19, 0x15, 0x11, 0x0d, 0x09, 0, 0, 0, 0, 0, 0, 0,
    // se1
    0x2a, 0x26, 0x22, 0x1e, 0x1a, 0x16, 0x12, 0x0e, 0x0a, 0, 0, 0, 0, 0, 0, 0,
    // se2
    0x2b, 0x28, 0x24, 0x20, 0x1c, 0x18, 0x14, 0x10, 0x0c, 0, 0, 0, 0, 0, 0, 0,
    // se3
    0x2a, 0x26, 0x22, 0x1e, 0x1a, 0x16, 0x12, 0x0e, 0x0a, 0, 0, 0, 0, 0, 0, 0,
};

constexpr uint32_t TdBlockDelayValue[] = {
    // se0
    0x29, 0x25, 0x21, 0x1d, 0x19, 0x15, 0x11, 0x0d, 0x09, 0, 0, 0, 0, 0, 0, 0,
    // se1
    0x2a, 0x26, 0x22, 0x1e, 0x1a, 0x16, 0x12, 0x0e, 0x0a, 0, 0, 0, 0, 0, 0, 0,
    // se2
    0x2b, 0x28, 0x24, 0x20, 0x1c, 0x18, 0x14, 0x10, 0x0c, 0, 0, 0, 0, 0, 0, 0,
    // se3
    0x2a, 0x26, 0x22, 0x1e, 0x1a, 0x16, 0x12, 0x0e, 0x0a, 0, 0, 0, 0, 0, 0, 0,
};

constexpr uint32_t TcpBlockDelayValue[] = {
    // se0
    0x29, 0x25, 0x21, 0x1d, 0x19, 0x15, 0x11, 0x0d, 0x09, 0, 0, 0, 0, 0, 0, 0,
    // se1
    0x2a, 0x26, 0x22, 0x1e, 0x1a, 0x16, 0x12, 0x0e, 0x0a, 0, 0, 0, 0, 0, 0, 0,
    // se2
    0x2a, 0x28, 0x24, 0x20, 0x1c, 0x18, 0x14, 0x10, 0x0c, 0, 0, 0, 0, 0, 0, 0,
    // se3
    0x2a, 0x27, 0x23, 0x1f, 0x1b, 0x17, 0x13, 0x0f, 0x0b, 0, 0, 0, 0, 0, 0, 0,
};

constexpr uint32_t SpiBlockDelayValue[] = {0x25, 0x2d, 0x2f, 0x2b};
constexpr uint32_t SqBlockDelayValue[] = {0x25, 0x2d, 0x2f, 0x2b};

}  // namespace gfx950
140+
141+ void Mi300Factory::InitSpmBlockDelayTable (gpu_id_t gpu_id) {
142+ const uint32_t ** p;
143+ if (gpu_id == MI300_GPU_ID) {
144+ cu_block_delay_table_size = sizeof (gfx940::TaBlockDelayValue) / sizeof (gfx940::TaBlockDelayValue[0 ]);
145+ // Global Blocks
146+ p = spm_block_delay_global;
147+ *p++ = gfx940::CpgBlockDelayValue; // CPG = 0
148+ *p++ = gfx940::CpcBlockDelayValue; // CPC = 1
149+ *p++ = gfx940::CpfBlockDelayValue; // CPF = 2
150+ *p++ = gfx940::GdsBlockDelayValue; // GDS = 3
151+ *p++ = gfx940::TccBlockDelayValue; // TCC = 4
152+ *p++ = gfx940::TcaBlockDelayValue; // TCA = 5
153+ *p++ = NULL ; // IA = 6
154+ *p++ = NULL ; // TCS = 7
155+ // SE Blocks
156+ p = spm_block_delay_se;
157+ *p++ = NULL ; // CB = 0
158+ *p++ = NULL ; // DB = 1
159+ *p++ = NULL ; // PA = 2
160+ *p++ = gfx940::SxBlockDelayValue; // SSX = 3
161+ *p++ = NULL ; // SC = 4
162+ *p++ = gfx940::TaBlockDelayValue; // TA = 5
163+ *p++ = gfx940::TaBlockDelayValue; // TD = 6 - Same as TA
164+ *p++ = gfx940::TaBlockDelayValue; // TCP = 7 - Same as TA
165+ *p++ = gfx940::SpiBlockDelayValue; // SPI = 8
166+ *p++ = gfx940::SqBlockDelayValue; // SQG = 9
167+ *p++ = NULL ; // VGT = 10
168+ } else if (gpu_id == MI350_GPU_ID) {
169+ cu_block_delay_table_size = sizeof (gfx950::TaBlockDelayValue) / sizeof (gfx950::TaBlockDelayValue[0 ]);
170+ // Global Blocks
171+ p = spm_block_delay_global;
172+ *p++ = gfx950::CpgBlockDelayValue; // CPG = 0
173+ *p++ = gfx950::CpcBlockDelayValue; // CPC = 1
174+ *p++ = gfx950::CpfBlockDelayValue; // CPF = 2
175+ *p++ = gfx950::GdsBlockDelayValue; // GDS = 3
176+ *p++ = gfx950::TccBlockDelayValue; // TCC = 4
177+ *p++ = gfx950::TcaBlockDelayValue; // TCA = 5
178+ *p++ = NULL ; // IA = 6
179+ *p++ = NULL ; // TCS = 7
180+ // SE Blocks
181+ p = spm_block_delay_se;
182+ *p++ = NULL ; // CB = 0
183+ *p++ = NULL ; // DB = 1
184+ *p++ = NULL ; // PA = 2
185+ *p++ = gfx950::SxBlockDelayValue; // SSX = 3
186+ *p++ = NULL ; // SC = 4
187+ *p++ = gfx950::TaBlockDelayValue; // TA = 5
188+ *p++ = gfx950::TdBlockDelayValue; // TD = 6
189+ *p++ = gfx950::TcpBlockDelayValue; // TCP = 7
190+ *p++ = gfx950::SpiBlockDelayValue; // SPI = 8
191+ *p++ = gfx950::SqBlockDelayValue; // SQG = 9
192+ *p++ = NULL ; // VGT = 10
193+ }
194+ }
195+
87196Pm4Factory* Pm4Factory::Mi300Create (const AgentInfo* agent_info) {
88197 auto p = new Mi300Factory (agent_info);
89198 if (p == NULL ) throw aql_profile_exc_msg (" Mi300Factory allocation failed" );
@@ -93,10 +202,11 @@ Pm4Factory* Pm4Factory::Mi300Create(const AgentInfo* agent_info) {
93202class Mi350Factory : public Mi300Factory {
94203 public:
95204 // MI350 is a copy of Mi300
96- explicit Mi350Factory (const AgentInfo* agent_info) : Mi300Factory(agent_info) {}
205+ explicit Mi350Factory (const AgentInfo* agent_info) : Mi300Factory(agent_info, MI350_GPU_ID ) {}
97206
98207 virtual int GetAccumLowID () const override { return 1 ; };
99208 virtual int GetAccumHiID () const override { return 200 ; };
209+ virtual uint32_t GetSpmSampleDelayMax () { return 0x33 ; };
100210};
101211
102212Pm4Factory* Pm4Factory::Mi350Create (const AgentInfo* agent_info) {
0 commit comments