@@ -5792,6 +5792,42 @@ gcn_libc_has_function (enum function_class fn_class,
5792
5792
/* }}} */
5793
5793
/* {{{ md_reorg pass. */
5794
5794
5795
+ /* Identify V_CMPX from the "type" attribute;
5796
+ note: this will also match 'v_cmp %E1 vcc'.
+ Returns true for TYPE_VOPC and false for every other type listed below.
+ The switch has no default label; a value matching none of the cases
+ falls out of the switch and reaches gcc_unreachable (). */
5797
+
5798
+ static bool
5799
+ gcn_cmpx_insn_p (attr_type type)
5800
+ {
5801
+ switch (type)
5802
+ {
5803
+ case TYPE_VOPC:
5804
+ return true ;
5805
+ case TYPE_MUBUF:
5806
+ case TYPE_MTBUF:
5807
+ case TYPE_FLAT:
5808
+ case TYPE_VOP3P_MAI:
5809
+ case TYPE_UNKNOWN:
5810
+ case TYPE_SOP1:
5811
+ case TYPE_SOP2:
5812
+ case TYPE_SOPK:
5813
+ case TYPE_SOPC:
5814
+ case TYPE_SOPP:
5815
+ case TYPE_SMEM:
5816
+ case TYPE_DS:
5817
+ case TYPE_VOP2:
5818
+ case TYPE_VOP1:
5819
+ case TYPE_VOP3A:
5820
+ case TYPE_VOP3B:
5821
+ case TYPE_VOP_SDWA:
5822
+ case TYPE_VOP_DPP:
5823
+ case TYPE_MULT:
5824
+ case TYPE_VMULT:
5825
+ return false ;
5826
+ }
5827
+ gcc_unreachable (); /* Only reached when no case label above matched. */
5828
+ return false ;
5829
+ }
5830
+
5795
5831
/* Identify VMEM instructions from their "type" attribute. */
5796
5832
5797
5833
static bool
@@ -6152,12 +6188,22 @@ gcn_md_reorg (void)
6152
6188
detects the missed cases, and inserts the documented number of NOPs
6153
6189
required for correct execution. */
6154
6190
6191
+ /* RDNA4 (not yet implemented) differs from RDNA 2/3/3.5 and requires some
6192
+ s_nop, see 5.7 and esp. 5.7.2. in its ISA manual.
6193
+ The assert here is a reminder to add those. */
6194
+ STATIC_ASSERT (ISA_CDNA1 - ISA_RDNA3 == 1 );
6195
+
6196
+ if (TARGET_NO_MANUAL_NOPS)
6197
+ return ;
6198
+
6155
6199
const int max_waits = 5 ;
6156
6200
struct ilist
6157
6201
{
6158
6202
rtx_insn *insn;
6159
6203
attr_unit unit;
6160
- attr_delayeduse delayeduse;
6204
+ attr_type type;
6205
+ attr_flatmemaccess flatmemaccess;
6206
+ bool delayeduse;
6161
6207
HARD_REG_SET writes;
6162
6208
HARD_REG_SET reads;
6163
6209
int age;
@@ -6178,7 +6224,29 @@ gcn_md_reorg (void)
6178
6224
6179
6225
attr_type itype = get_attr_type (insn);
6180
6226
attr_unit iunit = get_attr_unit (insn);
6181
- attr_delayeduse idelayeduse = get_attr_delayeduse (insn);
6227
+ attr_flatmemaccess iflatmemaccess = get_attr_flatmemaccess (insn);
6228
+ bool delayeduse;
6229
+ if (TARGET_CDNA3_NOPS)
6230
+ switch (iflatmemaccess)
6231
+ {
6232
+ case FLATMEMACCESS_STORE:
6233
+ case FLATMEMACCESS_STOREX34:
6234
+ case FLATMEMACCESS_ATOMIC:
6235
+ case FLATMEMACCESS_CMPSWAPX2:
6236
+ delayeduse = true ;
6237
+ break ;
6238
+ case FLATMEMACCESS_LOAD:
6239
+ case FLATMEMACCESS_ATOMICWAIT:
6240
+ case FLATMEMACCESS_NO:
6241
+ delayeduse = false ;
6242
+ break ;
6243
+ default :
6244
+ gcc_unreachable ();
6245
+ }
6246
+ else
6247
+ delayeduse = (iflatmemaccess == FLATMEMACCESS_CMPSWAPX2
6248
+ || iflatmemaccess == FLATMEMACCESS_STOREX34);
6249
+
6182
6250
int ivccwait = get_attr_vccwait (insn);
6183
6251
HARD_REG_SET ireads, iwrites;
6184
6252
CLEAR_HARD_REG_SET (ireads);
@@ -6223,16 +6291,26 @@ gcn_md_reorg (void)
6223
6291
&& TEST_HARD_REG_BIT (ireads, VCCZ_REG))))
6224
6292
nops_rqd = 5 - prev_insn->age ;
6225
6293
6226
- /* VALU writes SGPR/VCC followed by v_{read,write}lane using
6227
- SGPR/VCC as lane select requires 4 wait states. */
6294
+ /* VALU writes SGPR/VCC followed by
6295
+ - v_{read,write}lane using SGPR/VCC as lane select requires
6296
+ 4 wait states
6297
+ - [CDNA3] VALU reads SGPR as constant requires 1 wait state
6298
+ - [CDNA3] VALU reads SGPR as carry-in requires no wait states */
6228
6299
if ((prev_insn->age + nops_rqd) < 4
6229
6300
&& prev_insn->unit == UNIT_VECTOR
6230
- && get_attr_laneselect (insn) == LANESELECT_YES
6301
+ && get_attr_laneselect (insn) != LANESELECT_NO
6231
6302
&& (hard_reg_set_intersect_p
6232
6303
(depregs, reg_class_contents[(int ) SGPR_REGS])
6233
6304
|| hard_reg_set_intersect_p
6234
6305
(depregs, reg_class_contents[(int ) VCC_CONDITIONAL_REG])))
6235
6306
nops_rqd = 4 - prev_insn->age ;
6307
+ else if (TARGET_CDNA3_NOPS
6308
+ && (prev_insn->age + nops_rqd) < 1
6309
+ && prev_insn->unit == UNIT_VECTOR
6310
+ && iunit == UNIT_VECTOR
6311
+ && hard_reg_set_intersect_p
6312
+ (depregs, reg_class_contents[(int ) SGPR_REGS]))
6313
+ nops_rqd = 1 - prev_insn->age ;
6236
6314
6237
6315
/* VALU writes VGPR followed by VALU_DPP reading that VGPR
6238
6316
requires 2 wait states. */
@@ -6245,22 +6323,88 @@ gcn_md_reorg (void)
6245
6323
nops_rqd = 2 - prev_insn->age ;
6246
6324
}
6247
6325
6326
+ /* VALU writes EXEC followed by VALU DPP op requires 5 nop. */
6327
+ if ((prev_insn->age + nops_rqd) < 5
6328
+ && itype == TYPE_VOP_DPP
6329
+ && prev_insn->unit == UNIT_VECTOR
6330
+ && TEST_HARD_REG_BIT (prev_insn->writes , EXECZ_REG))
6331
+ nops_rqd = 5 - prev_insn->age ;
6332
+
6248
6333
/* Store that requires input registers are not overwritten by
6249
- following instruction. */
6250
- if ((prev_insn->age + nops_rqd) < 1
6251
- && prev_insn->delayeduse == DELAYEDUSE_YES
6334
+ following instruction.
6335
+ For CDNA3 only, VALU writes require 2 nops, not 1.
6336
+ CDNA3 additionally requires 1 or 2 nops for global & scratch
6337
+ store/atomic. */
6338
+ if (TARGET_CDNA3_NOPS
6339
+ && (prev_insn->age + nops_rqd) < 2
6340
+ && prev_insn->delayeduse
6341
+ && iunit == UNIT_VECTOR
6342
+ && ((hard_reg_set_intersect_p
6343
+ (prev_insn->reads , iwrites))))
6344
+ nops_rqd = 2 - prev_insn->age ;
6345
+ else if ((prev_insn->age + nops_rqd) < 1
6346
+ && prev_insn->delayeduse
6252
6347
&& ((hard_reg_set_intersect_p
6253
6348
(prev_insn->reads , iwrites))))
6254
6349
nops_rqd = 1 - prev_insn->age ;
6255
6350
6256
- /* Instruction that requires VCC is not written too close before
6257
- using it. */
6351
+ /* Instruction (such as v_div_fmas) that requires VCC is not written
6352
+ too close before using it. */
6258
6353
if (prev_insn->age < ivccwait
6259
6354
&& (hard_reg_set_intersect_p
6260
6355
(prev_insn->writes ,
6261
6356
reg_class_contents[(int )VCC_CONDITIONAL_REG])))
6262
6357
nops_rqd = ivccwait - prev_insn->age ;
6263
6358
6359
+ /* CDNA3: v_cmpx followed by
6360
+ - v_readlane, v_readfirstlane, v_writelane requires 4 wait states
6361
+ - VALU reads EXEC as constant requires 2 wait states
6362
+ - other VALU requires no wait state */
6363
+ if (TARGET_CDNA3_NOPS
6364
+ && (prev_insn->age + nops_rqd) < 4
6365
+ && gcn_cmpx_insn_p (prev_insn->type )
6366
+ && get_attr_laneselect (insn) != LANESELECT_NO)
6367
+ nops_rqd = 4 - prev_insn->age ;
6368
+ else if (TARGET_CDNA3_NOPS
6369
+ && (prev_insn->age + nops_rqd) < 2
6370
+ && iunit == UNIT_VECTOR
6371
+ && gcn_cmpx_insn_p (prev_insn->type )
6372
+ && TEST_HARD_REG_BIT (ireads, EXECZ_REG))
6373
+ nops_rqd = 2 - prev_insn->age ;
6374
+
6375
+ /* CDNA3: VALU writes VGPR followed by v_readlane vsrc0 reads VGPRn
6376
+ requires 1 wait state. */
6377
+ if (TARGET_CDNA3_NOPS
6378
+ && (prev_insn->age + nops_rqd) < 1
6379
+ && prev_insn->unit == UNIT_VECTOR
6380
+ && prev_insn->flatmemaccess != FLATMEMACCESS_LOAD
6381
+ && get_attr_laneselect (insn) == LANESELECT_READ
6382
+ && hard_reg_set_intersect_p
6383
+ (depregs, reg_class_contents[(int ) VGPR_REGS]))
6384
+ nops_rqd = 1 - prev_insn->age ;
6385
+
6386
+ /* CDNA3: VALU op which uses OPSEL or SDWA with changes the result's
6387
+ bit position followed by VALU op consumes result of that op
6388
+ requires 1 wait state.
6389
+ FIXME: Handle OPSEL, once used. */
6390
+ if (TARGET_CDNA3_NOPS
6391
+ && (prev_insn->age + nops_rqd) < 1
6392
+ && prev_insn->unit == UNIT_VECTOR
6393
+ && prev_insn->type == TYPE_VOP_SDWA
6394
+ && !hard_reg_set_empty_p (depregs))
6395
+ nops_rqd = 1 - prev_insn->age ;
6396
+
6397
+ /* CDNA3: VALU Trans Op (such as v_rcp_f64) followed by non-trans VALU
6398
+ op consumes result of that op requires 1 wait state. */
6399
+ if (TARGET_CDNA3_NOPS
6400
+ && (prev_insn->age + nops_rqd) < 1
6401
+ && prev_insn->unit == UNIT_VECTOR
6402
+ && iunit == UNIT_VECTOR
6403
+ && get_attr_transop (prev_insn->insn ) == TRANSOP_YES
6404
+ && get_attr_transop (insn) == TRANSOP_NO
6405
+ && !hard_reg_set_empty_p (depregs))
6406
+ nops_rqd = 1 - prev_insn->age ;
6407
+
6264
6408
/* CDNA1: write VGPR before v_accvgpr_write reads it. */
6265
6409
if (TARGET_AVGPR_CDNA1_NOPS
6266
6410
&& (prev_insn->age + nops_rqd) < 2
@@ -6316,7 +6460,9 @@ gcn_md_reorg (void)
6316
6460
/* Track the current instruction as a previous instruction. */
6317
6461
back[oldest].insn = insn;
6318
6462
back[oldest].unit = iunit;
6319
- back[oldest].delayeduse = idelayeduse;
6463
+ back[oldest].type = itype;
6464
+ back[oldest].flatmemaccess = iflatmemaccess;
6465
+ back[oldest].delayeduse = delayeduse;
6320
6466
back[oldest].writes = iwrites;
6321
6467
back[oldest].reads = ireads;
6322
6468
back[oldest].age = 0 ;
0 commit comments