Skip to content

Commit 02213d0

Browse files
committed
gcn: Add "s_nop"s for MI300
MI300 requires some additional s_nop to be added between some instructions. * As 'v_readlane' and 'v_writelane' have to be distinguished, the 'laneselect' attribute was changed from no/yes to no/read/write. * Add some missing 'laneselect' attributes for v_(read,write)lane. * Replace 'delayeduse' by 'flatmemaccess' which is more explicit, especially as some uses have to be distinguished in more detail. (Alongside, one off-by-two delayeduse has been fixed.) On the other hand, RDNA 2, 3, and 3.5 do not require any added s_nop; thus, there is no need to walk the instructions for them to insert pointless S_NOP. (RDNA4 (not yet in GCC) requires it in a few cases.) gcc/ChangeLog: * config/gcn/gcn-opts.h (TARGET_NO_MANUAL_NOPS, TARGET_CDNA3_NOPS): Define. * config/gcn/gcn.md (define_attr "laneselect"): Change 'yes' to 'read' and 'write'. (define_attr "flatmemaccess"): Add with values store, storex34, load, atomic, atomicwait, cmpswapx2, and no. Replacing ... (define_attr "delayeduse"): Remove. (define_attr "transop"): Add with values yes and no. (various insns): Update 'laneselect', add flatmemaccess and transop, remove delayeduse; fixing an issue for s_load_dwordx4 vs. flat_store_dwordx4 related to delayeduse (now: flatmemaccess). * config/gcn/gcn-valu.md: Update laneselect attribute and add flatmemaccess. * config/gcn/gcn.cc (gcn_cmpx_insn_p): New. (gcn_md_reorg): Update for MI300 to add additional s_nop. Skip s_nop-insertion part for RDNA{2,3}; add "VALU writes EXEC followed by VALU DPP" unconditionally for CDNA2/CDNA3/GCN5.
1 parent 94f896c commit 02213d0

File tree

4 files changed

+312
-137
lines changed

4 files changed

+312
-137
lines changed

gcc/config/gcn/gcn-opts.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,8 +82,13 @@ enum hsaco_attr_type
8282
#define TARGET_DPP_FULL !TARGET_RDNA2_PLUS
8383
#define TARGET_DPP16 TARGET_RDNA2_PLUS
8484
#define TARGET_DPP8 TARGET_RDNA2_PLUS
85+
/* Device requires no manually inserted wait states; that's the
86+
case for RDNA 2, 3 and 3.5 (but not for RDNA 4). */
87+
#define TARGET_NO_MANUAL_NOPS TARGET_RDNA2_PLUS
8588
/* Device requires CDNA1-style manually inserted wait states for AVGPRs. */
8689
#define TARGET_AVGPR_CDNA1_NOPS TARGET_CDNA1
90+
/* Device requires CDNA3-style manually inserted wait states. */
91+
#define TARGET_CDNA3_NOPS TARGET_CDNA3
8792
/* Whether to use the 'globally coherent' (glc) or the 'scope' (sc0) flag
8893
for non-scalar memory operations. The string starts on purpose with a space.
8994
Note: for scalar memory operations (i.e. 's_...'), 'glc' is still used.

gcc/config/gcn/gcn-valu.md

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -811,7 +811,7 @@
811811
[(set_attr "type" "vop3a")
812812
(set_attr "length" "8")
813813
(set_attr "exec" "none")
814-
(set_attr "laneselect" "yes")])
814+
(set_attr "laneselect" "write")])
815815

816816
; FIXME: 64bit operations really should be splitters, but I am not sure how
817817
; to represent vertical subregs.
@@ -828,7 +828,7 @@
828828
[(set_attr "type" "vmult")
829829
(set_attr "length" "16")
830830
(set_attr "exec" "none")
831-
(set_attr "laneselect" "yes")])
831+
(set_attr "laneselect" "write")])
832832

833833
(define_expand "vec_set<mode>"
834834
[(set (match_operand:V_MOV 0 "register_operand")
@@ -854,7 +854,7 @@
854854
[(set_attr "type" "vop3a")
855855
(set_attr "length" "8")
856856
(set_attr "exec" "none")
857-
(set_attr "laneselect" "yes")])
857+
(set_attr "laneselect" "write")])
858858

859859
(define_insn "*vec_set<mode>_1"
860860
[(set (match_operand:V_2REG 0 "register_operand" "=v")
@@ -871,7 +871,7 @@
871871
[(set_attr "type" "vmult")
872872
(set_attr "length" "16")
873873
(set_attr "exec" "none")
874-
(set_attr "laneselect" "yes")])
874+
(set_attr "laneselect" "write")])
875875

876876
(define_insn "vec_duplicate<mode><exec>"
877877
[(set (match_operand:V_1REG 0 "register_operand" "=v")
@@ -910,7 +910,7 @@
910910
[(set_attr "type" "vop3a")
911911
(set_attr "length" "8")
912912
(set_attr "exec" "none")
913-
(set_attr "laneselect" "yes")])
913+
(set_attr "laneselect" "read")])
914914

915915
(define_insn "vec_extract<mode><scalar_mode>"
916916
[(set (match_operand:<SCALAR_MODE> 0 "register_operand" "=&Sg")
@@ -922,7 +922,7 @@
922922
[(set_attr "type" "vmult")
923923
(set_attr "length" "16")
924924
(set_attr "exec" "none")
925-
(set_attr "laneselect" "yes")])
925+
(set_attr "laneselect" "read")])
926926

927927
(define_insn "vec_extract<mode><scalar_mode>"
928928
[(set (match_operand:<SCALAR_MODE> 0 "register_operand" "=&Sg")
@@ -934,7 +934,7 @@
934934
[(set_attr "type" "vmult")
935935
(set_attr "length" "32")
936936
(set_attr "exec" "none")
937-
(set_attr "laneselect" "yes")])
937+
(set_attr "laneselect" "read")])
938938

939939
(define_insn "vec_extract<V_1REG:mode><V_1REG_ALT:mode>_nop"
940940
[(set (match_operand:V_1REG_ALT 0 "register_operand" "=v,v")
@@ -1192,6 +1192,7 @@
11921192
return buf;
11931193
}
11941194
[(set_attr "type" "flat")
1195+
(set_attr "flatmemaccess" "load")
11951196
(set_attr "length" "12")
11961197
(set_attr "cdna" "*,cdna2,*,cdna2")
11971198
(set_attr "xnack" "off,off,on,on")])
@@ -1250,6 +1251,7 @@
12501251
return buf;
12511252
}
12521253
[(set_attr "type" "flat")
1254+
(set_attr "flatmemaccess" "load")
12531255
(set_attr "length" "12")
12541256
(set_attr "cdna" "*,cdna2,*,cdna2")
12551257
(set_attr "xnack" "off,off,on,on")])
@@ -1335,6 +1337,7 @@
13351337
return buf;
13361338
}
13371339
[(set_attr "type" "flat")
1340+
(set_attr "flatmemaccess" "store")
13381341
(set_attr "length" "12")
13391342
(set_attr "cdna" "*,cdna2")])
13401343

@@ -1390,6 +1393,7 @@
13901393
return buf;
13911394
}
13921395
[(set_attr "type" "flat")
1396+
(set_attr "flatmemaccess" "store")
13931397
(set_attr "length" "12")
13941398
(set_attr "cdna" "*,cdna2")])
13951399

@@ -3260,7 +3264,8 @@
32603264
"flag_unsafe_math_optimizations"
32613265
"v_sqrt%i0\t%0, %1"
32623266
[(set_attr "type" "vop1")
3263-
(set_attr "length" "8")])
3267+
(set_attr "length" "8")
3268+
(set_attr "transop" "yes")])
32643269

32653270
(define_insn "sqrt<mode>2"
32663271
[(set (match_operand:FP 0 "register_operand" "= v")
@@ -3269,7 +3274,8 @@
32693274
"flag_unsafe_math_optimizations"
32703275
"v_sqrt%i0\t%0, %1"
32713276
[(set_attr "type" "vop1")
3272-
(set_attr "length" "8")])
3277+
(set_attr "length" "8")
3278+
(set_attr "transop" "yes")])
32733279

32743280
; These FP unops have f64, f32 and f16 versions.
32753281
(define_int_iterator MATH_UNOP_1OR2REG
@@ -3559,7 +3565,8 @@
35593565
""
35603566
"v_rcp%i0\t%0, %1"
35613567
[(set_attr "type" "vop1")
3562-
(set_attr "length" "8")])
3568+
(set_attr "length" "8")
3569+
(set_attr "transop" "yes")])
35633570

35643571
;; v_div_scale takes a numerator (op2) and denominator (op1) and returns the
35653572
;; one that matches op3 adjusted for best results in reciprocal division.

gcc/config/gcn/gcn.cc

Lines changed: 157 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -5792,6 +5792,42 @@ gcn_libc_has_function (enum function_class fn_class,
57925792
/* }}} */
57935793
/* {{{ md_reorg pass. */
57945794

5795+
/* Identify V_CMPX from the "type" attribute;
5796+
note: this will also match 'v_cmp %E1 vcc'. */
5797+
5798+
static bool
5799+
gcn_cmpx_insn_p (attr_type type)
5800+
{
5801+
switch (type)
5802+
{
5803+
case TYPE_VOPC:
5804+
return true;
5805+
case TYPE_MUBUF:
5806+
case TYPE_MTBUF:
5807+
case TYPE_FLAT:
5808+
case TYPE_VOP3P_MAI:
5809+
case TYPE_UNKNOWN:
5810+
case TYPE_SOP1:
5811+
case TYPE_SOP2:
5812+
case TYPE_SOPK:
5813+
case TYPE_SOPC:
5814+
case TYPE_SOPP:
5815+
case TYPE_SMEM:
5816+
case TYPE_DS:
5817+
case TYPE_VOP2:
5818+
case TYPE_VOP1:
5819+
case TYPE_VOP3A:
5820+
case TYPE_VOP3B:
5821+
case TYPE_VOP_SDWA:
5822+
case TYPE_VOP_DPP:
5823+
case TYPE_MULT:
5824+
case TYPE_VMULT:
5825+
return false;
5826+
}
5827+
gcc_unreachable ();
5828+
return false;
5829+
}
5830+
57955831
/* Identify VMEM instructions from their "type" attribute. */
57965832

57975833
static bool
@@ -6152,12 +6188,22 @@ gcn_md_reorg (void)
61526188
detects the missed cases, and inserts the documented number of NOPs
61536189
required for correct execution. */
61546190

6191+
/* RDNA4 (not yet implemented) differs from RDNA 2/3/3.5 and requires some
6192+
s_nop, see 5.7 and esp. 5.7.2. in its ISA manual.
6193+
The assert here is a reminder to add those. */
6194+
STATIC_ASSERT (ISA_CDNA1 - ISA_RDNA3 == 1);
6195+
6196+
if (TARGET_NO_MANUAL_NOPS)
6197+
return;
6198+
61556199
const int max_waits = 5;
61566200
struct ilist
61576201
{
61586202
rtx_insn *insn;
61596203
attr_unit unit;
6160-
attr_delayeduse delayeduse;
6204+
attr_type type;
6205+
attr_flatmemaccess flatmemaccess;
6206+
bool delayeduse;
61616207
HARD_REG_SET writes;
61626208
HARD_REG_SET reads;
61636209
int age;
@@ -6178,7 +6224,29 @@ gcn_md_reorg (void)
61786224

61796225
attr_type itype = get_attr_type (insn);
61806226
attr_unit iunit = get_attr_unit (insn);
6181-
attr_delayeduse idelayeduse = get_attr_delayeduse (insn);
6227+
attr_flatmemaccess iflatmemaccess = get_attr_flatmemaccess (insn);
6228+
bool delayeduse;
6229+
if (TARGET_CDNA3_NOPS)
6230+
switch (iflatmemaccess)
6231+
{
6232+
case FLATMEMACCESS_STORE:
6233+
case FLATMEMACCESS_STOREX34:
6234+
case FLATMEMACCESS_ATOMIC:
6235+
case FLATMEMACCESS_CMPSWAPX2:
6236+
delayeduse = true;
6237+
break;
6238+
case FLATMEMACCESS_LOAD:
6239+
case FLATMEMACCESS_ATOMICWAIT:
6240+
case FLATMEMACCESS_NO:
6241+
delayeduse = false;
6242+
break;
6243+
default:
6244+
gcc_unreachable ();
6245+
}
6246+
else
6247+
delayeduse = (iflatmemaccess == FLATMEMACCESS_CMPSWAPX2
6248+
|| iflatmemaccess == FLATMEMACCESS_STOREX34);
6249+
61826250
int ivccwait = get_attr_vccwait (insn);
61836251
HARD_REG_SET ireads, iwrites;
61846252
CLEAR_HARD_REG_SET (ireads);
@@ -6223,16 +6291,26 @@ gcn_md_reorg (void)
62236291
&& TEST_HARD_REG_BIT (ireads, VCCZ_REG))))
62246292
nops_rqd = 5 - prev_insn->age;
62256293

6226-
/* VALU writes SGPR/VCC followed by v_{read,write}lane using
6227-
SGPR/VCC as lane select requires 4 wait states. */
6294+
/* VALU writes SGPR/VCC followed by
6295+
- v_{read,write}lane using SGPR/VCC as lane select requires
6296+
4 wait states
6297+
- [CDNA3] VALU reads SGPR as constant requires 1 wait state
6298+
- [CDNA3] VALU reads SGPR as carry-in requires no wait states */
62286299
if ((prev_insn->age + nops_rqd) < 4
62296300
&& prev_insn->unit == UNIT_VECTOR
6230-
&& get_attr_laneselect (insn) == LANESELECT_YES
6301+
&& get_attr_laneselect (insn) != LANESELECT_NO
62316302
&& (hard_reg_set_intersect_p
62326303
(depregs, reg_class_contents[(int) SGPR_REGS])
62336304
|| hard_reg_set_intersect_p
62346305
(depregs, reg_class_contents[(int) VCC_CONDITIONAL_REG])))
62356306
nops_rqd = 4 - prev_insn->age;
6307+
else if (TARGET_CDNA3_NOPS
6308+
&& (prev_insn->age + nops_rqd) < 1
6309+
&& prev_insn->unit == UNIT_VECTOR
6310+
&& iunit == UNIT_VECTOR
6311+
&& hard_reg_set_intersect_p
6312+
(depregs, reg_class_contents[(int) SGPR_REGS]))
6313+
nops_rqd = 1 - prev_insn->age;
62366314

62376315
/* VALU writes VGPR followed by VALU_DPP reading that VGPR
62386316
requires 2 wait states. */
@@ -6245,22 +6323,88 @@ gcn_md_reorg (void)
62456323
nops_rqd = 2 - prev_insn->age;
62466324
}
62476325

6326+
/* VALU writes EXEC followed by VALU DPP op requires 5 nop. */
6327+
if ((prev_insn->age + nops_rqd) < 5
6328+
&& itype == TYPE_VOP_DPP
6329+
&& prev_insn->unit == UNIT_VECTOR
6330+
&& TEST_HARD_REG_BIT (prev_insn->writes, EXECZ_REG))
6331+
nops_rqd = 5 - prev_insn->age;
6332+
62486333
/* Store that requires input registers are not overwritten by
6249-
following instruction. */
6250-
if ((prev_insn->age + nops_rqd) < 1
6251-
&& prev_insn->delayeduse == DELAYEDUSE_YES
6334+
following instruction.
6335+
For CDNA3, only, VALU writes require 2 not 1 nop.
6336+
CDNA3 additionally requires 1 or 2 nops for global & scratch
6337+
store/atomic. */
6338+
if (TARGET_CDNA3_NOPS
6339+
&& (prev_insn->age + nops_rqd) < 2
6340+
&& prev_insn->delayeduse
6341+
&& iunit == UNIT_VECTOR
6342+
&& ((hard_reg_set_intersect_p
6343+
(prev_insn->reads, iwrites))))
6344+
nops_rqd = 2 - prev_insn->age;
6345+
else if ((prev_insn->age + nops_rqd) < 1
6346+
&& prev_insn->delayeduse
62526347
&& ((hard_reg_set_intersect_p
62536348
(prev_insn->reads, iwrites))))
62546349
nops_rqd = 1 - prev_insn->age;
62556350

6256-
/* Instruction that requires VCC is not written too close before
6257-
using it. */
6351+
/* Instruction (such as v_div_fmas) that requires VCC is not written
6352+
too close before using it. */
62586353
if (prev_insn->age < ivccwait
62596354
&& (hard_reg_set_intersect_p
62606355
(prev_insn->writes,
62616356
reg_class_contents[(int)VCC_CONDITIONAL_REG])))
62626357
nops_rqd = ivccwait - prev_insn->age;
62636358

6359+
/* CDNA3: v_cmpx followed by
6360+
- v_readlane, v_readfirstlane, v_writelane requires 4 wait states
6361+
- VALU reads EXEC as constant requires 2 wait states
6362+
- other VALU requires no wait state */
6363+
if (TARGET_CDNA3_NOPS
6364+
&& (prev_insn->age + nops_rqd) < 4
6365+
&& gcn_cmpx_insn_p (prev_insn->type)
6366+
&& get_attr_laneselect (insn) != LANESELECT_NO)
6367+
nops_rqd = 4 - prev_insn->age;
6368+
else if (TARGET_CDNA3_NOPS
6369+
&& (prev_insn->age + nops_rqd) < 2
6370+
&& iunit == UNIT_VECTOR
6371+
&& gcn_cmpx_insn_p (prev_insn->type)
6372+
&& TEST_HARD_REG_BIT (ireads, EXECZ_REG))
6373+
nops_rqd = 2 - prev_insn->age;
6374+
6375+
/* CDNA3: VALU writes VGPR followed by v_readlane vsrc0 reads VGPRn
6376+
requires 1 wait state. */
6377+
if (TARGET_CDNA3_NOPS
6378+
&& (prev_insn->age + nops_rqd) < 1
6379+
&& prev_insn->unit == UNIT_VECTOR
6380+
&& prev_insn->flatmemaccess != FLATMEMACCESS_LOAD
6381+
&& get_attr_laneselect (insn) == LANESELECT_READ
6382+
&& hard_reg_set_intersect_p
6383+
(depregs, reg_class_contents[(int) VGPR_REGS]))
6384+
nops_rqd = 1 - prev_insn->age;
6385+
6386+
/* CDNA3: VALU op which uses OPSEL or SDWA that changes the result's
6387+
bit position followed by VALU op consumes result of that op
6388+
requires 1 wait state.
6389+
FIXME: Handle OPSEL, once used. */
6390+
if (TARGET_CDNA3_NOPS
6391+
&& (prev_insn->age + nops_rqd) < 1
6392+
&& prev_insn->unit == UNIT_VECTOR
6393+
&& prev_insn->type == TYPE_VOP_SDWA
6394+
&& !hard_reg_set_empty_p (depregs))
6395+
nops_rqd = 1 - prev_insn->age;
6396+
6397+
/* CDNA3: VALU Trans Op (such as v_rcp_f64) followed by non-trans VALU
6398+
op consumes result of that op requires 1 wait state. */
6399+
if (TARGET_CDNA3_NOPS
6400+
&& (prev_insn->age + nops_rqd) < 1
6401+
&& prev_insn->unit == UNIT_VECTOR
6402+
&& iunit == UNIT_VECTOR
6403+
&& get_attr_transop (prev_insn->insn) == TRANSOP_YES
6404+
&& get_attr_transop (insn) == TRANSOP_NO
6405+
&& !hard_reg_set_empty_p (depregs))
6406+
nops_rqd = 1 - prev_insn->age;
6407+
62646408
/* CDNA1: write VGPR before v_accvgpr_write reads it. */
62656409
if (TARGET_AVGPR_CDNA1_NOPS
62666410
&& (prev_insn->age + nops_rqd) < 2
@@ -6316,7 +6460,9 @@ gcn_md_reorg (void)
63166460
/* Track the current instruction as a previous instruction. */
63176461
back[oldest].insn = insn;
63186462
back[oldest].unit = iunit;
6319-
back[oldest].delayeduse = idelayeduse;
6463+
back[oldest].type = itype;
6464+
back[oldest].flatmemaccess = iflatmemaccess;
6465+
back[oldest].delayeduse = delayeduse;
63206466
back[oldest].writes = iwrites;
63216467
back[oldest].reads = ireads;
63226468
back[oldest].age = 0;

0 commit comments

Comments
 (0)