Skip to content

Commit 5a80054

Browse files
committed
fix llvm.amdgcn.iglp.opt.exp.large.mir
1 parent 5c66d77 commit 5a80054

File tree

1 file changed

+78
-85
lines changed

1 file changed

+78
-85
lines changed

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir

Lines changed: 78 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -464,16 +464,10 @@
464464
; GCN-NEXT: buffer_load_dwordx2 v[130:131], v64, s[0:3], 0 offen sc0 sc1
465465
; GCN-NEXT: s_waitcnt vmcnt(0)
466466
; GCN-NEXT: buffer_inv sc0 sc1
467-
; GCN-NEXT: v_fma_f32 v57, s4, v57, -v134
468467
; GCN-NEXT: v_fma_f32 v48, s4, v48, -v134
469-
; GCN-NEXT: v_fma_f32 v96, s4, v58, -v134
470-
; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v57
471468
; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v48
472469
; GCN-NEXT: v_fma_f32 v64, s4, v49, -v134
473-
; GCN-NEXT: v_exp_f32_e32 v163, v57
474-
; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v96
475470
; GCN-NEXT: v_fma_f32 v66, s4, v50, -v134
476-
; GCN-NEXT: v_exp_f32_e32 v164, v57
477471
; GCN-NEXT: v_exp_f32_e32 v49, v48
478472
; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v64
479473
; GCN-NEXT: v_fma_f32 v67, s4, v51, -v134
@@ -495,35 +489,35 @@
495489
; GCN-NEXT: ds_read_b128 v[140:143], v139
496490
; GCN-NEXT: s_waitcnt lgkmcnt(0)
497491
; GCN-NEXT: buffer_inv sc0 sc1
492+
; GCN-NEXT: ds_read_b128 v[144:147], v139 offset:576
493+
; GCN-NEXT: s_waitcnt lgkmcnt(0)
494+
; GCN-NEXT: buffer_inv sc0 sc1
498495
; GCN-NEXT: v_exp_f32_e32 v54, v48
499496
; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v70
500497
; GCN-NEXT: v_exp_f32_e32 v55, v48
501498
; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v71
502-
; GCN-NEXT: ds_read_b128 v[144:147], v139 offset:576
503-
; GCN-NEXT: s_waitcnt lgkmcnt(0)
504-
; GCN-NEXT: buffer_inv sc0 sc1
505499
; GCN-NEXT: v_fma_f32 v66, s4, v56, -v134
506500
; GCN-NEXT: v_exp_f32_e32 v56, v48
507501
; GCN-NEXT: v_sub_f32_e32 v48, v65, v134
502+
; GCN-NEXT: ds_read_b128 v[148:151], v139 offset:1152
503+
; GCN-NEXT: s_waitcnt lgkmcnt(0)
504+
; GCN-NEXT: buffer_inv sc0 sc1
508505
; GCN-NEXT: v_cvt_f16_f32_e32 v64, v49
509506
; GCN-NEXT: v_cvt_f16_f32_e32 v67, v50
510507
; GCN-NEXT: v_cvt_f16_f32_e32 v68, v51
508+
; GCN-NEXT: v_fma_f32 v96, s4, v58, -v134
511509
; GCN-NEXT: v_cvt_f16_f32_e32 v58, v52
512510
; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v48
513-
; GCN-NEXT: ds_read_b128 v[148:151], v139 offset:1152
514-
; GCN-NEXT: s_waitcnt lgkmcnt(0)
515-
; GCN-NEXT: buffer_inv sc0 sc1
516511
; GCN-NEXT: v_exp_f32_e32 v48, v48
512+
; GCN-NEXT: v_fma_f32 v57, s4, v57, -v134
517513
; GCN-NEXT: v_pack_b32_f16 v161, v68, v58
518514
; GCN-NEXT: v_pack_b32_f16 v160, v64, v67
519515
; GCN-NEXT: v_mul_f32_e32 v58, 0x3fb8aa3b, v66
520516
; GCN-NEXT: ; implicit-def: $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79
521517
; GCN-NEXT: ds_read_b128 v[152:155], v139 offset:1728
522518
; GCN-NEXT: s_waitcnt lgkmcnt(0)
523519
; GCN-NEXT: buffer_inv sc0 sc1
524-
; GCN-NEXT: v_fma_f32 v162, s4, v61, -v134
525-
; GCN-NEXT: v_cvt_f16_f32_e32 v61, v55
526-
; GCN-NEXT: v_cvt_f16_f32_e32 v57, v56
520+
; GCN-NEXT: ; implicit-def: $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95
527521
; GCN-NEXT: v_pk_mul_f32 v[64:65], v[64:65], v[48:49] op_sel_hi:[1,0]
528522
; GCN-NEXT: v_pk_mul_f32 v[66:67], v[66:67], v[48:49] op_sel_hi:[1,0]
529523
; GCN-NEXT: v_pk_mul_f32 v[68:69], v[68:69], v[48:49] op_sel_hi:[1,0]
@@ -532,88 +526,84 @@
532526
; GCN-NEXT: v_pk_mul_f32 v[74:75], v[74:75], v[48:49] op_sel_hi:[1,0]
533527
; GCN-NEXT: v_pk_mul_f32 v[76:77], v[76:77], v[48:49] op_sel_hi:[1,0]
534528
; GCN-NEXT: v_pk_mul_f32 v[78:79], v[78:79], v[48:49] op_sel_hi:[1,0]
535-
; GCN-NEXT: ; implicit-def: $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95
536-
; GCN-NEXT: v_fma_f32 v59, s4, v59, -v134
529+
; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v57
537530
; GCN-NEXT: v_pk_mul_f32 v[80:81], v[80:81], v[48:49] op_sel_hi:[1,0]
538-
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[140:141], v[160:161], v[64:79]
539-
; GCN-NEXT: v_mul_f32_e64 v82, v82, v48
540-
; GCN-NEXT: v_mul_f32_e64 v83, v83, v48
541-
; GCN-NEXT: v_mul_f32_e64 v84, v84, v48
542-
; GCN-NEXT: v_mul_f32_e64 v85, v85, v48
543-
; GCN-NEXT: v_mul_f32_e64 v86, v86, v48
544-
; GCN-NEXT: v_mul_f32_e64 v87, v87, v48
531+
; GCN-NEXT: v_pk_mul_f32 v[82:83], v[82:83], v[48:49] op_sel_hi:[1,0]
532+
; GCN-NEXT: v_pk_mul_f32 v[84:85], v[84:85], v[48:49] op_sel_hi:[1,0]
533+
; GCN-NEXT: v_pk_mul_f32 v[86:87], v[86:87], v[48:49] op_sel_hi:[1,0]
545534
; GCN-NEXT: v_pk_mul_f32 v[88:89], v[88:89], v[48:49] op_sel_hi:[1,0]
546535
; GCN-NEXT: v_pk_mul_f32 v[90:91], v[90:91], v[48:49] op_sel_hi:[1,0]
547536
; GCN-NEXT: v_pk_mul_f32 v[92:93], v[92:93], v[48:49] op_sel_hi:[1,0]
548537
; GCN-NEXT: v_pk_mul_f32 v[94:95], v[94:95], v[48:49] op_sel_hi:[1,0]
549-
; GCN-NEXT: ; implicit-def: $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111
538+
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[140:141], v[160:161], v[64:79]
550539
; GCN-NEXT: v_exp_f32_e32 v58, v58
551-
; GCN-NEXT: v_pk_mul_f32 v[96:97], v[96:97], v[48:49] op_sel_hi:[1,0]
540+
; GCN-NEXT: v_fma_f32 v162, s4, v61, -v134
541+
; GCN-NEXT: v_cvt_f16_f32_e32 v61, v55
542+
; GCN-NEXT: v_cvt_f16_f32_e32 v140, v53
543+
; GCN-NEXT: v_cvt_f16_f32_e32 v141, v54
544+
; GCN-NEXT: v_fma_f32 v59, s4, v59, -v134
545+
; GCN-NEXT: v_fma_f32 v60, s4, v60, -v134
552546
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[144:145], v[160:161], v[80:95]
553-
; GCN-NEXT: v_mul_f32_e64 v98, v98, v48
554-
; GCN-NEXT: v_mul_f32_e64 v99, v99, v48
555-
; GCN-NEXT: v_mul_f32_e64 v100, v100, v48
556-
; GCN-NEXT: v_mul_f32_e64 v101, v101, v48
557-
; GCN-NEXT: v_mul_f32_e64 v102, v102, v48
558-
; GCN-NEXT: v_mul_f32_e64 v103, v103, v48
547+
; GCN-NEXT: v_exp_f32_e32 v163, v57
548+
; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v96
549+
; GCN-NEXT: ; implicit-def: $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111
550+
; GCN-NEXT: v_pk_mul_f32 v[112:113], v[112:113], v[48:49] op_sel_hi:[1,0]
551+
; GCN-NEXT: v_pk_mul_f32 v[96:97], v[96:97], v[48:49] op_sel_hi:[1,0]
552+
; GCN-NEXT: v_pk_mul_f32 v[98:99], v[98:99], v[48:49] op_sel_hi:[1,0]
553+
; GCN-NEXT: v_pk_mul_f32 v[100:101], v[100:101], v[48:49] op_sel_hi:[1,0]
554+
; GCN-NEXT: v_pk_mul_f32 v[102:103], v[102:103], v[48:49] op_sel_hi:[1,0]
559555
; GCN-NEXT: v_pk_mul_f32 v[104:105], v[104:105], v[48:49] op_sel_hi:[1,0]
560556
; GCN-NEXT: v_pk_mul_f32 v[106:107], v[106:107], v[48:49] op_sel_hi:[1,0]
561557
; GCN-NEXT: v_pk_mul_f32 v[108:109], v[108:109], v[48:49] op_sel_hi:[1,0]
562558
; GCN-NEXT: v_pk_mul_f32 v[110:111], v[110:111], v[48:49] op_sel_hi:[1,0]
563-
; GCN-NEXT: v_pack_b32_f16 v145, v61, v57
564-
; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v59
565-
; GCN-NEXT: v_cvt_f16_f32_e32 v140, v53
566-
; GCN-NEXT: v_cvt_f16_f32_e32 v141, v54
567-
; GCN-NEXT: v_exp_f32_e32 v59, v57
568-
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[148:149], v[160:161], v[96:111]
569-
; GCN-NEXT: v_fma_f32 v60, s4, v60, -v134
570-
; GCN-NEXT: v_mul_f32_e64 v112, v112, v48
571-
; GCN-NEXT: v_mul_f32_e64 v113, v113, v48
572-
; GCN-NEXT: v_mul_f32_e64 v114, v114, v48
573-
; GCN-NEXT: v_mul_f32_e64 v115, v115, v48
559+
; GCN-NEXT: v_pk_mul_f32 v[114:115], v[114:115], v[48:49] op_sel_hi:[1,0]
574560
; GCN-NEXT: v_pk_mul_f32 v[116:117], v[116:117], v[48:49] op_sel_hi:[1,0]
561+
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[148:149], v[160:161], v[96:111]
562+
; GCN-NEXT: v_exp_f32_e32 v164, v57
563+
; GCN-NEXT: v_cvt_f16_f32_e32 v57, v56
575564
; GCN-NEXT: v_pk_mul_f32 v[118:119], v[118:119], v[48:49] op_sel_hi:[1,0]
576565
; GCN-NEXT: v_pk_mul_f32 v[120:121], v[120:121], v[48:49] op_sel_hi:[1,0]
577566
; GCN-NEXT: v_pk_mul_f32 v[122:123], v[122:123], v[48:49] op_sel_hi:[1,0]
578567
; GCN-NEXT: v_pk_mul_f32 v[124:125], v[124:125], v[48:49] op_sel_hi:[1,0]
579568
; GCN-NEXT: v_pk_mul_f32 v[126:127], v[126:127], v[48:49] op_sel_hi:[1,0]
569+
; GCN-NEXT: v_pack_b32_f16 v145, v61, v57
570+
; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v59
580571
; GCN-NEXT: v_fma_f32 v148, s4, v62, -v134
581-
; GCN-NEXT: v_pack_b32_f16 v144, v140, v141
582572
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[152:153], v[160:161], v[112:127]
583573
; GCN-NEXT: v_fma_f32 v152, s4, v63, -v134
574+
; GCN-NEXT: v_pack_b32_f16 v144, v140, v141
575+
; GCN-NEXT: v_exp_f32_e32 v59, v57
584576
; GCN-NEXT: v_mul_f32_e32 v149, 0x3fb8aa3b, v60
585577
; GCN-NEXT: ; implicit-def: $vgpr57
586578
; GCN-NEXT: ds_read_b128 v[60:63], v57
587579
; GCN-NEXT: s_waitcnt lgkmcnt(0)
588580
; GCN-NEXT: buffer_inv sc0 sc1
589-
; GCN-NEXT: v_exp_f32_e32 v160, v149
590581
; GCN-NEXT: v_fma_f32 v161, s4, v33, -v134
591582
; GCN-NEXT: v_mul_f32_e32 v33, 0x3fb8aa3b, v148
592-
; GCN-NEXT: v_cvt_f16_f32_e32 v153, v58
593583
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[142:143], v[144:145], v[64:79]
584+
; GCN-NEXT: v_exp_f32_e32 v160, v149
585+
; GCN-NEXT: v_cvt_f16_f32_e32 v153, v58
594586
; GCN-NEXT: v_fma_f32 v32, s4, v32, -v134
595587
; GCN-NEXT: ds_read_b128 v[140:143], v57 offset:576
596588
; GCN-NEXT: s_waitcnt lgkmcnt(0)
597589
; GCN-NEXT: buffer_inv sc0 sc1
598590
; GCN-NEXT: v_fma_f32 v40, s4, v40, -v134
599591
; GCN-NEXT: v_fma_f32 v44, s4, v44, -v134
600592
; GCN-NEXT: v_fma_f32 v16, s4, v16, -v134
601-
; GCN-NEXT: v_fma_f32 v166, s4, v20, -v134
602-
; GCN-NEXT: v_fma_f32 v24, s4, v24, -v134
603593
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[146:147], v[144:145], v[80:95]
604594
; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v162
605595
; GCN-NEXT: v_cvt_f16_f32_e32 v147, v163
606596
; GCN-NEXT: v_exp_f32_e32 v162, v146
607597
; GCN-NEXT: v_cvt_f16_f32_e32 v146, v164
608-
; GCN-NEXT: v_fma_f32 v28, s4, v28, -v134
598+
; GCN-NEXT: v_fma_f32 v166, s4, v20, -v134
609599
; GCN-NEXT: v_pack_b32_f16 v148, v153, v147
610-
; GCN-NEXT: v_fma_f32 v0, s4, v0, -v134
600+
; GCN-NEXT: v_fma_f32 v24, s4, v24, -v134
611601
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[150:151], v[144:145], v[96:111]
612602
; GCN-NEXT: v_exp_f32_e32 v151, v33
613603
; GCN-NEXT: v_cvt_f16_f32_e32 v33, v59
614604
; GCN-NEXT: v_fma_f32 v150, s4, v34, -v134
615-
; GCN-NEXT: v_fma_f32 v8, s4, v8, -v134
616-
; GCN-NEXT: v_fma_f32 v12, s4, v12, -v134
605+
; GCN-NEXT: v_fma_f32 v28, s4, v28, -v134
606+
; GCN-NEXT: v_fma_f32 v0, s4, v0, -v134
617607
; GCN-NEXT: v_pack_b32_f16 v149, v146, v33
618608
; GCN-NEXT: v_mul_f32_e32 v33, 0x3fb8aa3b, v152
619609
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[154:155], v[144:145], v[112:127]
@@ -622,6 +612,8 @@
622612
; GCN-NEXT: v_fma_f32 v155, s4, v36, -v134
623613
; GCN-NEXT: v_perm_b32 v36, v158, v156, s5
624614
; GCN-NEXT: v_cvt_f16_f32_e32 v154, v160
615+
; GCN-NEXT: v_fma_f32 v8, s4, v8, -v134
616+
; GCN-NEXT: v_fma_f32 v12, s4, v12, -v134
625617
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[60:61], v[148:149], v[64:79]
626618
; GCN-NEXT: v_mul_f32_e32 v60, 0x3fb8aa3b, v32
627619
; GCN-NEXT: ds_read_b128 v[32:35], v57 offset:1152
@@ -795,12 +787,14 @@
795787
; GCN-NEXT: v_cvt_f16_f32_e32 v45, v158
796788
; GCN-NEXT: v_perm_b32 v21, v148, v144, s5
797789
; GCN-NEXT: v_perm_b32 v37, v148, v144, s8
798-
; GCN-NEXT: v_cvt_f16_f32_e32 v44, v63
799790
; GCN-NEXT: ;;#ASMSTART
800791
; GCN-NEXT: s_waitcnt vmcnt(8)
801792
; GCN-NEXT: ;;#ASMEND
802793
; GCN-NEXT: buffer_wbl2 sc0 sc1
803794
; GCN-NEXT: ds_write_b64 v135, v[20:21]
795+
; GCN-NEXT: buffer_wbl2 sc0 sc1
796+
; GCN-NEXT: s_waitcnt lgkmcnt(0)
797+
; GCN-NEXT: ds_write_b64 v136, v[36:37]
804798
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[16:17], v[40:41], v[96:111]
805799
; GCN-NEXT: v_perm_b32 v16, v141, v131, s5
806800
; GCN-NEXT: v_fma_f32 v131, s4, v22, -v134
@@ -810,33 +804,31 @@
810804
; GCN-NEXT: v_perm_b32 v17, v149, v145, s5
811805
; GCN-NEXT: buffer_wbl2 sc0 sc1
812806
; GCN-NEXT: s_waitcnt lgkmcnt(0)
813-
; GCN-NEXT: ds_write_b64 v136, v[36:37]
807+
; GCN-NEXT: ds_write_b64 v137, v[16:17]
814808
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[32:33], v[40:41], v[112:127]
815809
; GCN-NEXT: v_pack_b32_f16 v33, v45, v22
816810
; GCN-NEXT: v_mul_f32_e32 v22, 0x3fb8aa3b, v60
817811
; GCN-NEXT: v_exp_f32_e32 v144, v22
818-
; GCN-NEXT: buffer_wbl2 sc0 sc1
819-
; GCN-NEXT: s_waitcnt lgkmcnt(0)
820-
; GCN-NEXT: ds_write_b64 v137, v[16:17]
821812
; GCN-NEXT: ; implicit-def: $vgpr17
822813
; GCN-NEXT: ; implicit-def: $vgpr22
814+
; GCN-NEXT: v_cvt_f16_f32_e32 v44, v63
823815
; GCN-NEXT: buffer_wbl2 sc0 sc1
824816
; GCN-NEXT: s_waitcnt lgkmcnt(0)
825817
; GCN-NEXT: ds_write_b64 v138, v[42:43]
826818
; GCN-NEXT: v_add_u32_e32 v22, v132, v22
827819
; GCN-NEXT: v_add_u32_e32 v17, v132, v17
828-
; GCN-NEXT: ; implicit-def: $vgpr20
829-
; GCN-NEXT: ; implicit-def: $vgpr21
830820
; GCN-NEXT: s_waitcnt lgkmcnt(0)
831821
; GCN-NEXT: buffer_load_dwordx2 v[40:41], v22, s[0:3], 0 offen sc0 sc1
832822
; GCN-NEXT: s_waitcnt vmcnt(0)
833823
; GCN-NEXT: buffer_inv sc0 sc1
834824
; GCN-NEXT: buffer_load_dwordx2 v[42:43], v17, s[0:3], 0 offen sc0 sc1
835825
; GCN-NEXT: s_waitcnt vmcnt(0)
836826
; GCN-NEXT: buffer_inv sc0 sc1
827+
; GCN-NEXT: ; implicit-def: $vgpr20
828+
; GCN-NEXT: ; implicit-def: $vgpr21
829+
; GCN-NEXT: v_pack_b32_f16 v32, v61, v44
837830
; GCN-NEXT: v_add_u32_e32 v20, v132, v20
838831
; GCN-NEXT: v_add_u32_e32 v21, v132, v21
839-
; GCN-NEXT: v_pack_b32_f16 v32, v61, v44
840832
; GCN-NEXT: buffer_load_dwordx2 v[44:45], v20, s[0:3], 0 offen sc0 sc1
841833
; GCN-NEXT: s_waitcnt vmcnt(0)
842834
; GCN-NEXT: buffer_inv sc0 sc1
@@ -967,27 +959,27 @@
967959
; GCN-NEXT: buffer_wbl2 sc0 sc1
968960
; GCN-NEXT: s_waitcnt lgkmcnt(0)
969961
; GCN-NEXT: ds_write_b64 v136, v[20:21]
962+
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[16:17], v[24:25], v[112:127]
963+
; GCN-NEXT: v_pack_b32_f16 v17, v40, v6
964+
; GCN-NEXT: v_mul_f32_e32 v6, 0x3fb8aa3b, v32
970965
; GCN-NEXT: buffer_wbl2 sc0 sc1
971966
; GCN-NEXT: s_waitcnt lgkmcnt(0)
972967
; GCN-NEXT: ds_write_b64 v137, v[0:1]
973968
; GCN-NEXT: buffer_wbl2 sc0 sc1
974969
; GCN-NEXT: s_waitcnt lgkmcnt(0)
975970
; GCN-NEXT: ds_write_b64 v138, v[26:27]
976-
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[16:17], v[24:25], v[112:127]
977-
; GCN-NEXT: v_pack_b32_f16 v17, v40, v6
978-
; GCN-NEXT: v_mul_f32_e32 v6, 0x3fb8aa3b, v32
971+
; GCN-NEXT: v_exp_f32_e32 v25, v6
979972
; GCN-NEXT: ;;#ASMSTART
980973
; GCN-NEXT: s_waitcnt vmcnt(8)
981974
; GCN-NEXT: ;;#ASMEND
982975
; GCN-NEXT: v_pack_b32_f16 v16, v37, v28
983976
; GCN-NEXT: v_fma_f32 v24, s4, v7, -v134
984-
; GCN-NEXT: v_exp_f32_e32 v25, v6
977+
; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v149
985978
; GCN-NEXT: s_waitcnt lgkmcnt(0)
986979
; GCN-NEXT: ds_read_b128 v[4:7], v139
987980
; GCN-NEXT: s_waitcnt lgkmcnt(0)
988981
; GCN-NEXT: buffer_inv sc0 sc1
989982
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[22:23], v[16:17], v[64:79]
990-
; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v149
991983
; GCN-NEXT: v_exp_f32_e32 v26, v0
992984
; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v29
993985
; GCN-NEXT: v_cvt_f16_f32_e32 v1, v150
@@ -1006,13 +998,13 @@
1006998
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25
1007999
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[18:19], v[16:17], v[112:127]
10081000
; GCN-NEXT: v_pack_b32_f16 v17, v2, v0
1009-
; GCN-NEXT: v_pack_b32_f16 v16, v1, v27
10101001
; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v24
1011-
; GCN-NEXT: v_fma_f32 v18, s4, v11, -v134
1002+
; GCN-NEXT: v_pack_b32_f16 v16, v1, v27
10121003
; GCN-NEXT: v_exp_f32_e32 v19, v0
10131004
; GCN-NEXT: ds_read_b128 v[0:3], v139 offset:1152
10141005
; GCN-NEXT: s_waitcnt lgkmcnt(0)
10151006
; GCN-NEXT: buffer_inv sc0 sc1
1007+
; GCN-NEXT: v_fma_f32 v18, s4, v11, -v134
10161008
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[4:5], v[16:17], v[64:79]
10171009
; GCN-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v8
10181010
; GCN-NEXT: ds_read_b128 v[8:11], v139 offset:1728
@@ -1021,41 +1013,41 @@
10211013
; GCN-NEXT: v_exp_f32_e32 v24, v4
10221014
; GCN-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v28
10231015
; GCN-NEXT: v_cvt_f16_f32_e32 v5, v26
1024-
; GCN-NEXT: v_exp_f32_e32 v27, v4
1025-
; GCN-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v18
1016+
; GCN-NEXT: v_fma_f32 v28, s4, v14, -v134
10261017
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[20:21], v[16:17], v[80:95]
1018+
; GCN-NEXT: v_exp_f32_e32 v27, v4
10271019
; GCN-NEXT: v_cvt_f16_f32_e32 v20, v29
1020+
; GCN-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v18
10281021
; GCN-NEXT: v_fma_f32 v21, s4, v13, -v134
1029-
; GCN-NEXT: v_fma_f32 v28, s4, v14, -v134
10301022
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[0:1], v[16:17], v[96:111]
10311023
; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v30
1032-
; GCN-NEXT: v_cvt_f16_f32_e32 v1, v31
10331024
; GCN-NEXT: v_exp_f32_e32 v30, v0
1025+
; GCN-NEXT: v_cvt_f16_f32_e32 v1, v31
10341026
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v19
10351027
; GCN-NEXT: v_pack_b32_f16 v1, v1, v0
10361028
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[8:9], v[16:17], v[112:127]
10371029
; GCN-NEXT: v_exp_f32_e32 v16, v4
10381030
; GCN-NEXT: v_pack_b32_f16 v0, v5, v20
10391031
; GCN-NEXT: v_mul_f32_e32 v9, 0x3fb8aa3b, v12
1040-
; GCN-NEXT: v_exp_f32_e32 v18, v9
1041-
; GCN-NEXT: v_mul_f32_e32 v9, 0x3fb8aa3b, v21
1042-
; GCN-NEXT: v_exp_f32_e32 v21, v9
10431032
; GCN-NEXT: v_fma_f32 v8, s4, v15, -v134
1033+
; GCN-NEXT: v_cvt_f16_f32_e32 v17, v24
1034+
; GCN-NEXT: v_cvt_f16_f32_e32 v20, v27
10441035
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[6:7], v[0:1], v[64:79]
1036+
; GCN-NEXT: v_exp_f32_e32 v18, v9
1037+
; GCN-NEXT: v_mul_f32_e32 v9, 0x3fb8aa3b, v21
10451038
; GCN-NEXT: ds_read_b128 v[4:7], v57
10461039
; GCN-NEXT: s_waitcnt lgkmcnt(0)
10471040
; GCN-NEXT: buffer_inv sc0 sc1
10481041
; GCN-NEXT: ds_read_b128 v[12:15], v57 offset:576
10491042
; GCN-NEXT: s_waitcnt lgkmcnt(0)
10501043
; GCN-NEXT: buffer_inv sc0 sc1
1051-
; GCN-NEXT: v_cvt_f16_f32_e32 v17, v24
1052-
; GCN-NEXT: v_cvt_f16_f32_e32 v20, v27
10531044
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[22:23], v[0:1], v[80:95]
1054-
; GCN-NEXT: v_cvt_f16_f32_e32 v22, v21
1045+
; GCN-NEXT: v_exp_f32_e32 v21, v9
10551046
; GCN-NEXT: v_cvt_f16_f32_e32 v23, v18
1047+
; GCN-NEXT: v_cvt_f16_f32_e32 v22, v21
10561048
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[2:3], v[0:1], v[96:111]
1057-
; GCN-NEXT: v_cvt_f16_f32_e32 v3, v30
10581049
; GCN-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v28
1050+
; GCN-NEXT: v_cvt_f16_f32_e32 v3, v30
10591051
; GCN-NEXT: v_exp_f32_e32 v2, v2
10601052
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[10:11], v[0:1], v[112:127]
10611053
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16
@@ -1110,29 +1102,30 @@
11101102
; GCN-NEXT: v_add_f32_e32 v3, v36, v3
11111103
; GCN-NEXT: v_add_f32_e32 v3, v39, v3
11121104
; GCN-NEXT: v_add_f32_e32 v3, v148, v3
1113-
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[12:13], v[8:9], v[80:95]
11141105
; GCN-NEXT: v_add_f32_e32 v3, v34, v3
11151106
; GCN-NEXT: v_add_f32_e32 v3, v150, v3
1116-
; GCN-NEXT: v_cvt_f16_f32_e32 v1, v10
1117-
; GCN-NEXT: v_cvt_f16_f32_e32 v11, v2
11181107
; GCN-NEXT: v_add_f32_e32 v3, v38, v3
11191108
; GCN-NEXT: v_add_f32_e32 v3, v42, v3
11201109
; GCN-NEXT: v_add_f32_e32 v3, v25, v3
11211110
; GCN-NEXT: v_add_f32_e32 v3, v26, v3
1122-
; GCN-NEXT: v_pack_b32_f16 v1, v11, v1
1123-
; GCN-NEXT: v_pack_b32_f16 v0, v23, v22
11241111
; GCN-NEXT: v_add_f32_e32 v3, v29, v3
1112+
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[4:5], v[8:9], v[64:79]
11251113
; GCN-NEXT: v_add_f32_e32 v3, v31, v3
1126-
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[14:15], v[0:1], v[80:95]
11271114
; GCN-NEXT: v_add_f32_e32 v3, v19, v3
11281115
; GCN-NEXT: v_add_f32_e32 v3, v24, v3
1116+
; GCN-NEXT: v_cvt_f16_f32_e32 v1, v10
1117+
; GCN-NEXT: v_cvt_f16_f32_e32 v11, v2
11291118
; GCN-NEXT: v_add_f32_e32 v3, v27, v3
11301119
; GCN-NEXT: v_add_f32_e32 v3, v30, v3
1120+
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[12:13], v[8:9], v[80:95]
11311121
; GCN-NEXT: v_add_f32_e32 v3, v16, v3
11321122
; GCN-NEXT: v_add_f32_e32 v3, v18, v3
1123+
; GCN-NEXT: v_pack_b32_f16 v1, v11, v1
1124+
; GCN-NEXT: v_pack_b32_f16 v0, v23, v22
11331125
; GCN-NEXT: v_add_f32_e32 v3, v21, v3
1134-
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[4:5], v[8:9], v[64:79]
1126+
; GCN-NEXT: s_nop 0
11351127
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[6:7], v[0:1], v[64:79]
1128+
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[14:15], v[0:1], v[80:95]
11361129
; GCN-NEXT: v_add_f32_e32 v0, v2, v3
11371130
; GCN-NEXT: v_add_f32_e32 v4, v10, v0
11381131
; GCN-NEXT: ds_bpermute_b32 v5, v133, v4

0 commit comments

Comments
 (0)