|
464 | 464 | ; GCN-NEXT: buffer_load_dwordx2 v[130:131], v64, s[0:3], 0 offen sc0 sc1 |
465 | 465 | ; GCN-NEXT: s_waitcnt vmcnt(0) |
466 | 466 | ; GCN-NEXT: buffer_inv sc0 sc1 |
| 467 | + ; GCN-NEXT: v_fma_f32 v57, s4, v57, -v134 |
467 | 468 | ; GCN-NEXT: v_fma_f32 v48, s4, v48, -v134 |
| 469 | + ; GCN-NEXT: v_fma_f32 v96, s4, v58, -v134 |
| 470 | + ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v57 |
468 | 471 | ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v48 |
469 | 472 | ; GCN-NEXT: v_fma_f32 v64, s4, v49, -v134 |
| 473 | + ; GCN-NEXT: v_exp_f32_e32 v163, v57 |
| 474 | + ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v96 |
470 | 475 | ; GCN-NEXT: v_fma_f32 v66, s4, v50, -v134 |
| 476 | + ; GCN-NEXT: v_exp_f32_e32 v164, v57 |
471 | 477 | ; GCN-NEXT: v_exp_f32_e32 v49, v48 |
472 | 478 | ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v64 |
473 | 479 | ; GCN-NEXT: v_fma_f32 v67, s4, v51, -v134 |
|
489 | 495 | ; GCN-NEXT: ds_read_b128 v[140:143], v139 |
490 | 496 | ; GCN-NEXT: s_waitcnt lgkmcnt(0) |
491 | 497 | ; GCN-NEXT: buffer_inv sc0 sc1 |
492 | | - ; GCN-NEXT: ds_read_b128 v[144:147], v139 offset:576 |
493 | | - ; GCN-NEXT: s_waitcnt lgkmcnt(0) |
494 | | - ; GCN-NEXT: buffer_inv sc0 sc1 |
495 | 498 | ; GCN-NEXT: v_exp_f32_e32 v54, v48 |
496 | 499 | ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v70 |
497 | 500 | ; GCN-NEXT: v_exp_f32_e32 v55, v48 |
498 | 501 | ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v71 |
| 502 | + ; GCN-NEXT: ds_read_b128 v[144:147], v139 offset:576 |
| 503 | + ; GCN-NEXT: s_waitcnt lgkmcnt(0) |
| 504 | + ; GCN-NEXT: buffer_inv sc0 sc1 |
499 | 505 | ; GCN-NEXT: v_fma_f32 v66, s4, v56, -v134 |
500 | 506 | ; GCN-NEXT: v_exp_f32_e32 v56, v48 |
501 | 507 | ; GCN-NEXT: v_sub_f32_e32 v48, v65, v134 |
502 | | - ; GCN-NEXT: ds_read_b128 v[148:151], v139 offset:1152 |
503 | | - ; GCN-NEXT: s_waitcnt lgkmcnt(0) |
504 | | - ; GCN-NEXT: buffer_inv sc0 sc1 |
505 | 508 | ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v49 |
506 | 509 | ; GCN-NEXT: v_cvt_f16_f32_e32 v67, v50 |
507 | 510 | ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v51 |
508 | | - ; GCN-NEXT: v_fma_f32 v96, s4, v58, -v134 |
509 | 511 | ; GCN-NEXT: v_cvt_f16_f32_e32 v58, v52 |
510 | 512 | ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v48 |
| 513 | + ; GCN-NEXT: ds_read_b128 v[148:151], v139 offset:1152 |
| 514 | + ; GCN-NEXT: s_waitcnt lgkmcnt(0) |
| 515 | + ; GCN-NEXT: buffer_inv sc0 sc1 |
511 | 516 | ; GCN-NEXT: v_exp_f32_e32 v48, v48 |
512 | | - ; GCN-NEXT: v_fma_f32 v57, s4, v57, -v134 |
513 | 517 | ; GCN-NEXT: v_pack_b32_f16 v161, v68, v58 |
514 | 518 | ; GCN-NEXT: v_pack_b32_f16 v160, v64, v67 |
515 | 519 | ; GCN-NEXT: v_mul_f32_e32 v58, 0x3fb8aa3b, v66 |
516 | 520 | ; GCN-NEXT: ; implicit-def: $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 |
517 | 521 | ; GCN-NEXT: ds_read_b128 v[152:155], v139 offset:1728 |
518 | 522 | ; GCN-NEXT: s_waitcnt lgkmcnt(0) |
519 | 523 | ; GCN-NEXT: buffer_inv sc0 sc1 |
520 | | - ; GCN-NEXT: ; implicit-def: $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 |
| 524 | + ; GCN-NEXT: v_fma_f32 v162, s4, v61, -v134 |
| 525 | + ; GCN-NEXT: v_cvt_f16_f32_e32 v61, v55 |
| 526 | + ; GCN-NEXT: v_cvt_f16_f32_e32 v57, v56 |
521 | 527 | ; GCN-NEXT: v_pk_mul_f32 v[64:65], v[64:65], v[48:49] op_sel_hi:[1,0] |
522 | 528 | ; GCN-NEXT: v_pk_mul_f32 v[66:67], v[66:67], v[48:49] op_sel_hi:[1,0] |
523 | 529 | ; GCN-NEXT: v_pk_mul_f32 v[68:69], v[68:69], v[48:49] op_sel_hi:[1,0] |
|
526 | 532 | ; GCN-NEXT: v_pk_mul_f32 v[74:75], v[74:75], v[48:49] op_sel_hi:[1,0] |
527 | 533 | ; GCN-NEXT: v_pk_mul_f32 v[76:77], v[76:77], v[48:49] op_sel_hi:[1,0] |
528 | 534 | ; GCN-NEXT: v_pk_mul_f32 v[78:79], v[78:79], v[48:49] op_sel_hi:[1,0] |
529 | | - ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v57 |
| 535 | + ; GCN-NEXT: ; implicit-def: $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 |
| 536 | + ; GCN-NEXT: v_fma_f32 v59, s4, v59, -v134 |
530 | 537 | ; GCN-NEXT: v_pk_mul_f32 v[80:81], v[80:81], v[48:49] op_sel_hi:[1,0] |
531 | 538 | ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[140:141], v[160:161], v[64:79] |
532 | 539 | ; GCN-NEXT: v_mul_f32_e64 v82, v82, v48 |
|
535 | 542 | ; GCN-NEXT: v_mul_f32_e64 v85, v85, v48 |
536 | 543 | ; GCN-NEXT: v_mul_f32_e64 v86, v86, v48 |
537 | 544 | ; GCN-NEXT: v_mul_f32_e64 v87, v87, v48 |
538 | | - ; GCN-NEXT: v_pk_mul_f32 v[82:83], v[82:83], v[48:49] op_sel_hi:[1,0] |
539 | | - ; GCN-NEXT: v_pk_mul_f32 v[84:85], v[84:85], v[48:49] op_sel_hi:[1,0] |
540 | | - ; GCN-NEXT: v_pk_mul_f32 v[86:87], v[86:87], v[48:49] op_sel_hi:[1,0] |
541 | 545 | ; GCN-NEXT: v_pk_mul_f32 v[88:89], v[88:89], v[48:49] op_sel_hi:[1,0] |
542 | 546 | ; GCN-NEXT: v_pk_mul_f32 v[90:91], v[90:91], v[48:49] op_sel_hi:[1,0] |
543 | 547 | ; GCN-NEXT: v_pk_mul_f32 v[92:93], v[92:93], v[48:49] op_sel_hi:[1,0] |
544 | 548 | ; GCN-NEXT: v_pk_mul_f32 v[94:95], v[94:95], v[48:49] op_sel_hi:[1,0] |
545 | | - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[140:141], v[160:161], v[64:79] |
| 549 | + ; GCN-NEXT: ; implicit-def: $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111 |
546 | 550 | ; GCN-NEXT: v_exp_f32_e32 v58, v58 |
547 | | - ; GCN-NEXT: v_fma_f32 v162, s4, v61, -v134 |
548 | | - ; GCN-NEXT: v_cvt_f16_f32_e32 v61, v55 |
549 | | - ; GCN-NEXT: v_cvt_f16_f32_e32 v140, v53 |
550 | | - ; GCN-NEXT: v_cvt_f16_f32_e32 v141, v54 |
551 | | - ; GCN-NEXT: v_fma_f32 v59, s4, v59, -v134 |
552 | | - ; GCN-NEXT: v_fma_f32 v60, s4, v60, -v134 |
| 551 | + ; GCN-NEXT: v_pk_mul_f32 v[96:97], v[96:97], v[48:49] op_sel_hi:[1,0] |
553 | 552 | ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[144:145], v[160:161], v[80:95] |
554 | 553 | ; GCN-NEXT: v_mul_f32_e64 v98, v98, v48 |
555 | 554 | ; GCN-NEXT: v_mul_f32_e64 v99, v99, v48 |
556 | 555 | ; GCN-NEXT: v_mul_f32_e64 v100, v100, v48 |
557 | 556 | ; GCN-NEXT: v_mul_f32_e64 v101, v101, v48 |
558 | 557 | ; GCN-NEXT: v_mul_f32_e64 v102, v102, v48 |
559 | 558 | ; GCN-NEXT: v_mul_f32_e64 v103, v103, v48 |
560 | | - ; GCN-NEXT: v_exp_f32_e32 v163, v57 |
561 | | - ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v96 |
562 | | - ; GCN-NEXT: ; implicit-def: $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111 |
563 | | - ; GCN-NEXT: v_pk_mul_f32 v[112:113], v[112:113], v[48:49] op_sel_hi:[1,0] |
564 | | - ; GCN-NEXT: v_pk_mul_f32 v[96:97], v[96:97], v[48:49] op_sel_hi:[1,0] |
565 | | - ; GCN-NEXT: v_pk_mul_f32 v[98:99], v[98:99], v[48:49] op_sel_hi:[1,0] |
566 | | - ; GCN-NEXT: v_pk_mul_f32 v[100:101], v[100:101], v[48:49] op_sel_hi:[1,0] |
567 | | - ; GCN-NEXT: v_pk_mul_f32 v[102:103], v[102:103], v[48:49] op_sel_hi:[1,0] |
568 | 559 | ; GCN-NEXT: v_pk_mul_f32 v[104:105], v[104:105], v[48:49] op_sel_hi:[1,0] |
569 | 560 | ; GCN-NEXT: v_pk_mul_f32 v[106:107], v[106:107], v[48:49] op_sel_hi:[1,0] |
570 | 561 | ; GCN-NEXT: v_pk_mul_f32 v[108:109], v[108:109], v[48:49] op_sel_hi:[1,0] |
|
580 | 571 | ; GCN-NEXT: v_mul_f32_e64 v113, v113, v48 |
581 | 572 | ; GCN-NEXT: v_mul_f32_e64 v114, v114, v48 |
582 | 573 | ; GCN-NEXT: v_mul_f32_e64 v115, v115, v48 |
583 | | - ; GCN-NEXT: v_pk_mul_f32 v[114:115], v[114:115], v[48:49] op_sel_hi:[1,0] |
584 | 574 | ; GCN-NEXT: v_pk_mul_f32 v[116:117], v[116:117], v[48:49] op_sel_hi:[1,0] |
585 | | - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[148:149], v[160:161], v[96:111] |
586 | | - ; GCN-NEXT: v_exp_f32_e32 v164, v57 |
587 | | - ; GCN-NEXT: v_cvt_f16_f32_e32 v57, v56 |
588 | 575 | ; GCN-NEXT: v_pk_mul_f32 v[118:119], v[118:119], v[48:49] op_sel_hi:[1,0] |
589 | 576 | ; GCN-NEXT: v_pk_mul_f32 v[120:121], v[120:121], v[48:49] op_sel_hi:[1,0] |
590 | 577 | ; GCN-NEXT: v_pk_mul_f32 v[122:123], v[122:123], v[48:49] op_sel_hi:[1,0] |
591 | 578 | ; GCN-NEXT: v_pk_mul_f32 v[124:125], v[124:125], v[48:49] op_sel_hi:[1,0] |
592 | 579 | ; GCN-NEXT: v_pk_mul_f32 v[126:127], v[126:127], v[48:49] op_sel_hi:[1,0] |
593 | | - ; GCN-NEXT: v_pack_b32_f16 v145, v61, v57 |
594 | | - ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v59 |
595 | 580 | ; GCN-NEXT: v_fma_f32 v148, s4, v62, -v134 |
| 581 | + ; GCN-NEXT: v_pack_b32_f16 v144, v140, v141 |
596 | 582 | ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[152:153], v[160:161], v[112:127] |
597 | 583 | ; GCN-NEXT: v_fma_f32 v152, s4, v63, -v134 |
598 | | - ; GCN-NEXT: v_pack_b32_f16 v144, v140, v141 |
599 | | - ; GCN-NEXT: v_exp_f32_e32 v59, v57 |
600 | 584 | ; GCN-NEXT: v_mul_f32_e32 v149, 0x3fb8aa3b, v60 |
601 | 585 | ; GCN-NEXT: ; implicit-def: $vgpr57 |
602 | 586 | ; GCN-NEXT: ds_read_b128 v[60:63], v57 |
603 | 587 | ; GCN-NEXT: s_waitcnt lgkmcnt(0) |
604 | 588 | ; GCN-NEXT: buffer_inv sc0 sc1 |
| 589 | + ; GCN-NEXT: v_exp_f32_e32 v160, v149 |
605 | 590 | ; GCN-NEXT: v_fma_f32 v161, s4, v33, -v134 |
606 | 591 | ; GCN-NEXT: v_mul_f32_e32 v33, 0x3fb8aa3b, v148 |
607 | | - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[142:143], v[144:145], v[64:79] |
608 | | - ; GCN-NEXT: v_exp_f32_e32 v160, v149 |
609 | 592 | ; GCN-NEXT: v_cvt_f16_f32_e32 v153, v58 |
| 593 | + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[142:143], v[144:145], v[64:79] |
610 | 594 | ; GCN-NEXT: v_fma_f32 v32, s4, v32, -v134 |
611 | 595 | ; GCN-NEXT: ds_read_b128 v[140:143], v57 offset:576 |
612 | 596 | ; GCN-NEXT: s_waitcnt lgkmcnt(0) |
613 | 597 | ; GCN-NEXT: buffer_inv sc0 sc1 |
614 | 598 | ; GCN-NEXT: v_fma_f32 v40, s4, v40, -v134 |
615 | 599 | ; GCN-NEXT: v_fma_f32 v44, s4, v44, -v134 |
616 | 600 | ; GCN-NEXT: v_fma_f32 v16, s4, v16, -v134 |
| 601 | + ; GCN-NEXT: v_fma_f32 v166, s4, v20, -v134 |
| 602 | + ; GCN-NEXT: v_fma_f32 v24, s4, v24, -v134 |
617 | 603 | ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[146:147], v[144:145], v[80:95] |
618 | 604 | ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v162 |
619 | 605 | ; GCN-NEXT: v_cvt_f16_f32_e32 v147, v163 |
620 | 606 | ; GCN-NEXT: v_exp_f32_e32 v162, v146 |
621 | 607 | ; GCN-NEXT: v_cvt_f16_f32_e32 v146, v164 |
622 | | - ; GCN-NEXT: v_fma_f32 v166, s4, v20, -v134 |
| 608 | + ; GCN-NEXT: v_fma_f32 v28, s4, v28, -v134 |
623 | 609 | ; GCN-NEXT: v_pack_b32_f16 v148, v153, v147 |
624 | | - ; GCN-NEXT: v_fma_f32 v24, s4, v24, -v134 |
| 610 | + ; GCN-NEXT: v_fma_f32 v0, s4, v0, -v134 |
625 | 611 | ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[150:151], v[144:145], v[96:111] |
626 | 612 | ; GCN-NEXT: v_exp_f32_e32 v151, v33 |
627 | 613 | ; GCN-NEXT: v_cvt_f16_f32_e32 v33, v59 |
628 | 614 | ; GCN-NEXT: v_fma_f32 v150, s4, v34, -v134 |
629 | | - ; GCN-NEXT: v_fma_f32 v28, s4, v28, -v134 |
630 | | - ; GCN-NEXT: v_fma_f32 v0, s4, v0, -v134 |
| 615 | + ; GCN-NEXT: v_fma_f32 v8, s4, v8, -v134 |
| 616 | + ; GCN-NEXT: v_fma_f32 v12, s4, v12, -v134 |
631 | 617 | ; GCN-NEXT: v_pack_b32_f16 v149, v146, v33 |
632 | 618 | ; GCN-NEXT: v_mul_f32_e32 v33, 0x3fb8aa3b, v152 |
633 | 619 | ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[154:155], v[144:145], v[112:127] |
|
636 | 622 | ; GCN-NEXT: v_fma_f32 v155, s4, v36, -v134 |
637 | 623 | ; GCN-NEXT: v_perm_b32 v36, v158, v156, s5 |
638 | 624 | ; GCN-NEXT: v_cvt_f16_f32_e32 v154, v160 |
639 | | - ; GCN-NEXT: v_fma_f32 v8, s4, v8, -v134 |
640 | | - ; GCN-NEXT: v_fma_f32 v12, s4, v12, -v134 |
641 | 625 | ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[60:61], v[148:149], v[64:79] |
642 | 626 | ; GCN-NEXT: v_mul_f32_e32 v60, 0x3fb8aa3b, v32 |
643 | 627 | ; GCN-NEXT: ds_read_b128 v[32:35], v57 offset:1152 |
|
811 | 795 | ; GCN-NEXT: v_cvt_f16_f32_e32 v45, v158 |
812 | 796 | ; GCN-NEXT: v_perm_b32 v21, v148, v144, s5 |
813 | 797 | ; GCN-NEXT: v_perm_b32 v37, v148, v144, s8 |
| 798 | + ; GCN-NEXT: v_cvt_f16_f32_e32 v44, v63 |
814 | 799 | ; GCN-NEXT: ;;#ASMSTART |
815 | 800 | ; GCN-NEXT: s_waitcnt vmcnt(8) |
816 | 801 | ; GCN-NEXT: ;;#ASMEND |
817 | 802 | ; GCN-NEXT: buffer_wbl2 sc0 sc1 |
818 | 803 | ; GCN-NEXT: ds_write_b64 v135, v[20:21] |
819 | | - ; GCN-NEXT: buffer_wbl2 sc0 sc1 |
820 | | - ; GCN-NEXT: s_waitcnt lgkmcnt(0) |
821 | | - ; GCN-NEXT: ds_write_b64 v136, v[36:37] |
822 | 804 | ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[16:17], v[40:41], v[96:111] |
823 | 805 | ; GCN-NEXT: v_perm_b32 v16, v141, v131, s5 |
824 | 806 | ; GCN-NEXT: v_fma_f32 v131, s4, v22, -v134 |
|
828 | 810 | ; GCN-NEXT: v_perm_b32 v17, v149, v145, s5 |
829 | 811 | ; GCN-NEXT: buffer_wbl2 sc0 sc1 |
830 | 812 | ; GCN-NEXT: s_waitcnt lgkmcnt(0) |
831 | | - ; GCN-NEXT: ds_write_b64 v137, v[16:17] |
| 813 | + ; GCN-NEXT: ds_write_b64 v136, v[36:37] |
832 | 814 | ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[32:33], v[40:41], v[112:127] |
833 | 815 | ; GCN-NEXT: v_pack_b32_f16 v33, v45, v22 |
834 | 816 | ; GCN-NEXT: v_mul_f32_e32 v22, 0x3fb8aa3b, v60 |
835 | 817 | ; GCN-NEXT: v_exp_f32_e32 v144, v22 |
| 818 | + ; GCN-NEXT: buffer_wbl2 sc0 sc1 |
| 819 | + ; GCN-NEXT: s_waitcnt lgkmcnt(0) |
| 820 | + ; GCN-NEXT: ds_write_b64 v137, v[16:17] |
836 | 821 | ; GCN-NEXT: ; implicit-def: $vgpr17 |
837 | 822 | ; GCN-NEXT: ; implicit-def: $vgpr22 |
838 | | - ; GCN-NEXT: v_cvt_f16_f32_e32 v44, v63 |
839 | 823 | ; GCN-NEXT: buffer_wbl2 sc0 sc1 |
840 | 824 | ; GCN-NEXT: s_waitcnt lgkmcnt(0) |
841 | 825 | ; GCN-NEXT: ds_write_b64 v138, v[42:43] |
842 | 826 | ; GCN-NEXT: v_add_u32_e32 v22, v132, v22 |
843 | 827 | ; GCN-NEXT: v_add_u32_e32 v17, v132, v17 |
| 828 | + ; GCN-NEXT: ; implicit-def: $vgpr20 |
| 829 | + ; GCN-NEXT: ; implicit-def: $vgpr21 |
844 | 830 | ; GCN-NEXT: s_waitcnt lgkmcnt(0) |
845 | 831 | ; GCN-NEXT: buffer_load_dwordx2 v[40:41], v22, s[0:3], 0 offen sc0 sc1 |
846 | 832 | ; GCN-NEXT: s_waitcnt vmcnt(0) |
847 | 833 | ; GCN-NEXT: buffer_inv sc0 sc1 |
848 | 834 | ; GCN-NEXT: buffer_load_dwordx2 v[42:43], v17, s[0:3], 0 offen sc0 sc1 |
849 | 835 | ; GCN-NEXT: s_waitcnt vmcnt(0) |
850 | 836 | ; GCN-NEXT: buffer_inv sc0 sc1 |
851 | | - ; GCN-NEXT: ; implicit-def: $vgpr20 |
852 | | - ; GCN-NEXT: ; implicit-def: $vgpr21 |
853 | | - ; GCN-NEXT: v_pack_b32_f16 v32, v61, v44 |
854 | 837 | ; GCN-NEXT: v_add_u32_e32 v20, v132, v20 |
855 | 838 | ; GCN-NEXT: v_add_u32_e32 v21, v132, v21 |
| 839 | + ; GCN-NEXT: v_pack_b32_f16 v32, v61, v44 |
856 | 840 | ; GCN-NEXT: buffer_load_dwordx2 v[44:45], v20, s[0:3], 0 offen sc0 sc1 |
857 | 841 | ; GCN-NEXT: s_waitcnt vmcnt(0) |
858 | 842 | ; GCN-NEXT: buffer_inv sc0 sc1 |
|
983 | 967 | ; GCN-NEXT: buffer_wbl2 sc0 sc1 |
984 | 968 | ; GCN-NEXT: s_waitcnt lgkmcnt(0) |
985 | 969 | ; GCN-NEXT: ds_write_b64 v136, v[20:21] |
986 | | - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[16:17], v[24:25], v[112:127] |
987 | | - ; GCN-NEXT: v_pack_b32_f16 v17, v40, v6 |
988 | | - ; GCN-NEXT: v_mul_f32_e32 v6, 0x3fb8aa3b, v32 |
989 | 970 | ; GCN-NEXT: buffer_wbl2 sc0 sc1 |
990 | 971 | ; GCN-NEXT: s_waitcnt lgkmcnt(0) |
991 | 972 | ; GCN-NEXT: ds_write_b64 v137, v[0:1] |
992 | 973 | ; GCN-NEXT: buffer_wbl2 sc0 sc1 |
993 | 974 | ; GCN-NEXT: s_waitcnt lgkmcnt(0) |
994 | 975 | ; GCN-NEXT: ds_write_b64 v138, v[26:27] |
995 | | - ; GCN-NEXT: v_exp_f32_e32 v25, v6 |
| 976 | + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[16:17], v[24:25], v[112:127] |
| 977 | + ; GCN-NEXT: v_pack_b32_f16 v17, v40, v6 |
| 978 | + ; GCN-NEXT: v_mul_f32_e32 v6, 0x3fb8aa3b, v32 |
996 | 979 | ; GCN-NEXT: ;;#ASMSTART |
997 | 980 | ; GCN-NEXT: s_waitcnt vmcnt(8) |
998 | 981 | ; GCN-NEXT: ;;#ASMEND |
999 | 982 | ; GCN-NEXT: v_pack_b32_f16 v16, v37, v28 |
1000 | 983 | ; GCN-NEXT: v_fma_f32 v24, s4, v7, -v134 |
1001 | | - ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v149 |
| 984 | + ; GCN-NEXT: v_exp_f32_e32 v25, v6 |
1002 | 985 | ; GCN-NEXT: s_waitcnt lgkmcnt(0) |
1003 | 986 | ; GCN-NEXT: ds_read_b128 v[4:7], v139 |
1004 | 987 | ; GCN-NEXT: s_waitcnt lgkmcnt(0) |
1005 | 988 | ; GCN-NEXT: buffer_inv sc0 sc1 |
1006 | 989 | ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[22:23], v[16:17], v[64:79] |
| 990 | + ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v149 |
1007 | 991 | ; GCN-NEXT: v_exp_f32_e32 v26, v0 |
1008 | 992 | ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v29 |
1009 | 993 | ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v150 |
|
1022 | 1006 | ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25 |
1023 | 1007 | ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[18:19], v[16:17], v[112:127] |
1024 | 1008 | ; GCN-NEXT: v_pack_b32_f16 v17, v2, v0 |
1025 | | - ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v24 |
1026 | 1009 | ; GCN-NEXT: v_pack_b32_f16 v16, v1, v27 |
| 1010 | + ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v24 |
| 1011 | + ; GCN-NEXT: v_fma_f32 v18, s4, v11, -v134 |
1027 | 1012 | ; GCN-NEXT: v_exp_f32_e32 v19, v0 |
1028 | 1013 | ; GCN-NEXT: ds_read_b128 v[0:3], v139 offset:1152 |
1029 | 1014 | ; GCN-NEXT: s_waitcnt lgkmcnt(0) |
1030 | 1015 | ; GCN-NEXT: buffer_inv sc0 sc1 |
1031 | | - ; GCN-NEXT: v_fma_f32 v18, s4, v11, -v134 |
1032 | 1016 | ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[4:5], v[16:17], v[64:79] |
1033 | 1017 | ; GCN-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v8 |
1034 | 1018 | ; GCN-NEXT: ds_read_b128 v[8:11], v139 offset:1728 |
|
1037 | 1021 | ; GCN-NEXT: v_exp_f32_e32 v24, v4 |
1038 | 1022 | ; GCN-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v28 |
1039 | 1023 | ; GCN-NEXT: v_cvt_f16_f32_e32 v5, v26 |
1040 | | - ; GCN-NEXT: v_fma_f32 v28, s4, v14, -v134 |
1041 | | - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[20:21], v[16:17], v[80:95] |
1042 | 1024 | ; GCN-NEXT: v_exp_f32_e32 v27, v4 |
1043 | | - ; GCN-NEXT: v_cvt_f16_f32_e32 v20, v29 |
1044 | 1025 | ; GCN-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v18 |
| 1026 | + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[20:21], v[16:17], v[80:95] |
| 1027 | + ; GCN-NEXT: v_cvt_f16_f32_e32 v20, v29 |
1045 | 1028 | ; GCN-NEXT: v_fma_f32 v21, s4, v13, -v134 |
| 1029 | + ; GCN-NEXT: v_fma_f32 v28, s4, v14, -v134 |
1046 | 1030 | ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[0:1], v[16:17], v[96:111] |
1047 | 1031 | ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v30 |
1048 | | - ; GCN-NEXT: v_exp_f32_e32 v30, v0 |
1049 | 1032 | ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v31 |
| 1033 | + ; GCN-NEXT: v_exp_f32_e32 v30, v0 |
1050 | 1034 | ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v19 |
1051 | 1035 | ; GCN-NEXT: v_pack_b32_f16 v1, v1, v0 |
1052 | 1036 | ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[8:9], v[16:17], v[112:127] |
1053 | 1037 | ; GCN-NEXT: v_exp_f32_e32 v16, v4 |
1054 | 1038 | ; GCN-NEXT: v_pack_b32_f16 v0, v5, v20 |
1055 | 1039 | ; GCN-NEXT: v_mul_f32_e32 v9, 0x3fb8aa3b, v12 |
1056 | | - ; GCN-NEXT: v_fma_f32 v8, s4, v15, -v134 |
1057 | | - ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v24 |
1058 | | - ; GCN-NEXT: v_cvt_f16_f32_e32 v20, v27 |
1059 | | - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[6:7], v[0:1], v[64:79] |
1060 | 1040 | ; GCN-NEXT: v_exp_f32_e32 v18, v9 |
1061 | 1041 | ; GCN-NEXT: v_mul_f32_e32 v9, 0x3fb8aa3b, v21 |
| 1042 | + ; GCN-NEXT: v_exp_f32_e32 v21, v9 |
| 1043 | + ; GCN-NEXT: v_fma_f32 v8, s4, v15, -v134 |
| 1044 | + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[6:7], v[0:1], v[64:79] |
1062 | 1045 | ; GCN-NEXT: ds_read_b128 v[4:7], v57 |
1063 | 1046 | ; GCN-NEXT: s_waitcnt lgkmcnt(0) |
1064 | 1047 | ; GCN-NEXT: buffer_inv sc0 sc1 |
1065 | 1048 | ; GCN-NEXT: ds_read_b128 v[12:15], v57 offset:576 |
1066 | 1049 | ; GCN-NEXT: s_waitcnt lgkmcnt(0) |
1067 | 1050 | ; GCN-NEXT: buffer_inv sc0 sc1 |
| 1051 | + ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v24 |
| 1052 | + ; GCN-NEXT: v_cvt_f16_f32_e32 v20, v27 |
1068 | 1053 | ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[22:23], v[0:1], v[80:95] |
1069 | | - ; GCN-NEXT: v_exp_f32_e32 v21, v9 |
1070 | | - ; GCN-NEXT: v_cvt_f16_f32_e32 v23, v18 |
1071 | 1054 | ; GCN-NEXT: v_cvt_f16_f32_e32 v22, v21 |
| 1055 | + ; GCN-NEXT: v_cvt_f16_f32_e32 v23, v18 |
1072 | 1056 | ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[2:3], v[0:1], v[96:111] |
1073 | | - ; GCN-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v28 |
1074 | 1057 | ; GCN-NEXT: v_cvt_f16_f32_e32 v3, v30 |
| 1058 | + ; GCN-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v28 |
1075 | 1059 | ; GCN-NEXT: v_exp_f32_e32 v2, v2 |
1076 | 1060 | ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[10:11], v[0:1], v[112:127] |
1077 | 1061 | ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16 |
|
1126 | 1110 | ; GCN-NEXT: v_add_f32_e32 v3, v36, v3 |
1127 | 1111 | ; GCN-NEXT: v_add_f32_e32 v3, v39, v3 |
1128 | 1112 | ; GCN-NEXT: v_add_f32_e32 v3, v148, v3 |
| 1113 | + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[12:13], v[8:9], v[80:95] |
1129 | 1114 | ; GCN-NEXT: v_add_f32_e32 v3, v34, v3 |
1130 | 1115 | ; GCN-NEXT: v_add_f32_e32 v3, v150, v3 |
| 1116 | + ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v10 |
| 1117 | + ; GCN-NEXT: v_cvt_f16_f32_e32 v11, v2 |
1131 | 1118 | ; GCN-NEXT: v_add_f32_e32 v3, v38, v3 |
1132 | 1119 | ; GCN-NEXT: v_add_f32_e32 v3, v42, v3 |
1133 | 1120 | ; GCN-NEXT: v_add_f32_e32 v3, v25, v3 |
1134 | 1121 | ; GCN-NEXT: v_add_f32_e32 v3, v26, v3 |
| 1122 | + ; GCN-NEXT: v_pack_b32_f16 v1, v11, v1 |
| 1123 | + ; GCN-NEXT: v_pack_b32_f16 v0, v23, v22 |
1135 | 1124 | ; GCN-NEXT: v_add_f32_e32 v3, v29, v3 |
1136 | | - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[4:5], v[8:9], v[64:79] |
1137 | 1125 | ; GCN-NEXT: v_add_f32_e32 v3, v31, v3 |
| 1126 | + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[14:15], v[0:1], v[80:95] |
1138 | 1127 | ; GCN-NEXT: v_add_f32_e32 v3, v19, v3 |
1139 | 1128 | ; GCN-NEXT: v_add_f32_e32 v3, v24, v3 |
1140 | | - ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v10 |
1141 | | - ; GCN-NEXT: v_cvt_f16_f32_e32 v11, v2 |
1142 | 1129 | ; GCN-NEXT: v_add_f32_e32 v3, v27, v3 |
1143 | 1130 | ; GCN-NEXT: v_add_f32_e32 v3, v30, v3 |
1144 | | - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[12:13], v[8:9], v[80:95] |
1145 | 1131 | ; GCN-NEXT: v_add_f32_e32 v3, v16, v3 |
1146 | 1132 | ; GCN-NEXT: v_add_f32_e32 v3, v18, v3 |
1147 | | - ; GCN-NEXT: v_pack_b32_f16 v1, v11, v1 |
1148 | | - ; GCN-NEXT: v_pack_b32_f16 v0, v23, v22 |
1149 | 1133 | ; GCN-NEXT: v_add_f32_e32 v3, v21, v3 |
1150 | | - ; GCN-NEXT: s_nop 0 |
| 1134 | + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[4:5], v[8:9], v[64:79] |
1151 | 1135 | ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[6:7], v[0:1], v[64:79] |
1152 | | - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[14:15], v[0:1], v[80:95] |
1153 | 1136 | ; GCN-NEXT: v_add_f32_e32 v0, v2, v3 |
1154 | 1137 | ; GCN-NEXT: v_add_f32_e32 v4, v10, v0 |
1155 | 1138 | ; GCN-NEXT: ds_bpermute_b32 v5, v133, v4 |
|
0 commit comments