| 
464 | 464 |   ; GCN-NEXT:    buffer_load_dwordx2 v[130:131], v64, s[0:3], 0 offen sc0 sc1  | 
465 | 465 |   ; GCN-NEXT:    s_waitcnt vmcnt(0)  | 
466 | 466 |   ; GCN-NEXT:    buffer_inv sc0 sc1  | 
467 |  | -  ; GCN-NEXT:    v_fma_f32 v57, s4, v57, -v134  | 
468 | 467 |   ; GCN-NEXT:    v_fma_f32 v48, s4, v48, -v134  | 
469 |  | -  ; GCN-NEXT:    v_fma_f32 v96, s4, v58, -v134  | 
470 |  | -  ; GCN-NEXT:    v_mul_f32_e32 v57, 0x3fb8aa3b, v57  | 
471 | 468 |   ; GCN-NEXT:    v_mul_f32_e32 v48, 0x3fb8aa3b, v48  | 
472 | 469 |   ; GCN-NEXT:    v_fma_f32 v64, s4, v49, -v134  | 
473 |  | -  ; GCN-NEXT:    v_exp_f32_e32 v163, v57  | 
474 |  | -  ; GCN-NEXT:    v_mul_f32_e32 v57, 0x3fb8aa3b, v96  | 
475 | 470 |   ; GCN-NEXT:    v_fma_f32 v66, s4, v50, -v134  | 
476 |  | -  ; GCN-NEXT:    v_exp_f32_e32 v164, v57  | 
477 | 471 |   ; GCN-NEXT:    v_exp_f32_e32 v49, v48  | 
478 | 472 |   ; GCN-NEXT:    v_mul_f32_e32 v48, 0x3fb8aa3b, v64  | 
479 | 473 |   ; GCN-NEXT:    v_fma_f32 v67, s4, v51, -v134  | 
 | 
495 | 489 |   ; GCN-NEXT:    ds_read_b128 v[140:143], v139  | 
496 | 490 |   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)  | 
497 | 491 |   ; GCN-NEXT:    buffer_inv sc0 sc1  | 
 | 492 | +  ; GCN-NEXT:    ds_read_b128 v[144:147], v139 offset:576  | 
 | 493 | +  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)  | 
 | 494 | +  ; GCN-NEXT:    buffer_inv sc0 sc1  | 
498 | 495 |   ; GCN-NEXT:    v_exp_f32_e32 v54, v48  | 
499 | 496 |   ; GCN-NEXT:    v_mul_f32_e32 v48, 0x3fb8aa3b, v70  | 
500 | 497 |   ; GCN-NEXT:    v_exp_f32_e32 v55, v48  | 
501 | 498 |   ; GCN-NEXT:    v_mul_f32_e32 v48, 0x3fb8aa3b, v71  | 
502 |  | -  ; GCN-NEXT:    ds_read_b128 v[144:147], v139 offset:576  | 
503 |  | -  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)  | 
504 |  | -  ; GCN-NEXT:    buffer_inv sc0 sc1  | 
505 | 499 |   ; GCN-NEXT:    v_fma_f32 v66, s4, v56, -v134  | 
506 | 500 |   ; GCN-NEXT:    v_exp_f32_e32 v56, v48  | 
507 | 501 |   ; GCN-NEXT:    v_sub_f32_e32 v48, v65, v134  | 
 | 502 | +  ; GCN-NEXT:    ds_read_b128 v[148:151], v139 offset:1152  | 
 | 503 | +  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)  | 
 | 504 | +  ; GCN-NEXT:    buffer_inv sc0 sc1  | 
508 | 505 |   ; GCN-NEXT:    v_cvt_f16_f32_e32 v64, v49  | 
509 | 506 |   ; GCN-NEXT:    v_cvt_f16_f32_e32 v67, v50  | 
510 | 507 |   ; GCN-NEXT:    v_cvt_f16_f32_e32 v68, v51  | 
 | 508 | +  ; GCN-NEXT:    v_fma_f32 v96, s4, v58, -v134  | 
511 | 509 |   ; GCN-NEXT:    v_cvt_f16_f32_e32 v58, v52  | 
512 | 510 |   ; GCN-NEXT:    v_mul_f32_e32 v48, 0x3fb8aa3b, v48  | 
513 |  | -  ; GCN-NEXT:    ds_read_b128 v[148:151], v139 offset:1152  | 
514 |  | -  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)  | 
515 |  | -  ; GCN-NEXT:    buffer_inv sc0 sc1  | 
516 | 511 |   ; GCN-NEXT:    v_exp_f32_e32 v48, v48  | 
 | 512 | +  ; GCN-NEXT:    v_fma_f32 v57, s4, v57, -v134  | 
517 | 513 |   ; GCN-NEXT:    v_pack_b32_f16 v161, v68, v58  | 
518 | 514 |   ; GCN-NEXT:    v_pack_b32_f16 v160, v64, v67  | 
519 | 515 |   ; GCN-NEXT:    v_mul_f32_e32 v58, 0x3fb8aa3b, v66  | 
520 | 516 |   ; GCN-NEXT:    ; implicit-def: $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79  | 
521 | 517 |   ; GCN-NEXT:    ds_read_b128 v[152:155], v139 offset:1728  | 
522 | 518 |   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)  | 
523 | 519 |   ; GCN-NEXT:    buffer_inv sc0 sc1  | 
524 |  | -  ; GCN-NEXT:    v_fma_f32 v162, s4, v61, -v134  | 
525 |  | -  ; GCN-NEXT:    v_cvt_f16_f32_e32 v61, v55  | 
526 |  | -  ; GCN-NEXT:    v_cvt_f16_f32_e32 v57, v56  | 
 | 520 | +  ; GCN-NEXT:    ; implicit-def: $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95  | 
527 | 521 |   ; GCN-NEXT:    v_pk_mul_f32 v[64:65], v[64:65], v[48:49] op_sel_hi:[1,0]  | 
528 | 522 |   ; GCN-NEXT:    v_pk_mul_f32 v[66:67], v[66:67], v[48:49] op_sel_hi:[1,0]  | 
529 | 523 |   ; GCN-NEXT:    v_pk_mul_f32 v[68:69], v[68:69], v[48:49] op_sel_hi:[1,0]  | 
 | 
532 | 526 |   ; GCN-NEXT:    v_pk_mul_f32 v[74:75], v[74:75], v[48:49] op_sel_hi:[1,0]  | 
533 | 527 |   ; GCN-NEXT:    v_pk_mul_f32 v[76:77], v[76:77], v[48:49] op_sel_hi:[1,0]  | 
534 | 528 |   ; GCN-NEXT:    v_pk_mul_f32 v[78:79], v[78:79], v[48:49] op_sel_hi:[1,0]  | 
535 |  | -  ; GCN-NEXT:    ; implicit-def: $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95  | 
536 |  | -  ; GCN-NEXT:    v_fma_f32 v59, s4, v59, -v134  | 
 | 529 | +  ; GCN-NEXT:    v_mul_f32_e32 v57, 0x3fb8aa3b, v57  | 
537 | 530 |   ; GCN-NEXT:    v_pk_mul_f32 v[80:81], v[80:81], v[48:49] op_sel_hi:[1,0]  | 
538 |  | -  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[140:141], v[160:161], v[64:79]  | 
539 |  | -  ; GCN-NEXT:    v_mul_f32_e64 v82, v82, v48  | 
540 |  | -  ; GCN-NEXT:    v_mul_f32_e64 v83, v83, v48  | 
541 |  | -  ; GCN-NEXT:    v_mul_f32_e64 v84, v84, v48  | 
542 |  | -  ; GCN-NEXT:    v_mul_f32_e64 v85, v85, v48  | 
543 |  | -  ; GCN-NEXT:    v_mul_f32_e64 v86, v86, v48  | 
544 |  | -  ; GCN-NEXT:    v_mul_f32_e64 v87, v87, v48  | 
 | 531 | +  ; GCN-NEXT:    v_pk_mul_f32 v[82:83], v[82:83], v[48:49] op_sel_hi:[1,0]  | 
 | 532 | +  ; GCN-NEXT:    v_pk_mul_f32 v[84:85], v[84:85], v[48:49] op_sel_hi:[1,0]  | 
 | 533 | +  ; GCN-NEXT:    v_pk_mul_f32 v[86:87], v[86:87], v[48:49] op_sel_hi:[1,0]  | 
545 | 534 |   ; GCN-NEXT:    v_pk_mul_f32 v[88:89], v[88:89], v[48:49] op_sel_hi:[1,0]  | 
546 | 535 |   ; GCN-NEXT:    v_pk_mul_f32 v[90:91], v[90:91], v[48:49] op_sel_hi:[1,0]  | 
547 | 536 |   ; GCN-NEXT:    v_pk_mul_f32 v[92:93], v[92:93], v[48:49] op_sel_hi:[1,0]  | 
548 | 537 |   ; GCN-NEXT:    v_pk_mul_f32 v[94:95], v[94:95], v[48:49] op_sel_hi:[1,0]  | 
549 |  | -  ; GCN-NEXT:    ; implicit-def: $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111  | 
 | 538 | +  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[140:141], v[160:161], v[64:79]  | 
550 | 539 |   ; GCN-NEXT:    v_exp_f32_e32 v58, v58  | 
551 |  | -  ; GCN-NEXT:    v_pk_mul_f32 v[96:97], v[96:97], v[48:49] op_sel_hi:[1,0]  | 
 | 540 | +  ; GCN-NEXT:    v_fma_f32 v162, s4, v61, -v134  | 
 | 541 | +  ; GCN-NEXT:    v_cvt_f16_f32_e32 v61, v55  | 
 | 542 | +  ; GCN-NEXT:    v_cvt_f16_f32_e32 v140, v53  | 
 | 543 | +  ; GCN-NEXT:    v_cvt_f16_f32_e32 v141, v54  | 
 | 544 | +  ; GCN-NEXT:    v_fma_f32 v59, s4, v59, -v134  | 
 | 545 | +  ; GCN-NEXT:    v_fma_f32 v60, s4, v60, -v134  | 
552 | 546 |   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[144:145], v[160:161], v[80:95]  | 
553 |  | -  ; GCN-NEXT:    v_mul_f32_e64 v98, v98, v48  | 
554 |  | -  ; GCN-NEXT:    v_mul_f32_e64 v99, v99, v48  | 
555 |  | -  ; GCN-NEXT:    v_mul_f32_e64 v100, v100, v48  | 
556 |  | -  ; GCN-NEXT:    v_mul_f32_e64 v101, v101, v48  | 
557 |  | -  ; GCN-NEXT:    v_mul_f32_e64 v102, v102, v48  | 
558 |  | -  ; GCN-NEXT:    v_mul_f32_e64 v103, v103, v48  | 
 | 547 | +  ; GCN-NEXT:    v_exp_f32_e32 v163, v57  | 
 | 548 | +  ; GCN-NEXT:    v_mul_f32_e32 v57, 0x3fb8aa3b, v96  | 
 | 549 | +  ; GCN-NEXT:    ; implicit-def: $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111  | 
 | 550 | +  ; GCN-NEXT:    v_pk_mul_f32 v[112:113], v[112:113], v[48:49] op_sel_hi:[1,0]  | 
 | 551 | +  ; GCN-NEXT:    v_pk_mul_f32 v[96:97], v[96:97], v[48:49] op_sel_hi:[1,0]  | 
 | 552 | +  ; GCN-NEXT:    v_pk_mul_f32 v[98:99], v[98:99], v[48:49] op_sel_hi:[1,0]  | 
 | 553 | +  ; GCN-NEXT:    v_pk_mul_f32 v[100:101], v[100:101], v[48:49] op_sel_hi:[1,0]  | 
 | 554 | +  ; GCN-NEXT:    v_pk_mul_f32 v[102:103], v[102:103], v[48:49] op_sel_hi:[1,0]  | 
559 | 555 |   ; GCN-NEXT:    v_pk_mul_f32 v[104:105], v[104:105], v[48:49] op_sel_hi:[1,0]  | 
560 | 556 |   ; GCN-NEXT:    v_pk_mul_f32 v[106:107], v[106:107], v[48:49] op_sel_hi:[1,0]  | 
561 | 557 |   ; GCN-NEXT:    v_pk_mul_f32 v[108:109], v[108:109], v[48:49] op_sel_hi:[1,0]  | 
562 | 558 |   ; GCN-NEXT:    v_pk_mul_f32 v[110:111], v[110:111], v[48:49] op_sel_hi:[1,0]  | 
563 |  | -  ; GCN-NEXT:    v_pack_b32_f16 v145, v61, v57  | 
564 |  | -  ; GCN-NEXT:    v_mul_f32_e32 v57, 0x3fb8aa3b, v59  | 
565 |  | -  ; GCN-NEXT:    v_cvt_f16_f32_e32 v140, v53  | 
566 |  | -  ; GCN-NEXT:    v_cvt_f16_f32_e32 v141, v54  | 
567 |  | -  ; GCN-NEXT:    v_exp_f32_e32 v59, v57  | 
568 |  | -  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[148:149], v[160:161], v[96:111]  | 
569 |  | -  ; GCN-NEXT:    v_fma_f32 v60, s4, v60, -v134  | 
570 |  | -  ; GCN-NEXT:    v_mul_f32_e64 v112, v112, v48  | 
571 |  | -  ; GCN-NEXT:    v_mul_f32_e64 v113, v113, v48  | 
572 |  | -  ; GCN-NEXT:    v_mul_f32_e64 v114, v114, v48  | 
573 |  | -  ; GCN-NEXT:    v_mul_f32_e64 v115, v115, v48  | 
 | 559 | +  ; GCN-NEXT:    v_pk_mul_f32 v[114:115], v[114:115], v[48:49] op_sel_hi:[1,0]  | 
574 | 560 |   ; GCN-NEXT:    v_pk_mul_f32 v[116:117], v[116:117], v[48:49] op_sel_hi:[1,0]  | 
 | 561 | +  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[148:149], v[160:161], v[96:111]  | 
 | 562 | +  ; GCN-NEXT:    v_exp_f32_e32 v164, v57  | 
 | 563 | +  ; GCN-NEXT:    v_cvt_f16_f32_e32 v57, v56  | 
575 | 564 |   ; GCN-NEXT:    v_pk_mul_f32 v[118:119], v[118:119], v[48:49] op_sel_hi:[1,0]  | 
576 | 565 |   ; GCN-NEXT:    v_pk_mul_f32 v[120:121], v[120:121], v[48:49] op_sel_hi:[1,0]  | 
577 | 566 |   ; GCN-NEXT:    v_pk_mul_f32 v[122:123], v[122:123], v[48:49] op_sel_hi:[1,0]  | 
578 | 567 |   ; GCN-NEXT:    v_pk_mul_f32 v[124:125], v[124:125], v[48:49] op_sel_hi:[1,0]  | 
579 | 568 |   ; GCN-NEXT:    v_pk_mul_f32 v[126:127], v[126:127], v[48:49] op_sel_hi:[1,0]  | 
 | 569 | +  ; GCN-NEXT:    v_pack_b32_f16 v145, v61, v57  | 
 | 570 | +  ; GCN-NEXT:    v_mul_f32_e32 v57, 0x3fb8aa3b, v59  | 
580 | 571 |   ; GCN-NEXT:    v_fma_f32 v148, s4, v62, -v134  | 
581 |  | -  ; GCN-NEXT:    v_pack_b32_f16 v144, v140, v141  | 
582 | 572 |   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[152:153], v[160:161], v[112:127]  | 
583 | 573 |   ; GCN-NEXT:    v_fma_f32 v152, s4, v63, -v134  | 
 | 574 | +  ; GCN-NEXT:    v_pack_b32_f16 v144, v140, v141  | 
 | 575 | +  ; GCN-NEXT:    v_exp_f32_e32 v59, v57  | 
584 | 576 |   ; GCN-NEXT:    v_mul_f32_e32 v149, 0x3fb8aa3b, v60  | 
585 | 577 |   ; GCN-NEXT:    ; implicit-def: $vgpr57  | 
586 | 578 |   ; GCN-NEXT:    ds_read_b128 v[60:63], v57  | 
587 | 579 |   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)  | 
588 | 580 |   ; GCN-NEXT:    buffer_inv sc0 sc1  | 
589 |  | -  ; GCN-NEXT:    v_exp_f32_e32 v160, v149  | 
590 | 581 |   ; GCN-NEXT:    v_fma_f32 v161, s4, v33, -v134  | 
591 | 582 |   ; GCN-NEXT:    v_mul_f32_e32 v33, 0x3fb8aa3b, v148  | 
592 |  | -  ; GCN-NEXT:    v_cvt_f16_f32_e32 v153, v58  | 
593 | 583 |   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[142:143], v[144:145], v[64:79]  | 
 | 584 | +  ; GCN-NEXT:    v_exp_f32_e32 v160, v149  | 
 | 585 | +  ; GCN-NEXT:    v_cvt_f16_f32_e32 v153, v58  | 
594 | 586 |   ; GCN-NEXT:    v_fma_f32 v32, s4, v32, -v134  | 
595 | 587 |   ; GCN-NEXT:    ds_read_b128 v[140:143], v57 offset:576  | 
596 | 588 |   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)  | 
597 | 589 |   ; GCN-NEXT:    buffer_inv sc0 sc1  | 
598 | 590 |   ; GCN-NEXT:    v_fma_f32 v40, s4, v40, -v134  | 
599 | 591 |   ; GCN-NEXT:    v_fma_f32 v44, s4, v44, -v134  | 
600 | 592 |   ; GCN-NEXT:    v_fma_f32 v16, s4, v16, -v134  | 
601 |  | -  ; GCN-NEXT:    v_fma_f32 v166, s4, v20, -v134  | 
602 |  | -  ; GCN-NEXT:    v_fma_f32 v24, s4, v24, -v134  | 
603 | 593 |   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[146:147], v[144:145], v[80:95]  | 
604 | 594 |   ; GCN-NEXT:    v_mul_f32_e32 v146, 0x3fb8aa3b, v162  | 
605 | 595 |   ; GCN-NEXT:    v_cvt_f16_f32_e32 v147, v163  | 
606 | 596 |   ; GCN-NEXT:    v_exp_f32_e32 v162, v146  | 
607 | 597 |   ; GCN-NEXT:    v_cvt_f16_f32_e32 v146, v164  | 
608 |  | -  ; GCN-NEXT:    v_fma_f32 v28, s4, v28, -v134  | 
 | 598 | +  ; GCN-NEXT:    v_fma_f32 v166, s4, v20, -v134  | 
609 | 599 |   ; GCN-NEXT:    v_pack_b32_f16 v148, v153, v147  | 
610 |  | -  ; GCN-NEXT:    v_fma_f32 v0, s4, v0, -v134  | 
 | 600 | +  ; GCN-NEXT:    v_fma_f32 v24, s4, v24, -v134  | 
611 | 601 |   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[150:151], v[144:145], v[96:111]  | 
612 | 602 |   ; GCN-NEXT:    v_exp_f32_e32 v151, v33  | 
613 | 603 |   ; GCN-NEXT:    v_cvt_f16_f32_e32 v33, v59  | 
614 | 604 |   ; GCN-NEXT:    v_fma_f32 v150, s4, v34, -v134  | 
615 |  | -  ; GCN-NEXT:    v_fma_f32 v8, s4, v8, -v134  | 
616 |  | -  ; GCN-NEXT:    v_fma_f32 v12, s4, v12, -v134  | 
 | 605 | +  ; GCN-NEXT:    v_fma_f32 v28, s4, v28, -v134  | 
 | 606 | +  ; GCN-NEXT:    v_fma_f32 v0, s4, v0, -v134  | 
617 | 607 |   ; GCN-NEXT:    v_pack_b32_f16 v149, v146, v33  | 
618 | 608 |   ; GCN-NEXT:    v_mul_f32_e32 v33, 0x3fb8aa3b, v152  | 
619 | 609 |   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[154:155], v[144:145], v[112:127]  | 
 | 
622 | 612 |   ; GCN-NEXT:    v_fma_f32 v155, s4, v36, -v134  | 
623 | 613 |   ; GCN-NEXT:    v_perm_b32 v36, v158, v156, s5  | 
624 | 614 |   ; GCN-NEXT:    v_cvt_f16_f32_e32 v154, v160  | 
 | 615 | +  ; GCN-NEXT:    v_fma_f32 v8, s4, v8, -v134  | 
 | 616 | +  ; GCN-NEXT:    v_fma_f32 v12, s4, v12, -v134  | 
625 | 617 |   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[60:61], v[148:149], v[64:79]  | 
626 | 618 |   ; GCN-NEXT:    v_mul_f32_e32 v60, 0x3fb8aa3b, v32  | 
627 | 619 |   ; GCN-NEXT:    ds_read_b128 v[32:35], v57 offset:1152  | 
 | 
795 | 787 |   ; GCN-NEXT:    v_cvt_f16_f32_e32 v45, v158  | 
796 | 788 |   ; GCN-NEXT:    v_perm_b32 v21, v148, v144, s5  | 
797 | 789 |   ; GCN-NEXT:    v_perm_b32 v37, v148, v144, s8  | 
798 |  | -  ; GCN-NEXT:    v_cvt_f16_f32_e32 v44, v63  | 
799 | 790 |   ; GCN-NEXT:    ;;#ASMSTART  | 
800 | 791 |   ; GCN-NEXT:    s_waitcnt vmcnt(8)  | 
801 | 792 |   ; GCN-NEXT:    ;;#ASMEND  | 
802 | 793 |   ; GCN-NEXT:    buffer_wbl2 sc0 sc1  | 
803 | 794 |   ; GCN-NEXT:    ds_write_b64 v135, v[20:21]  | 
 | 795 | +  ; GCN-NEXT:    buffer_wbl2 sc0 sc1  | 
 | 796 | +  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)  | 
 | 797 | +  ; GCN-NEXT:    ds_write_b64 v136, v[36:37]  | 
804 | 798 |   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[16:17], v[40:41], v[96:111]  | 
805 | 799 |   ; GCN-NEXT:    v_perm_b32 v16, v141, v131, s5  | 
806 | 800 |   ; GCN-NEXT:    v_fma_f32 v131, s4, v22, -v134  | 
 | 
810 | 804 |   ; GCN-NEXT:    v_perm_b32 v17, v149, v145, s5  | 
811 | 805 |   ; GCN-NEXT:    buffer_wbl2 sc0 sc1  | 
812 | 806 |   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)  | 
813 |  | -  ; GCN-NEXT:    ds_write_b64 v136, v[36:37]  | 
 | 807 | +  ; GCN-NEXT:    ds_write_b64 v137, v[16:17]  | 
814 | 808 |   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[32:33], v[40:41], v[112:127]  | 
815 | 809 |   ; GCN-NEXT:    v_pack_b32_f16 v33, v45, v22  | 
816 | 810 |   ; GCN-NEXT:    v_mul_f32_e32 v22, 0x3fb8aa3b, v60  | 
817 | 811 |   ; GCN-NEXT:    v_exp_f32_e32 v144, v22  | 
818 |  | -  ; GCN-NEXT:    buffer_wbl2 sc0 sc1  | 
819 |  | -  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)  | 
820 |  | -  ; GCN-NEXT:    ds_write_b64 v137, v[16:17]  | 
821 | 812 |   ; GCN-NEXT:    ; implicit-def: $vgpr17  | 
822 | 813 |   ; GCN-NEXT:    ; implicit-def: $vgpr22  | 
 | 814 | +  ; GCN-NEXT:    v_cvt_f16_f32_e32 v44, v63  | 
823 | 815 |   ; GCN-NEXT:    buffer_wbl2 sc0 sc1  | 
824 | 816 |   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)  | 
825 | 817 |   ; GCN-NEXT:    ds_write_b64 v138, v[42:43]  | 
826 | 818 |   ; GCN-NEXT:    v_add_u32_e32 v22, v132, v22  | 
827 | 819 |   ; GCN-NEXT:    v_add_u32_e32 v17, v132, v17  | 
828 |  | -  ; GCN-NEXT:    ; implicit-def: $vgpr20  | 
829 |  | -  ; GCN-NEXT:    ; implicit-def: $vgpr21  | 
830 | 820 |   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)  | 
831 | 821 |   ; GCN-NEXT:    buffer_load_dwordx2 v[40:41], v22, s[0:3], 0 offen sc0 sc1  | 
832 | 822 |   ; GCN-NEXT:    s_waitcnt vmcnt(0)  | 
833 | 823 |   ; GCN-NEXT:    buffer_inv sc0 sc1  | 
834 | 824 |   ; GCN-NEXT:    buffer_load_dwordx2 v[42:43], v17, s[0:3], 0 offen sc0 sc1  | 
835 | 825 |   ; GCN-NEXT:    s_waitcnt vmcnt(0)  | 
836 | 826 |   ; GCN-NEXT:    buffer_inv sc0 sc1  | 
 | 827 | +  ; GCN-NEXT:    ; implicit-def: $vgpr20  | 
 | 828 | +  ; GCN-NEXT:    ; implicit-def: $vgpr21  | 
 | 829 | +  ; GCN-NEXT:    v_pack_b32_f16 v32, v61, v44  | 
837 | 830 |   ; GCN-NEXT:    v_add_u32_e32 v20, v132, v20  | 
838 | 831 |   ; GCN-NEXT:    v_add_u32_e32 v21, v132, v21  | 
839 |  | -  ; GCN-NEXT:    v_pack_b32_f16 v32, v61, v44  | 
840 | 832 |   ; GCN-NEXT:    buffer_load_dwordx2 v[44:45], v20, s[0:3], 0 offen sc0 sc1  | 
841 | 833 |   ; GCN-NEXT:    s_waitcnt vmcnt(0)  | 
842 | 834 |   ; GCN-NEXT:    buffer_inv sc0 sc1  | 
 | 
967 | 959 |   ; GCN-NEXT:    buffer_wbl2 sc0 sc1  | 
968 | 960 |   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)  | 
969 | 961 |   ; GCN-NEXT:    ds_write_b64 v136, v[20:21]  | 
 | 962 | +  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[16:17], v[24:25], v[112:127]  | 
 | 963 | +  ; GCN-NEXT:    v_pack_b32_f16 v17, v40, v6  | 
 | 964 | +  ; GCN-NEXT:    v_mul_f32_e32 v6, 0x3fb8aa3b, v32  | 
970 | 965 |   ; GCN-NEXT:    buffer_wbl2 sc0 sc1  | 
971 | 966 |   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)  | 
972 | 967 |   ; GCN-NEXT:    ds_write_b64 v137, v[0:1]  | 
973 | 968 |   ; GCN-NEXT:    buffer_wbl2 sc0 sc1  | 
974 | 969 |   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)  | 
975 | 970 |   ; GCN-NEXT:    ds_write_b64 v138, v[26:27]  | 
976 |  | -  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[16:17], v[24:25], v[112:127]  | 
977 |  | -  ; GCN-NEXT:    v_pack_b32_f16 v17, v40, v6  | 
978 |  | -  ; GCN-NEXT:    v_mul_f32_e32 v6, 0x3fb8aa3b, v32  | 
 | 971 | +  ; GCN-NEXT:    v_exp_f32_e32 v25, v6  | 
979 | 972 |   ; GCN-NEXT:    ;;#ASMSTART  | 
980 | 973 |   ; GCN-NEXT:    s_waitcnt vmcnt(8)  | 
981 | 974 |   ; GCN-NEXT:    ;;#ASMEND  | 
982 | 975 |   ; GCN-NEXT:    v_pack_b32_f16 v16, v37, v28  | 
983 | 976 |   ; GCN-NEXT:    v_fma_f32 v24, s4, v7, -v134  | 
984 |  | -  ; GCN-NEXT:    v_exp_f32_e32 v25, v6  | 
 | 977 | +  ; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v149  | 
985 | 978 |   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)  | 
986 | 979 |   ; GCN-NEXT:    ds_read_b128 v[4:7], v139  | 
987 | 980 |   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)  | 
988 | 981 |   ; GCN-NEXT:    buffer_inv sc0 sc1  | 
989 | 982 |   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[22:23], v[16:17], v[64:79]  | 
990 |  | -  ; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v149  | 
991 | 983 |   ; GCN-NEXT:    v_exp_f32_e32 v26, v0  | 
992 | 984 |   ; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v29  | 
993 | 985 |   ; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v150  | 
 | 
1006 | 998 |   ; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v25  | 
1007 | 999 |   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[18:19], v[16:17], v[112:127]  | 
1008 | 1000 |   ; GCN-NEXT:    v_pack_b32_f16 v17, v2, v0  | 
1009 |  | -  ; GCN-NEXT:    v_pack_b32_f16 v16, v1, v27  | 
1010 | 1001 |   ; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v24  | 
1011 |  | -  ; GCN-NEXT:    v_fma_f32 v18, s4, v11, -v134  | 
 | 1002 | +  ; GCN-NEXT:    v_pack_b32_f16 v16, v1, v27  | 
1012 | 1003 |   ; GCN-NEXT:    v_exp_f32_e32 v19, v0  | 
1013 | 1004 |   ; GCN-NEXT:    ds_read_b128 v[0:3], v139 offset:1152  | 
1014 | 1005 |   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)  | 
1015 | 1006 |   ; GCN-NEXT:    buffer_inv sc0 sc1  | 
 | 1007 | +  ; GCN-NEXT:    v_fma_f32 v18, s4, v11, -v134  | 
1016 | 1008 |   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[4:5], v[16:17], v[64:79]  | 
1017 | 1009 |   ; GCN-NEXT:    v_mul_f32_e32 v4, 0x3fb8aa3b, v8  | 
1018 | 1010 |   ; GCN-NEXT:    ds_read_b128 v[8:11], v139 offset:1728  | 
 | 
1021 | 1013 |   ; GCN-NEXT:    v_exp_f32_e32 v24, v4  | 
1022 | 1014 |   ; GCN-NEXT:    v_mul_f32_e32 v4, 0x3fb8aa3b, v28  | 
1023 | 1015 |   ; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v26  | 
1024 |  | -  ; GCN-NEXT:    v_exp_f32_e32 v27, v4  | 
1025 |  | -  ; GCN-NEXT:    v_mul_f32_e32 v4, 0x3fb8aa3b, v18  | 
 | 1016 | +  ; GCN-NEXT:    v_fma_f32 v28, s4, v14, -v134  | 
1026 | 1017 |   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[20:21], v[16:17], v[80:95]  | 
 | 1018 | +  ; GCN-NEXT:    v_exp_f32_e32 v27, v4  | 
1027 | 1019 |   ; GCN-NEXT:    v_cvt_f16_f32_e32 v20, v29  | 
 | 1020 | +  ; GCN-NEXT:    v_mul_f32_e32 v4, 0x3fb8aa3b, v18  | 
1028 | 1021 |   ; GCN-NEXT:    v_fma_f32 v21, s4, v13, -v134  | 
1029 |  | -  ; GCN-NEXT:    v_fma_f32 v28, s4, v14, -v134  | 
1030 | 1022 |   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[0:1], v[16:17], v[96:111]  | 
1031 | 1023 |   ; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v30  | 
1032 |  | -  ; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v31  | 
1033 | 1024 |   ; GCN-NEXT:    v_exp_f32_e32 v30, v0  | 
 | 1025 | +  ; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v31  | 
1034 | 1026 |   ; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v19  | 
1035 | 1027 |   ; GCN-NEXT:    v_pack_b32_f16 v1, v1, v0  | 
1036 | 1028 |   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[8:9], v[16:17], v[112:127]  | 
1037 | 1029 |   ; GCN-NEXT:    v_exp_f32_e32 v16, v4  | 
1038 | 1030 |   ; GCN-NEXT:    v_pack_b32_f16 v0, v5, v20  | 
1039 | 1031 |   ; GCN-NEXT:    v_mul_f32_e32 v9, 0x3fb8aa3b, v12  | 
1040 |  | -  ; GCN-NEXT:    v_exp_f32_e32 v18, v9  | 
1041 |  | -  ; GCN-NEXT:    v_mul_f32_e32 v9, 0x3fb8aa3b, v21  | 
1042 |  | -  ; GCN-NEXT:    v_exp_f32_e32 v21, v9  | 
1043 | 1032 |   ; GCN-NEXT:    v_fma_f32 v8, s4, v15, -v134  | 
 | 1033 | +  ; GCN-NEXT:    v_cvt_f16_f32_e32 v17, v24  | 
 | 1034 | +  ; GCN-NEXT:    v_cvt_f16_f32_e32 v20, v27  | 
1044 | 1035 |   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[6:7], v[0:1], v[64:79]  | 
 | 1036 | +  ; GCN-NEXT:    v_exp_f32_e32 v18, v9  | 
 | 1037 | +  ; GCN-NEXT:    v_mul_f32_e32 v9, 0x3fb8aa3b, v21  | 
1045 | 1038 |   ; GCN-NEXT:    ds_read_b128 v[4:7], v57  | 
1046 | 1039 |   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)  | 
1047 | 1040 |   ; GCN-NEXT:    buffer_inv sc0 sc1  | 
1048 | 1041 |   ; GCN-NEXT:    ds_read_b128 v[12:15], v57 offset:576  | 
1049 | 1042 |   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)  | 
1050 | 1043 |   ; GCN-NEXT:    buffer_inv sc0 sc1  | 
1051 |  | -  ; GCN-NEXT:    v_cvt_f16_f32_e32 v17, v24  | 
1052 |  | -  ; GCN-NEXT:    v_cvt_f16_f32_e32 v20, v27  | 
1053 | 1044 |   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[22:23], v[0:1], v[80:95]  | 
1054 |  | -  ; GCN-NEXT:    v_cvt_f16_f32_e32 v22, v21  | 
 | 1045 | +  ; GCN-NEXT:    v_exp_f32_e32 v21, v9  | 
1055 | 1046 |   ; GCN-NEXT:    v_cvt_f16_f32_e32 v23, v18  | 
 | 1047 | +  ; GCN-NEXT:    v_cvt_f16_f32_e32 v22, v21  | 
1056 | 1048 |   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[2:3], v[0:1], v[96:111]  | 
1057 |  | -  ; GCN-NEXT:    v_cvt_f16_f32_e32 v3, v30  | 
1058 | 1049 |   ; GCN-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v28  | 
 | 1050 | +  ; GCN-NEXT:    v_cvt_f16_f32_e32 v3, v30  | 
1059 | 1051 |   ; GCN-NEXT:    v_exp_f32_e32 v2, v2  | 
1060 | 1052 |   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[10:11], v[0:1], v[112:127]  | 
1061 | 1053 |   ; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v16  | 
 | 
1110 | 1102 |   ; GCN-NEXT:    v_add_f32_e32 v3, v36, v3  | 
1111 | 1103 |   ; GCN-NEXT:    v_add_f32_e32 v3, v39, v3  | 
1112 | 1104 |   ; GCN-NEXT:    v_add_f32_e32 v3, v148, v3  | 
1113 |  | -  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[12:13], v[8:9], v[80:95]  | 
1114 | 1105 |   ; GCN-NEXT:    v_add_f32_e32 v3, v34, v3  | 
1115 | 1106 |   ; GCN-NEXT:    v_add_f32_e32 v3, v150, v3  | 
1116 |  | -  ; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v10  | 
1117 |  | -  ; GCN-NEXT:    v_cvt_f16_f32_e32 v11, v2  | 
1118 | 1107 |   ; GCN-NEXT:    v_add_f32_e32 v3, v38, v3  | 
1119 | 1108 |   ; GCN-NEXT:    v_add_f32_e32 v3, v42, v3  | 
1120 | 1109 |   ; GCN-NEXT:    v_add_f32_e32 v3, v25, v3  | 
1121 | 1110 |   ; GCN-NEXT:    v_add_f32_e32 v3, v26, v3  | 
1122 |  | -  ; GCN-NEXT:    v_pack_b32_f16 v1, v11, v1  | 
1123 |  | -  ; GCN-NEXT:    v_pack_b32_f16 v0, v23, v22  | 
1124 | 1111 |   ; GCN-NEXT:    v_add_f32_e32 v3, v29, v3  | 
 | 1112 | +  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[4:5], v[8:9], v[64:79]  | 
1125 | 1113 |   ; GCN-NEXT:    v_add_f32_e32 v3, v31, v3  | 
1126 |  | -  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[14:15], v[0:1], v[80:95]  | 
1127 | 1114 |   ; GCN-NEXT:    v_add_f32_e32 v3, v19, v3  | 
1128 | 1115 |   ; GCN-NEXT:    v_add_f32_e32 v3, v24, v3  | 
 | 1116 | +  ; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v10  | 
 | 1117 | +  ; GCN-NEXT:    v_cvt_f16_f32_e32 v11, v2  | 
1129 | 1118 |   ; GCN-NEXT:    v_add_f32_e32 v3, v27, v3  | 
1130 | 1119 |   ; GCN-NEXT:    v_add_f32_e32 v3, v30, v3  | 
 | 1120 | +  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[12:13], v[8:9], v[80:95]  | 
1131 | 1121 |   ; GCN-NEXT:    v_add_f32_e32 v3, v16, v3  | 
1132 | 1122 |   ; GCN-NEXT:    v_add_f32_e32 v3, v18, v3  | 
 | 1123 | +  ; GCN-NEXT:    v_pack_b32_f16 v1, v11, v1  | 
 | 1124 | +  ; GCN-NEXT:    v_pack_b32_f16 v0, v23, v22  | 
1133 | 1125 |   ; GCN-NEXT:    v_add_f32_e32 v3, v21, v3  | 
1134 |  | -  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[4:5], v[8:9], v[64:79]  | 
 | 1126 | +  ; GCN-NEXT:    s_nop 0  | 
1135 | 1127 |   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[6:7], v[0:1], v[64:79]  | 
 | 1128 | +  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[14:15], v[0:1], v[80:95]  | 
1136 | 1129 |   ; GCN-NEXT:    v_add_f32_e32 v0, v2, v3  | 
1137 | 1130 |   ; GCN-NEXT:    v_add_f32_e32 v4, v10, v0  | 
1138 | 1131 |   ; GCN-NEXT:    ds_bpermute_b32 v5, v133, v4  | 
 | 
0 commit comments