|
6 | 6 | ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16 %s
|
7 | 7 | ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12-TRUE16 %s
|
8 | 8 | ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12-FAKE16 %s
|
| 9 | +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GFX1250,GFX1250-TRUE16 %s |
| 10 | +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GFX1250,GFX1250-FAKE16 %s |
9 | 11 |
|
10 | 12 | define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
|
11 | 13 | ; SI-LABEL: test_fmax3_olt_0_f32:
|
@@ -157,6 +159,36 @@ define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrs
|
157 | 159 | ; GFX12-NEXT: v_max3_num_f32 v0, v0, v1, v2
|
158 | 160 | ; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null
|
159 | 161 | ; GFX12-NEXT: s_endpgm
|
| 162 | +; |
| 163 | +; GFX1250-LABEL: test_fmax3_olt_0_f32: |
| 164 | +; GFX1250: ; %bb.0: |
| 165 | +; GFX1250-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 |
| 166 | +; GFX1250-NEXT: s_mov_b32 s10, -1 |
| 167 | +; GFX1250-NEXT: s_mov_b32 s11, 0x31016000 |
| 168 | +; GFX1250-NEXT: s_mov_b32 s14, s10 |
| 169 | +; GFX1250-NEXT: s_mov_b32 s15, s11 |
| 170 | +; GFX1250-NEXT: s_mov_b32 s18, s10 |
| 171 | +; GFX1250-NEXT: s_mov_b32 s19, s11 |
| 172 | +; GFX1250-NEXT: s_mov_b32 s22, s10 |
| 173 | +; GFX1250-NEXT: s_mov_b32 s23, s11 |
| 174 | +; GFX1250-NEXT: s_wait_kmcnt 0x0 |
| 175 | +; GFX1250-NEXT: s_mov_b32 s12, s2 |
| 176 | +; GFX1250-NEXT: s_mov_b32 s13, s3 |
| 177 | +; GFX1250-NEXT: s_mov_b32 s16, s4 |
| 178 | +; GFX1250-NEXT: s_mov_b32 s17, s5 |
| 179 | +; GFX1250-NEXT: s_mov_b32 s20, s6 |
| 180 | +; GFX1250-NEXT: s_mov_b32 s21, s7 |
| 181 | +; GFX1250-NEXT: buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS |
| 182 | +; GFX1250-NEXT: s_wait_loadcnt 0x0 |
| 183 | +; GFX1250-NEXT: buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS |
| 184 | +; GFX1250-NEXT: s_wait_loadcnt 0x0 |
| 185 | +; GFX1250-NEXT: buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS |
| 186 | +; GFX1250-NEXT: s_wait_loadcnt 0x0 |
| 187 | +; GFX1250-NEXT: s_mov_b32 s8, s0 |
| 188 | +; GFX1250-NEXT: s_mov_b32 s9, s1 |
| 189 | +; GFX1250-NEXT: v_max3_num_f32 v0, v0, v1, v2 |
| 190 | +; GFX1250-NEXT: buffer_store_b32 v0, off, s[8:11], null |
| 191 | +; GFX1250-NEXT: s_endpgm |
160 | 192 | %a = load volatile float, ptr addrspace(1) %aptr, align 4
|
161 | 193 | %b = load volatile float, ptr addrspace(1) %bptr, align 4
|
162 | 194 | %c = load volatile float, ptr addrspace(1) %cptr, align 4
|
@@ -317,6 +349,36 @@ define amdgpu_kernel void @test_fmax3_olt_1_f32(ptr addrspace(1) %out, ptr addrs
|
317 | 349 | ; GFX12-NEXT: v_max3_num_f32 v0, v2, v0, v1
|
318 | 350 | ; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null
|
319 | 351 | ; GFX12-NEXT: s_endpgm
|
| 352 | +; |
| 353 | +; GFX1250-LABEL: test_fmax3_olt_1_f32: |
| 354 | +; GFX1250: ; %bb.0: |
| 355 | +; GFX1250-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 |
| 356 | +; GFX1250-NEXT: s_mov_b32 s10, -1 |
| 357 | +; GFX1250-NEXT: s_mov_b32 s11, 0x31016000 |
| 358 | +; GFX1250-NEXT: s_mov_b32 s14, s10 |
| 359 | +; GFX1250-NEXT: s_mov_b32 s15, s11 |
| 360 | +; GFX1250-NEXT: s_mov_b32 s18, s10 |
| 361 | +; GFX1250-NEXT: s_mov_b32 s19, s11 |
| 362 | +; GFX1250-NEXT: s_mov_b32 s22, s10 |
| 363 | +; GFX1250-NEXT: s_mov_b32 s23, s11 |
| 364 | +; GFX1250-NEXT: s_wait_kmcnt 0x0 |
| 365 | +; GFX1250-NEXT: s_mov_b32 s12, s2 |
| 366 | +; GFX1250-NEXT: s_mov_b32 s13, s3 |
| 367 | +; GFX1250-NEXT: s_mov_b32 s16, s4 |
| 368 | +; GFX1250-NEXT: s_mov_b32 s17, s5 |
| 369 | +; GFX1250-NEXT: s_mov_b32 s20, s6 |
| 370 | +; GFX1250-NEXT: s_mov_b32 s21, s7 |
| 371 | +; GFX1250-NEXT: buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS |
| 372 | +; GFX1250-NEXT: s_wait_loadcnt 0x0 |
| 373 | +; GFX1250-NEXT: buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS |
| 374 | +; GFX1250-NEXT: s_wait_loadcnt 0x0 |
| 375 | +; GFX1250-NEXT: buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS |
| 376 | +; GFX1250-NEXT: s_wait_loadcnt 0x0 |
| 377 | +; GFX1250-NEXT: s_mov_b32 s8, s0 |
| 378 | +; GFX1250-NEXT: s_mov_b32 s9, s1 |
| 379 | +; GFX1250-NEXT: v_max3_num_f32 v0, v2, v0, v1 |
| 380 | +; GFX1250-NEXT: buffer_store_b32 v0, off, s[8:11], null |
| 381 | +; GFX1250-NEXT: s_endpgm |
320 | 382 | %a = load volatile float, ptr addrspace(1) %aptr, align 4
|
321 | 383 | %b = load volatile float, ptr addrspace(1) %bptr, align 4
|
322 | 384 | %c = load volatile float, ptr addrspace(1) %cptr, align 4
|
@@ -544,6 +606,66 @@ define amdgpu_kernel void @test_fmax3_olt_0_f16(ptr addrspace(1) %out, ptr addrs
|
544 | 606 | ; GFX12-FAKE16-NEXT: v_max3_num_f16 v0, v0, v1, v2
|
545 | 607 | ; GFX12-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], null
|
546 | 608 | ; GFX12-FAKE16-NEXT: s_endpgm
|
| 609 | +; |
| 610 | +; GFX1250-TRUE16-LABEL: test_fmax3_olt_0_f16: |
| 611 | +; GFX1250-TRUE16: ; %bb.0: |
| 612 | +; GFX1250-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 |
| 613 | +; GFX1250-TRUE16-NEXT: s_mov_b32 s10, -1 |
| 614 | +; GFX1250-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 |
| 615 | +; GFX1250-TRUE16-NEXT: s_mov_b32 s14, s10 |
| 616 | +; GFX1250-TRUE16-NEXT: s_mov_b32 s15, s11 |
| 617 | +; GFX1250-TRUE16-NEXT: s_mov_b32 s18, s10 |
| 618 | +; GFX1250-TRUE16-NEXT: s_mov_b32 s19, s11 |
| 619 | +; GFX1250-TRUE16-NEXT: s_mov_b32 s22, s10 |
| 620 | +; GFX1250-TRUE16-NEXT: s_mov_b32 s23, s11 |
| 621 | +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 |
| 622 | +; GFX1250-TRUE16-NEXT: s_mov_b32 s12, s2 |
| 623 | +; GFX1250-TRUE16-NEXT: s_mov_b32 s13, s3 |
| 624 | +; GFX1250-TRUE16-NEXT: s_mov_b32 s16, s4 |
| 625 | +; GFX1250-TRUE16-NEXT: s_mov_b32 s17, s5 |
| 626 | +; GFX1250-TRUE16-NEXT: s_mov_b32 s20, s6 |
| 627 | +; GFX1250-TRUE16-NEXT: s_mov_b32 s21, s7 |
| 628 | +; GFX1250-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS |
| 629 | +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 |
| 630 | +; GFX1250-TRUE16-NEXT: buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS |
| 631 | +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 |
| 632 | +; GFX1250-TRUE16-NEXT: buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS |
| 633 | +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 |
| 634 | +; GFX1250-TRUE16-NEXT: s_mov_b32 s8, s0 |
| 635 | +; GFX1250-TRUE16-NEXT: s_mov_b32 s9, s1 |
| 636 | +; GFX1250-TRUE16-NEXT: v_max3_num_f16 v0.l, v0.l, v1.l, v2.l |
| 637 | +; GFX1250-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], null |
| 638 | +; GFX1250-TRUE16-NEXT: s_endpgm |
| 639 | +; |
| 640 | +; GFX1250-FAKE16-LABEL: test_fmax3_olt_0_f16: |
| 641 | +; GFX1250-FAKE16: ; %bb.0: |
| 642 | +; GFX1250-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 |
| 643 | +; GFX1250-FAKE16-NEXT: s_mov_b32 s10, -1 |
| 644 | +; GFX1250-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 |
| 645 | +; GFX1250-FAKE16-NEXT: s_mov_b32 s14, s10 |
| 646 | +; GFX1250-FAKE16-NEXT: s_mov_b32 s15, s11 |
| 647 | +; GFX1250-FAKE16-NEXT: s_mov_b32 s18, s10 |
| 648 | +; GFX1250-FAKE16-NEXT: s_mov_b32 s19, s11 |
| 649 | +; GFX1250-FAKE16-NEXT: s_mov_b32 s22, s10 |
| 650 | +; GFX1250-FAKE16-NEXT: s_mov_b32 s23, s11 |
| 651 | +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 |
| 652 | +; GFX1250-FAKE16-NEXT: s_mov_b32 s12, s2 |
| 653 | +; GFX1250-FAKE16-NEXT: s_mov_b32 s13, s3 |
| 654 | +; GFX1250-FAKE16-NEXT: s_mov_b32 s16, s4 |
| 655 | +; GFX1250-FAKE16-NEXT: s_mov_b32 s17, s5 |
| 656 | +; GFX1250-FAKE16-NEXT: s_mov_b32 s20, s6 |
| 657 | +; GFX1250-FAKE16-NEXT: s_mov_b32 s21, s7 |
| 658 | +; GFX1250-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS |
| 659 | +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 |
| 660 | +; GFX1250-FAKE16-NEXT: buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS |
| 661 | +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 |
| 662 | +; GFX1250-FAKE16-NEXT: buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS |
| 663 | +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 |
| 664 | +; GFX1250-FAKE16-NEXT: s_mov_b32 s8, s0 |
| 665 | +; GFX1250-FAKE16-NEXT: s_mov_b32 s9, s1 |
| 666 | +; GFX1250-FAKE16-NEXT: v_max3_num_f16 v0, v0, v1, v2 |
| 667 | +; GFX1250-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], null |
| 668 | +; GFX1250-FAKE16-NEXT: s_endpgm |
547 | 669 | %a = load volatile half, ptr addrspace(1) %aptr, align 2
|
548 | 670 | %b = load volatile half, ptr addrspace(1) %bptr, align 2
|
549 | 671 | %c = load volatile half, ptr addrspace(1) %cptr, align 2
|
@@ -772,6 +894,66 @@ define amdgpu_kernel void @test_fmax3_olt_1_f16(ptr addrspace(1) %out, ptr addrs
|
772 | 894 | ; GFX12-FAKE16-NEXT: v_max3_num_f16 v0, v2, v0, v1
|
773 | 895 | ; GFX12-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], null
|
774 | 896 | ; GFX12-FAKE16-NEXT: s_endpgm
|
| 897 | +; |
| 898 | +; GFX1250-TRUE16-LABEL: test_fmax3_olt_1_f16: |
| 899 | +; GFX1250-TRUE16: ; %bb.0: |
| 900 | +; GFX1250-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 |
| 901 | +; GFX1250-TRUE16-NEXT: s_mov_b32 s10, -1 |
| 902 | +; GFX1250-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 |
| 903 | +; GFX1250-TRUE16-NEXT: s_mov_b32 s14, s10 |
| 904 | +; GFX1250-TRUE16-NEXT: s_mov_b32 s15, s11 |
| 905 | +; GFX1250-TRUE16-NEXT: s_mov_b32 s18, s10 |
| 906 | +; GFX1250-TRUE16-NEXT: s_mov_b32 s19, s11 |
| 907 | +; GFX1250-TRUE16-NEXT: s_mov_b32 s22, s10 |
| 908 | +; GFX1250-TRUE16-NEXT: s_mov_b32 s23, s11 |
| 909 | +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 |
| 910 | +; GFX1250-TRUE16-NEXT: s_mov_b32 s12, s2 |
| 911 | +; GFX1250-TRUE16-NEXT: s_mov_b32 s13, s3 |
| 912 | +; GFX1250-TRUE16-NEXT: s_mov_b32 s16, s4 |
| 913 | +; GFX1250-TRUE16-NEXT: s_mov_b32 s17, s5 |
| 914 | +; GFX1250-TRUE16-NEXT: s_mov_b32 s20, s6 |
| 915 | +; GFX1250-TRUE16-NEXT: s_mov_b32 s21, s7 |
| 916 | +; GFX1250-TRUE16-NEXT: buffer_load_u16 v1, off, s[12:15], null scope:SCOPE_SYS |
| 917 | +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 |
| 918 | +; GFX1250-TRUE16-NEXT: buffer_load_u16 v2, off, s[16:19], null scope:SCOPE_SYS |
| 919 | +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 |
| 920 | +; GFX1250-TRUE16-NEXT: buffer_load_u16 v0, off, s[20:23], null scope:SCOPE_SYS |
| 921 | +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 |
| 922 | +; GFX1250-TRUE16-NEXT: s_mov_b32 s8, s0 |
| 923 | +; GFX1250-TRUE16-NEXT: s_mov_b32 s9, s1 |
| 924 | +; GFX1250-TRUE16-NEXT: v_max3_num_f16 v0.l, v0.l, v1.l, v2.l |
| 925 | +; GFX1250-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], null |
| 926 | +; GFX1250-TRUE16-NEXT: s_endpgm |
| 927 | +; |
| 928 | +; GFX1250-FAKE16-LABEL: test_fmax3_olt_1_f16: |
| 929 | +; GFX1250-FAKE16: ; %bb.0: |
| 930 | +; GFX1250-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 |
| 931 | +; GFX1250-FAKE16-NEXT: s_mov_b32 s10, -1 |
| 932 | +; GFX1250-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 |
| 933 | +; GFX1250-FAKE16-NEXT: s_mov_b32 s14, s10 |
| 934 | +; GFX1250-FAKE16-NEXT: s_mov_b32 s15, s11 |
| 935 | +; GFX1250-FAKE16-NEXT: s_mov_b32 s18, s10 |
| 936 | +; GFX1250-FAKE16-NEXT: s_mov_b32 s19, s11 |
| 937 | +; GFX1250-FAKE16-NEXT: s_mov_b32 s22, s10 |
| 938 | +; GFX1250-FAKE16-NEXT: s_mov_b32 s23, s11 |
| 939 | +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 |
| 940 | +; GFX1250-FAKE16-NEXT: s_mov_b32 s12, s2 |
| 941 | +; GFX1250-FAKE16-NEXT: s_mov_b32 s13, s3 |
| 942 | +; GFX1250-FAKE16-NEXT: s_mov_b32 s16, s4 |
| 943 | +; GFX1250-FAKE16-NEXT: s_mov_b32 s17, s5 |
| 944 | +; GFX1250-FAKE16-NEXT: s_mov_b32 s20, s6 |
| 945 | +; GFX1250-FAKE16-NEXT: s_mov_b32 s21, s7 |
| 946 | +; GFX1250-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS |
| 947 | +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 |
| 948 | +; GFX1250-FAKE16-NEXT: buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS |
| 949 | +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 |
| 950 | +; GFX1250-FAKE16-NEXT: buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS |
| 951 | +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 |
| 952 | +; GFX1250-FAKE16-NEXT: s_mov_b32 s8, s0 |
| 953 | +; GFX1250-FAKE16-NEXT: s_mov_b32 s9, s1 |
| 954 | +; GFX1250-FAKE16-NEXT: v_max3_num_f16 v0, v2, v0, v1 |
| 955 | +; GFX1250-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], null |
| 956 | +; GFX1250-FAKE16-NEXT: s_endpgm |
775 | 957 | %a = load volatile half, ptr addrspace(1) %aptr, align 2
|
776 | 958 | %b = load volatile half, ptr addrspace(1) %bptr, align 2
|
777 | 959 | %c = load volatile half, ptr addrspace(1) %cptr, align 2
|
@@ -850,6 +1032,15 @@ define <2 x half> @no_fmax3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <
|
850 | 1032 | ; GFX12-NEXT: v_pk_max_num_f16 v0, v2, v0
|
851 | 1033 | ; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v3
|
852 | 1034 | ; GFX12-NEXT: s_setpc_b64 s[30:31]
|
| 1035 | +; |
| 1036 | +; GFX1250-LABEL: no_fmax3_v2f16: |
| 1037 | +; GFX1250: ; %bb.0: ; %entry |
| 1038 | +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 |
| 1039 | +; GFX1250-NEXT: s_wait_kmcnt 0x0 |
| 1040 | +; GFX1250-NEXT: v_pk_max_num_f16 v0, v0, v1 |
| 1041 | +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| 1042 | +; GFX1250-NEXT: v_pk_max3_num_f16 v0, v2, v0, v3 |
| 1043 | +; GFX1250-NEXT: s_set_pc_i64 s[30:31] |
853 | 1044 | entry:
|
854 | 1045 | %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b)
|
855 | 1046 | %max1 = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %c, <2 x half> %max)
|
|
0 commit comments