Skip to content

Commit 4fee112

Browse files
changpeng and rampitec
authored and committed
[AMDGPU] Support V_PK_MIN3/MAX3_NUM_F16 on gfx1250 (llvm#150326)
Co-authored-by: Stanislav Mekhanoshin <[email protected]>
1 parent e3ee9ad commit 4fee112

File tree

8 files changed

+710
-2
lines changed

8 files changed

+710
-2
lines changed

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -167,6 +167,12 @@ def FeatureMinimum3Maximum3F16 : SubtargetFeature<"minimum3-maximum3-f16",
167167
"Has v_minimum3_f16 and v_maximum3_f16 instructions"
168168
>;
169169

170+
def FeatureMin3Max3PKF16 : SubtargetFeature<"min3-max3-pkf16",
171+
"HasMin3Max3PKF16",
172+
"true",
173+
"Has v_pk_min3_num_f16 and v_pk_max3_num_f16 instructions"
174+
>;
175+
170176
def FeatureMinimum3Maximum3PKF16 : SubtargetFeature<"minimum3-maximum3-pkf16",
171177
"HasMinimum3Maximum3PKF16",
172178
"true",
@@ -2001,6 +2007,7 @@ def FeatureISAVersion12_50 : FeatureSet<
20012007
FeatureBF16ConversionInsts,
20022008
FeatureBF16PackedInsts,
20032009
FeatureCvtPkF16F32Inst,
2010+
FeatureMin3Max3PKF16,
20042011
FeatureMinimum3Maximum3PKF16,
20052012
FeaturePrngInst,
20062013
FeaturePermlane16Swap,
@@ -2361,6 +2368,10 @@ def HasMinimum3Maximum3F16 :
23612368
Predicate<"Subtarget->hasMinimum3Maximum3F16()">,
23622369
AssemblerPredicate<(all_of FeatureMinimum3Maximum3F16)>;
23632370

2371+
def HasMin3Max3PKF16 :
2372+
Predicate<"Subtarget->hasMin3Max3PKF16()">,
2373+
AssemblerPredicate<(all_of FeatureMin3Max3PKF16)>;
2374+
23642375
def HasMinimum3Maximum3PKF16 :
23652376
Predicate<"Subtarget->hasMinimum3Maximum3PKF16()">,
23662377
AssemblerPredicate<(all_of FeatureMinimum3Maximum3PKF16)>;

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -265,6 +265,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
265265
bool HasIEEEMinimumMaximumInsts = false;
266266
bool HasMinimum3Maximum3F32 = false;
267267
bool HasMinimum3Maximum3F16 = false;
268+
bool HasMin3Max3PKF16 = false;
268269
bool HasMinimum3Maximum3PKF16 = false;
269270
bool HasLshlAddU64Inst = false;
270271
bool HasAddSubU64Insts = false;
@@ -1388,6 +1389,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
13881389
return HasMinimum3Maximum3F16;
13891390
}
13901391

1392+
bool hasMin3Max3PKF16() const { return HasMin3Max3PKF16; }
1393+
13911394
bool hasTanhInsts() const { return HasTanhInsts; }
13921395

13931396
bool hasAddPC64Inst() const { return GFX1250Insts; }

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -14068,7 +14068,8 @@ static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
1406814068
case ISD::FMAXIMUMNUM:
1406914069
case AMDGPUISD::FMIN_LEGACY:
1407014070
case AMDGPUISD::FMAX_LEGACY:
14071-
return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16());
14071+
return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16()) ||
14072+
(VT == MVT::v2f16 && Subtarget.hasMin3Max3PKF16());
1407214073
case ISD::FMINIMUM:
1407314074
case ISD::FMAXIMUM:
1407414075
return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) ||

llvm/lib/Target/AMDGPU/VOP3PInstructions.td

Lines changed: 10 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -144,10 +144,17 @@ def : VOP3PSatPat<usubsat, V_PK_SUB_U16>;
144144
def : VOP3PSatPat<ssubsat, V_PK_SUB_I16>;
145145
} // End SubtargetPredicate = HasVOP3PInsts
146146

147-
let SubtargetPredicate = HasMinimum3Maximum3PKF16, FPDPRounding = 1 in {
147+
let isCommutable = 1, FPDPRounding = 1 in {
148+
let SubtargetPredicate = HasMin3Max3PKF16 in {
149+
defm V_PK_MIN3_NUM_F16 : VOP3PInst<"v_pk_min3_num_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, AMDGPUfmin3>;
150+
defm V_PK_MAX3_NUM_F16 : VOP3PInst<"v_pk_max3_num_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, AMDGPUfmax3>;
151+
}
152+
153+
let SubtargetPredicate = HasMinimum3Maximum3PKF16 in {
148154
defm V_PK_MINIMUM3_F16 : VOP3PInst<"v_pk_minimum3_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, AMDGPUfminimum3>;
149155
defm V_PK_MAXIMUM3_F16 : VOP3PInst<"v_pk_maximum3_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, AMDGPUfmaximum3>;
150156
}
157+
} // End isCommutable = 1, FPDPRounding = 1
151158

152159
// TODO: Make sure we're doing the right thing with denormals. Note
153160
// that FMA and MAD will differ.
@@ -2237,6 +2244,8 @@ defm V_PK_MIN_NUM_BF16 : VOP3P_Real_gfx1250<0x2b>;
22372244
defm V_PK_MAX_NUM_BF16 : VOP3P_Real_gfx1250<0x2c>;
22382245
defm V_PK_MINIMUM3_F16 : VOP3P_Real_gfx1250<0x36>;
22392246
defm V_PK_MAXIMUM3_F16 : VOP3P_Real_gfx1250<0x37>;
2247+
defm V_PK_MIN3_NUM_F16 : VOP3P_Real_gfx1250<0x38>;
2248+
defm V_PK_MAX3_NUM_F16 : VOP3P_Real_gfx1250<0x39>;
22402249

22412250
defm V_PK_MINIMUM_F16 : VOP3P_Real_gfx12<0x1d>;
22422251
defm V_PK_MAXIMUM_F16 : VOP3P_Real_gfx12<0x1e>;

llvm/test/CodeGen/AMDGPU/fmax3.ll

Lines changed: 191 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,8 @@
66
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16 %s
77
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12-TRUE16 %s
88
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12-FAKE16 %s
9+
; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GFX1250,GFX1250-TRUE16 %s
10+
; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GFX1250,GFX1250-FAKE16 %s
911

1012
define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
1113
; SI-LABEL: test_fmax3_olt_0_f32:
@@ -157,6 +159,36 @@ define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrs
157159
; GFX12-NEXT: v_max3_num_f32 v0, v0, v1, v2
158160
; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null
159161
; GFX12-NEXT: s_endpgm
162+
;
163+
; GFX1250-LABEL: test_fmax3_olt_0_f32:
164+
; GFX1250: ; %bb.0:
165+
; GFX1250-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
166+
; GFX1250-NEXT: s_mov_b32 s10, -1
167+
; GFX1250-NEXT: s_mov_b32 s11, 0x31016000
168+
; GFX1250-NEXT: s_mov_b32 s14, s10
169+
; GFX1250-NEXT: s_mov_b32 s15, s11
170+
; GFX1250-NEXT: s_mov_b32 s18, s10
171+
; GFX1250-NEXT: s_mov_b32 s19, s11
172+
; GFX1250-NEXT: s_mov_b32 s22, s10
173+
; GFX1250-NEXT: s_mov_b32 s23, s11
174+
; GFX1250-NEXT: s_wait_kmcnt 0x0
175+
; GFX1250-NEXT: s_mov_b32 s12, s2
176+
; GFX1250-NEXT: s_mov_b32 s13, s3
177+
; GFX1250-NEXT: s_mov_b32 s16, s4
178+
; GFX1250-NEXT: s_mov_b32 s17, s5
179+
; GFX1250-NEXT: s_mov_b32 s20, s6
180+
; GFX1250-NEXT: s_mov_b32 s21, s7
181+
; GFX1250-NEXT: buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS
182+
; GFX1250-NEXT: s_wait_loadcnt 0x0
183+
; GFX1250-NEXT: buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS
184+
; GFX1250-NEXT: s_wait_loadcnt 0x0
185+
; GFX1250-NEXT: buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS
186+
; GFX1250-NEXT: s_wait_loadcnt 0x0
187+
; GFX1250-NEXT: s_mov_b32 s8, s0
188+
; GFX1250-NEXT: s_mov_b32 s9, s1
189+
; GFX1250-NEXT: v_max3_num_f32 v0, v0, v1, v2
190+
; GFX1250-NEXT: buffer_store_b32 v0, off, s[8:11], null
191+
; GFX1250-NEXT: s_endpgm
160192
%a = load volatile float, ptr addrspace(1) %aptr, align 4
161193
%b = load volatile float, ptr addrspace(1) %bptr, align 4
162194
%c = load volatile float, ptr addrspace(1) %cptr, align 4
@@ -317,6 +349,36 @@ define amdgpu_kernel void @test_fmax3_olt_1_f32(ptr addrspace(1) %out, ptr addrs
317349
; GFX12-NEXT: v_max3_num_f32 v0, v2, v0, v1
318350
; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null
319351
; GFX12-NEXT: s_endpgm
352+
;
353+
; GFX1250-LABEL: test_fmax3_olt_1_f32:
354+
; GFX1250: ; %bb.0:
355+
; GFX1250-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
356+
; GFX1250-NEXT: s_mov_b32 s10, -1
357+
; GFX1250-NEXT: s_mov_b32 s11, 0x31016000
358+
; GFX1250-NEXT: s_mov_b32 s14, s10
359+
; GFX1250-NEXT: s_mov_b32 s15, s11
360+
; GFX1250-NEXT: s_mov_b32 s18, s10
361+
; GFX1250-NEXT: s_mov_b32 s19, s11
362+
; GFX1250-NEXT: s_mov_b32 s22, s10
363+
; GFX1250-NEXT: s_mov_b32 s23, s11
364+
; GFX1250-NEXT: s_wait_kmcnt 0x0
365+
; GFX1250-NEXT: s_mov_b32 s12, s2
366+
; GFX1250-NEXT: s_mov_b32 s13, s3
367+
; GFX1250-NEXT: s_mov_b32 s16, s4
368+
; GFX1250-NEXT: s_mov_b32 s17, s5
369+
; GFX1250-NEXT: s_mov_b32 s20, s6
370+
; GFX1250-NEXT: s_mov_b32 s21, s7
371+
; GFX1250-NEXT: buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS
372+
; GFX1250-NEXT: s_wait_loadcnt 0x0
373+
; GFX1250-NEXT: buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS
374+
; GFX1250-NEXT: s_wait_loadcnt 0x0
375+
; GFX1250-NEXT: buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS
376+
; GFX1250-NEXT: s_wait_loadcnt 0x0
377+
; GFX1250-NEXT: s_mov_b32 s8, s0
378+
; GFX1250-NEXT: s_mov_b32 s9, s1
379+
; GFX1250-NEXT: v_max3_num_f32 v0, v2, v0, v1
380+
; GFX1250-NEXT: buffer_store_b32 v0, off, s[8:11], null
381+
; GFX1250-NEXT: s_endpgm
320382
%a = load volatile float, ptr addrspace(1) %aptr, align 4
321383
%b = load volatile float, ptr addrspace(1) %bptr, align 4
322384
%c = load volatile float, ptr addrspace(1) %cptr, align 4
@@ -544,6 +606,66 @@ define amdgpu_kernel void @test_fmax3_olt_0_f16(ptr addrspace(1) %out, ptr addrs
544606
; GFX12-FAKE16-NEXT: v_max3_num_f16 v0, v0, v1, v2
545607
; GFX12-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], null
546608
; GFX12-FAKE16-NEXT: s_endpgm
609+
;
610+
; GFX1250-TRUE16-LABEL: test_fmax3_olt_0_f16:
611+
; GFX1250-TRUE16: ; %bb.0:
612+
; GFX1250-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
613+
; GFX1250-TRUE16-NEXT: s_mov_b32 s10, -1
614+
; GFX1250-TRUE16-NEXT: s_mov_b32 s11, 0x31016000
615+
; GFX1250-TRUE16-NEXT: s_mov_b32 s14, s10
616+
; GFX1250-TRUE16-NEXT: s_mov_b32 s15, s11
617+
; GFX1250-TRUE16-NEXT: s_mov_b32 s18, s10
618+
; GFX1250-TRUE16-NEXT: s_mov_b32 s19, s11
619+
; GFX1250-TRUE16-NEXT: s_mov_b32 s22, s10
620+
; GFX1250-TRUE16-NEXT: s_mov_b32 s23, s11
621+
; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
622+
; GFX1250-TRUE16-NEXT: s_mov_b32 s12, s2
623+
; GFX1250-TRUE16-NEXT: s_mov_b32 s13, s3
624+
; GFX1250-TRUE16-NEXT: s_mov_b32 s16, s4
625+
; GFX1250-TRUE16-NEXT: s_mov_b32 s17, s5
626+
; GFX1250-TRUE16-NEXT: s_mov_b32 s20, s6
627+
; GFX1250-TRUE16-NEXT: s_mov_b32 s21, s7
628+
; GFX1250-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
629+
; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
630+
; GFX1250-TRUE16-NEXT: buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
631+
; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
632+
; GFX1250-TRUE16-NEXT: buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
633+
; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
634+
; GFX1250-TRUE16-NEXT: s_mov_b32 s8, s0
635+
; GFX1250-TRUE16-NEXT: s_mov_b32 s9, s1
636+
; GFX1250-TRUE16-NEXT: v_max3_num_f16 v0.l, v0.l, v1.l, v2.l
637+
; GFX1250-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], null
638+
; GFX1250-TRUE16-NEXT: s_endpgm
639+
;
640+
; GFX1250-FAKE16-LABEL: test_fmax3_olt_0_f16:
641+
; GFX1250-FAKE16: ; %bb.0:
642+
; GFX1250-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
643+
; GFX1250-FAKE16-NEXT: s_mov_b32 s10, -1
644+
; GFX1250-FAKE16-NEXT: s_mov_b32 s11, 0x31016000
645+
; GFX1250-FAKE16-NEXT: s_mov_b32 s14, s10
646+
; GFX1250-FAKE16-NEXT: s_mov_b32 s15, s11
647+
; GFX1250-FAKE16-NEXT: s_mov_b32 s18, s10
648+
; GFX1250-FAKE16-NEXT: s_mov_b32 s19, s11
649+
; GFX1250-FAKE16-NEXT: s_mov_b32 s22, s10
650+
; GFX1250-FAKE16-NEXT: s_mov_b32 s23, s11
651+
; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
652+
; GFX1250-FAKE16-NEXT: s_mov_b32 s12, s2
653+
; GFX1250-FAKE16-NEXT: s_mov_b32 s13, s3
654+
; GFX1250-FAKE16-NEXT: s_mov_b32 s16, s4
655+
; GFX1250-FAKE16-NEXT: s_mov_b32 s17, s5
656+
; GFX1250-FAKE16-NEXT: s_mov_b32 s20, s6
657+
; GFX1250-FAKE16-NEXT: s_mov_b32 s21, s7
658+
; GFX1250-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
659+
; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
660+
; GFX1250-FAKE16-NEXT: buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
661+
; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
662+
; GFX1250-FAKE16-NEXT: buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
663+
; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
664+
; GFX1250-FAKE16-NEXT: s_mov_b32 s8, s0
665+
; GFX1250-FAKE16-NEXT: s_mov_b32 s9, s1
666+
; GFX1250-FAKE16-NEXT: v_max3_num_f16 v0, v0, v1, v2
667+
; GFX1250-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], null
668+
; GFX1250-FAKE16-NEXT: s_endpgm
547669
%a = load volatile half, ptr addrspace(1) %aptr, align 2
548670
%b = load volatile half, ptr addrspace(1) %bptr, align 2
549671
%c = load volatile half, ptr addrspace(1) %cptr, align 2
@@ -772,6 +894,66 @@ define amdgpu_kernel void @test_fmax3_olt_1_f16(ptr addrspace(1) %out, ptr addrs
772894
; GFX12-FAKE16-NEXT: v_max3_num_f16 v0, v2, v0, v1
773895
; GFX12-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], null
774896
; GFX12-FAKE16-NEXT: s_endpgm
897+
;
898+
; GFX1250-TRUE16-LABEL: test_fmax3_olt_1_f16:
899+
; GFX1250-TRUE16: ; %bb.0:
900+
; GFX1250-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
901+
; GFX1250-TRUE16-NEXT: s_mov_b32 s10, -1
902+
; GFX1250-TRUE16-NEXT: s_mov_b32 s11, 0x31016000
903+
; GFX1250-TRUE16-NEXT: s_mov_b32 s14, s10
904+
; GFX1250-TRUE16-NEXT: s_mov_b32 s15, s11
905+
; GFX1250-TRUE16-NEXT: s_mov_b32 s18, s10
906+
; GFX1250-TRUE16-NEXT: s_mov_b32 s19, s11
907+
; GFX1250-TRUE16-NEXT: s_mov_b32 s22, s10
908+
; GFX1250-TRUE16-NEXT: s_mov_b32 s23, s11
909+
; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
910+
; GFX1250-TRUE16-NEXT: s_mov_b32 s12, s2
911+
; GFX1250-TRUE16-NEXT: s_mov_b32 s13, s3
912+
; GFX1250-TRUE16-NEXT: s_mov_b32 s16, s4
913+
; GFX1250-TRUE16-NEXT: s_mov_b32 s17, s5
914+
; GFX1250-TRUE16-NEXT: s_mov_b32 s20, s6
915+
; GFX1250-TRUE16-NEXT: s_mov_b32 s21, s7
916+
; GFX1250-TRUE16-NEXT: buffer_load_u16 v1, off, s[12:15], null scope:SCOPE_SYS
917+
; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
918+
; GFX1250-TRUE16-NEXT: buffer_load_u16 v2, off, s[16:19], null scope:SCOPE_SYS
919+
; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
920+
; GFX1250-TRUE16-NEXT: buffer_load_u16 v0, off, s[20:23], null scope:SCOPE_SYS
921+
; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
922+
; GFX1250-TRUE16-NEXT: s_mov_b32 s8, s0
923+
; GFX1250-TRUE16-NEXT: s_mov_b32 s9, s1
924+
; GFX1250-TRUE16-NEXT: v_max3_num_f16 v0.l, v0.l, v1.l, v2.l
925+
; GFX1250-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], null
926+
; GFX1250-TRUE16-NEXT: s_endpgm
927+
;
928+
; GFX1250-FAKE16-LABEL: test_fmax3_olt_1_f16:
929+
; GFX1250-FAKE16: ; %bb.0:
930+
; GFX1250-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
931+
; GFX1250-FAKE16-NEXT: s_mov_b32 s10, -1
932+
; GFX1250-FAKE16-NEXT: s_mov_b32 s11, 0x31016000
933+
; GFX1250-FAKE16-NEXT: s_mov_b32 s14, s10
934+
; GFX1250-FAKE16-NEXT: s_mov_b32 s15, s11
935+
; GFX1250-FAKE16-NEXT: s_mov_b32 s18, s10
936+
; GFX1250-FAKE16-NEXT: s_mov_b32 s19, s11
937+
; GFX1250-FAKE16-NEXT: s_mov_b32 s22, s10
938+
; GFX1250-FAKE16-NEXT: s_mov_b32 s23, s11
939+
; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
940+
; GFX1250-FAKE16-NEXT: s_mov_b32 s12, s2
941+
; GFX1250-FAKE16-NEXT: s_mov_b32 s13, s3
942+
; GFX1250-FAKE16-NEXT: s_mov_b32 s16, s4
943+
; GFX1250-FAKE16-NEXT: s_mov_b32 s17, s5
944+
; GFX1250-FAKE16-NEXT: s_mov_b32 s20, s6
945+
; GFX1250-FAKE16-NEXT: s_mov_b32 s21, s7
946+
; GFX1250-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
947+
; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
948+
; GFX1250-FAKE16-NEXT: buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
949+
; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
950+
; GFX1250-FAKE16-NEXT: buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
951+
; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
952+
; GFX1250-FAKE16-NEXT: s_mov_b32 s8, s0
953+
; GFX1250-FAKE16-NEXT: s_mov_b32 s9, s1
954+
; GFX1250-FAKE16-NEXT: v_max3_num_f16 v0, v2, v0, v1
955+
; GFX1250-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], null
956+
; GFX1250-FAKE16-NEXT: s_endpgm
775957
%a = load volatile half, ptr addrspace(1) %aptr, align 2
776958
%b = load volatile half, ptr addrspace(1) %bptr, align 2
777959
%c = load volatile half, ptr addrspace(1) %cptr, align 2
@@ -850,6 +1032,15 @@ define <2 x half> @no_fmax3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <
8501032
; GFX12-NEXT: v_pk_max_num_f16 v0, v2, v0
8511033
; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v3
8521034
; GFX12-NEXT: s_setpc_b64 s[30:31]
1035+
;
1036+
; GFX1250-LABEL: no_fmax3_v2f16:
1037+
; GFX1250: ; %bb.0: ; %entry
1038+
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
1039+
; GFX1250-NEXT: s_wait_kmcnt 0x0
1040+
; GFX1250-NEXT: v_pk_max_num_f16 v0, v0, v1
1041+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
1042+
; GFX1250-NEXT: v_pk_max3_num_f16 v0, v2, v0, v3
1043+
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
8531044
entry:
8541045
%max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b)
8551046
%max1 = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %c, <2 x half> %max)

0 commit comments

Comments
 (0)