diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index c8c36714909ad..a88e1cc2f5307 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -1241,6 +1241,11 @@ let True16Predicate = NotHasTrue16BitInsts, SubtargetPredicate = isGFX10Plus in >; } // End True16Predicate = NotHasTrue16BitInsts, SubtargetPredicate = isGFX10Plus +let True16Predicate = UseRealTrue16Insts in { + def : OpSelBinOpClampPat; + def : OpSelBinOpClampPat; +} // End True16Predicate = UseRealTrue16Insts + let True16Predicate = UseFakeTrue16Insts in { def : OpSelBinOpClampPat; def : OpSelBinOpClampPat; diff --git a/llvm/test/CodeGen/AMDGPU/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/uaddsat.ll index b714fda6f1d07..2775de29368fb 100644 --- a/llvm/test/CodeGen/AMDGPU/uaddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/uaddsat.ll @@ -3,6 +3,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck --check-prefix=GFX8 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck --check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck --check-prefixes=GFX11,GFX11-FAKE16 %s define i8 @v_uaddsat_i8(i8 %lhs, i8 %rhs) { ; GFX6-LABEL: v_uaddsat_i8: @@ -36,6 +38,28 @@ define i8 @v_uaddsat_i8(i8 %lhs, i8 %rhs) { ; GFX10-NEXT: v_add_nc_u16 v0, v0, v1 ; GFX10-NEXT: v_min_u16 v0, 0xff, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: v_uaddsat_i8: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_min_u16 v0.l, 0xff, v0.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_uaddsat_i8: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v0, v1 +; GFX11-FAKE16-NEXT: v_min_u16 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %result = call i8 @llvm.uadd.sat.i8(i8 %lhs, i8 %rhs) ret i8 %result } @@ -67,6 +91,20 @@ define i16 @v_uaddsat_i16(i16 %lhs, i16 %rhs) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_nc_u16 v0, v0, v1 clamp ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: v_uaddsat_i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h clamp +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_uaddsat_i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v0, v1 clamp +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %result = call i16 @llvm.uadd.sat.i16(i16 %lhs, i16 %rhs) ret i16 %result } @@ -97,6 +135,12 @@ define i32 @v_uaddsat_i32(i32 %lhs, i32 %rhs) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e64 v0, v0, v1 clamp ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_uaddsat_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_nc_u32_e64 v0, v0, v1 clamp +; GFX11-NEXT: 
s_setpc_b64 s[30:31] %result = call i32 @llvm.uadd.sat.i32(i32 %lhs, i32 %rhs) ret i32 %result } @@ -136,6 +180,12 @@ define <2 x i16> @v_uaddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_pk_add_u16 v0, v0, v1 clamp ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_uaddsat_v2i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_add_u16 v0, v0, v1 clamp +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs) ret <2 x i16> %result } @@ -184,6 +234,13 @@ define <3 x i16> @v_uaddsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) { ; GFX10-NEXT: v_pk_add_u16 v0, v0, v2 clamp ; GFX10-NEXT: v_pk_add_u16 v1, v1, v3 clamp ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_uaddsat_v3i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_add_u16 v0, v0, v2 clamp +; GFX11-NEXT: v_pk_add_u16 v1, v1, v3 clamp +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <3 x i16> @llvm.uadd.sat.v3i16(<3 x i16> %lhs, <3 x i16> %rhs) ret <3 x i16> %result } @@ -238,6 +295,13 @@ define <2 x float> @v_uaddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; GFX10-NEXT: v_pk_add_u16 v0, v0, v2 clamp ; GFX10-NEXT: v_pk_add_u16 v1, v1, v3 clamp ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_uaddsat_v4i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_add_u16 v0, v0, v2 clamp +; GFX11-NEXT: v_pk_add_u16 v1, v1, v3 clamp +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) %cast = bitcast <4 x i16> %result to <2 x float> ret <2 x float> %cast @@ -275,6 +339,13 @@ define <2 x i32> @v_uaddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { ; GFX10-NEXT: v_add_nc_u32_e64 v0, v0, v2 clamp ; GFX10-NEXT: v_add_nc_u32_e64 v1, v1, v3 clamp ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: 
v_uaddsat_v2i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_nc_u32_e64 v0, v0, v2 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v1, v1, v3 clamp +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) ret <2 x i32> %result } @@ -317,6 +388,14 @@ define <3 x i32> @v_uaddsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) { ; GFX10-NEXT: v_add_nc_u32_e64 v1, v1, v4 clamp ; GFX10-NEXT: v_add_nc_u32_e64 v2, v2, v5 clamp ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_uaddsat_v3i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_nc_u32_e64 v0, v0, v3 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v1, v1, v4 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v2, v2, v5 clamp +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <3 x i32> @llvm.uadd.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs) ret <3 x i32> %result } @@ -365,6 +444,15 @@ define <4 x i32> @v_uaddsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { ; GFX10-NEXT: v_add_nc_u32_e64 v2, v2, v6 clamp ; GFX10-NEXT: v_add_nc_u32_e64 v3, v3, v7 clamp ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_uaddsat_v4i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_nc_u32_e64 v0, v0, v4 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v1, v1, v5 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v2, v2, v6 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v3, v3, v7 clamp +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) ret <4 x i32> %result } @@ -437,6 +525,19 @@ define <8 x i32> @v_uaddsat_v8i32(<8 x i32> %lhs, <8 x i32> %rhs) { ; GFX10-NEXT: v_add_nc_u32_e64 v6, v6, v14 clamp ; GFX10-NEXT: v_add_nc_u32_e64 v7, v7, v15 clamp ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_uaddsat_v8i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_nc_u32_e64 v0, v0, v8 clamp +; 
GFX11-NEXT: v_add_nc_u32_e64 v1, v1, v9 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v2, v2, v10 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v3, v3, v11 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v4, v4, v12 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v5, v5, v13 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v6, v6, v14 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v7, v7, v15 clamp +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> %lhs, <8 x i32> %rhs) ret <8 x i32> %result } @@ -565,6 +666,29 @@ define <16 x i32> @v_uaddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e64 v15, v15, v31 clamp ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_uaddsat_v16i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: v_add_nc_u32_e64 v0, v0, v16 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v1, v1, v17 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v2, v2, v18 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v3, v3, v19 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v4, v4, v20 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v5, v5, v21 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v6, v6, v22 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v7, v7, v23 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v8, v8, v24 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v9, v9, v25 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v10, v10, v26 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v11, v11, v27 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v12, v12, v28 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v13, v13, v29 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v14, v14, v30 clamp +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_nc_u32_e64 v15, v15, v31 clamp +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs) ret <16 x i32> %result } @@ -610,6 +734,17 @@ define i64 @v_uaddsat_i64(i64 %lhs, i64 %rhs) { ; GFX10-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, 
v3, -1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_uaddsat_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v0, v2 +; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v1, v3, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[0:1] +; GFX11-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, v3, -1, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i64 @llvm.uadd.sat.i64(i64 %lhs, i64 %rhs) ret i64 %result } diff --git a/llvm/test/CodeGen/AMDGPU/usubsat.ll b/llvm/test/CodeGen/AMDGPU/usubsat.ll index 8cc7025d671c4..775602ab80cde 100644 --- a/llvm/test/CodeGen/AMDGPU/usubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/usubsat.ll @@ -3,7 +3,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck --check-prefix=GFX8 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10PLUS,GFX10 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GFX10PLUS,GFX11 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GFX10PLUS,GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GFX10PLUS,GFX11,GFX11-FAKE16 %s define i8 @v_usubsat_i8(i8 %lhs, i8 %rhs) { ; GFX6-LABEL: v_usubsat_i8: @@ -27,13 +28,30 @@ define i8 @v_usubsat_i8(i8 %lhs, i8 %rhs) { ; GFX9-NEXT: v_sub_u16_sdwa v0, v0, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10PLUS-LABEL: v_usubsat_i8: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: v_and_b32_e32 
v1, 0xff, v1 -; GFX10PLUS-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX10PLUS-NEXT: v_sub_nc_u16 v0, v0, v1 clamp -; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: v_usubsat_i8: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX10-NEXT: v_sub_nc_u16 v0, v0, v1 clamp +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: v_usubsat_i8: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.l, v0.l, v0.h clamp +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_usubsat_i8: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_sub_nc_u16 v0, v0, v1 clamp +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %result = call i8 @llvm.usub.sat.i8(i8 %lhs, i8 %rhs) ret i8 %result } @@ -60,11 +78,24 @@ define i16 @v_usubsat_i16(i16 %lhs, i16 %rhs) { ; GFX9-NEXT: v_sub_u16_e64 v0, v0, v1 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10PLUS-LABEL: v_usubsat_i16: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: v_sub_nc_u16 v0, v0, v1 clamp -; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: v_usubsat_i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_sub_nc_u16 v0, v0, v1 clamp +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: v_usubsat_i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.l, v0.l, v0.h clamp +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX11-FAKE16-LABEL: v_usubsat_i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_sub_nc_u16 v0, v0, v1 clamp +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %result = call i16 @llvm.usub.sat.i16(i16 %lhs, i16 %rhs) ret i16 %result } @@ -93,11 +124,23 @@ define i16 @usubsat_as_bithack_i16(i16 %x) { ; GFX9-NEXT: v_sub_u16_e64 v0, v0, s4 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10PLUS-LABEL: usubsat_as_bithack_i16: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: v_sub_nc_u16 v0, v0, 0x8000 clamp -; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: usubsat_as_bithack_i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_sub_nc_u16 v0, v0, 0x8000 clamp +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: usubsat_as_bithack_i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.l, v0.l, 0x8000 clamp +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: usubsat_as_bithack_i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_sub_nc_u16 v0, v0, 0x8000 clamp +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %signsplat = ashr i16 %x, 15 %flipsign = xor i16 %x, 32768 %result = and i16 %signsplat, %flipsign @@ -128,11 +171,23 @@ define i16 @usubsat_as_bithack2_i16(i16 %x) { ; GFX9-NEXT: v_sub_u16_e64 v0, v0, s4 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10PLUS-LABEL: usubsat_as_bithack2_i16: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: v_sub_nc_u16 v0, v0, 0x8000 clamp -; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: usubsat_as_bithack2_i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_sub_nc_u16 v0, v0, 0x8000 clamp +; GFX10-NEXT: s_setpc_b64 
s[30:31] +; +; GFX11-TRUE16-LABEL: usubsat_as_bithack2_i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.l, v0.l, 0x8000 clamp +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: usubsat_as_bithack2_i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_sub_nc_u16 v0, v0, 0x8000 clamp +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %signsplat = ashr i16 %x, 15 %flipsign = add i16 %x, 32768 %result = and i16 %signsplat, %flipsign @@ -163,11 +218,23 @@ define i16 @usubsat_as_bithack_commute_i16(i16 %x) { ; GFX9-NEXT: v_sub_u16_e64 v0, v0, s4 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10PLUS-LABEL: usubsat_as_bithack_commute_i16: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: v_sub_nc_u16 v0, v0, 0x8000 clamp -; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: usubsat_as_bithack_commute_i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_sub_nc_u16 v0, v0, 0x8000 clamp +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: usubsat_as_bithack_commute_i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.l, v0.l, 0x8000 clamp +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: usubsat_as_bithack_commute_i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_sub_nc_u16 v0, v0, 0x8000 clamp +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %signsplat = ashr i16 %x, 15 %flipsign = add i16 %x, 32768 %result = and i16 %flipsign, %signsplat