From 66b4f7727f9ddb3591a1c6234dc366eced4e9378 Mon Sep 17 00:00:00 2001 From: WANG Xuerui Date: Wed, 30 Oct 2024 11:01:09 +0800 Subject: [PATCH] [LoongArch] Support bswap for LSX/LASX VTs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While the LSX/LASX instruction sets do not seem to include dedicated byteswap functionality, a byteswap can actually be implemented with the low-overhead {,X}VSHUF4I family of instructions, in contrast to the naïvely expanded code sequence, which is very inefficient. --- .../LoongArch/LoongArchISelLowering.cpp | 4 ++ .../LoongArch/LoongArchLASXInstrInfo.td | 6 +++ .../Target/LoongArch/LoongArchLSXInstrInfo.td | 6 +++ llvm/test/CodeGen/LoongArch/lasx/bswap.ll | 48 ++----------------- llvm/test/CodeGen/LoongArch/lsx/bswap.ll | 48 ++----------------- 5 files changed, 24 insertions(+), 88 deletions(-) diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index e2c644a56c95b..6bee00d1ce382 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -269,6 +269,8 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, {ISD::SETNE, ISD::SETGE, ISD::SETGT, ISD::SETUGE, ISD::SETUGT}, VT, Expand); } + for (MVT VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64}) + setOperationAction(ISD::BSWAP, VT, Legal); for (MVT VT : {MVT::v4i32, MVT::v2i64}) { setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, VT, Legal); setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, VT, Legal); @@ -317,6 +319,8 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, {ISD::SETNE, ISD::SETGE, ISD::SETGT, ISD::SETUGE, ISD::SETUGT}, VT, Expand); } + for (MVT VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) + setOperationAction(ISD::BSWAP, VT, Legal); for (MVT VT : {MVT::v8i32, MVT::v4i32, MVT::v4i64}) { setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, VT, Legal); 
setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, VT, Legal); diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td index d13cc9af135b5..3e39e2c10a617 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td @@ -1444,6 +1444,12 @@ def : Pat<(xor (v8i32 LASX256:$xj), (v8i32 (vsplat_uimm_pow2 uimm5:$imm))), def : Pat<(xor (v4i64 LASX256:$xj), (v4i64 (vsplat_uimm_pow2 uimm6:$imm))), (XVBITREVI_D LASX256:$xj, uimm6:$imm)>; +// Vector bswaps +def : Pat<(bswap (v16i16 LASX256:$xj)), (XVSHUF4I_B LASX256:$xj, 0b10110001)>; +def : Pat<(bswap (v8i32 LASX256:$xj)), (XVSHUF4I_B LASX256:$xj, 0b00011011)>; +def : Pat<(bswap (v4i64 LASX256:$xj)), + (XVSHUF4I_W (XVSHUF4I_B LASX256:$xj, 0b00011011), 0b10110001)>; + // XVFADD_{S/D} defm : PatXrXrF; diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td index 86aa6dcfd8261..525d2802daa23 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td @@ -1600,6 +1600,12 @@ def : Pat<(xor (v4i32 LSX128:$vj), (v4i32 (vsplat_uimm_pow2 uimm5:$imm))), def : Pat<(xor (v2i64 LSX128:$vj), (v2i64 (vsplat_uimm_pow2 uimm6:$imm))), (VBITREVI_D LSX128:$vj, uimm6:$imm)>; +// Vector bswaps +def : Pat<(bswap (v8i16 LSX128:$vj)), (VSHUF4I_B LSX128:$vj, 0b10110001)>; +def : Pat<(bswap (v4i32 LSX128:$vj)), (VSHUF4I_B LSX128:$vj, 0b00011011)>; +def : Pat<(bswap (v2i64 LSX128:$vj)), + (VSHUF4I_W (VSHUF4I_B LSX128:$vj, 0b00011011), 0b10110001)>; + // VFADD_{S/D} defm : PatVrVrF; diff --git a/llvm/test/CodeGen/LoongArch/lasx/bswap.ll b/llvm/test/CodeGen/LoongArch/lasx/bswap.ll index 4f6d49c7a79db..1b0132d25ed59 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/bswap.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/bswap.ll @@ -5,9 +5,7 @@ define void @bswap_v16i16(ptr %src, ptr %dst) nounwind { ; CHECK-LABEL: 
bswap_v16i16: ; CHECK: # %bb.0: ; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: xvsrli.h $xr1, $xr0, 8 -; CHECK-NEXT: xvslli.h $xr0, $xr0, 8 -; CHECK-NEXT: xvor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvshuf4i.b $xr0, $xr0, 177 ; CHECK-NEXT: xvst $xr0, $a1, 0 ; CHECK-NEXT: ret %v = load <16 x i16>, ptr %src @@ -20,18 +18,7 @@ define void @bswap_v8i32(ptr %src, ptr %dst) nounwind { ; CHECK-LABEL: bswap_v8i32: ; CHECK: # %bb.0: ; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: lu12i.w $a0, 15 -; CHECK-NEXT: ori $a0, $a0, 3840 -; CHECK-NEXT: xvreplgr2vr.w $xr1, $a0 -; CHECK-NEXT: xvsrli.w $xr2, $xr0, 8 -; CHECK-NEXT: xvand.v $xr2, $xr2, $xr1 -; CHECK-NEXT: xvsrli.w $xr3, $xr0, 24 -; CHECK-NEXT: xvor.v $xr2, $xr2, $xr3 -; CHECK-NEXT: xvand.v $xr1, $xr0, $xr1 -; CHECK-NEXT: xvslli.w $xr1, $xr1, 8 -; CHECK-NEXT: xvslli.w $xr0, $xr0, 24 -; CHECK-NEXT: xvor.v $xr0, $xr0, $xr1 -; CHECK-NEXT: xvor.v $xr0, $xr0, $xr2 +; CHECK-NEXT: xvshuf4i.b $xr0, $xr0, 27 ; CHECK-NEXT: xvst $xr0, $a1, 0 ; CHECK-NEXT: ret %v = load <8 x i32>, ptr %src @@ -44,35 +31,8 @@ define void @bswap_v4i64(ptr %src, ptr %dst) nounwind { ; CHECK-LABEL: bswap_v4i64: ; CHECK: # %bb.0: ; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: lu12i.w $a0, 4080 -; CHECK-NEXT: xvreplgr2vr.d $xr1, $a0 -; CHECK-NEXT: xvsrli.d $xr2, $xr0, 24 -; CHECK-NEXT: xvand.v $xr2, $xr2, $xr1 -; CHECK-NEXT: lu12i.w $a0, -4096 -; CHECK-NEXT: lu32i.d $a0, 0 -; CHECK-NEXT: xvreplgr2vr.d $xr3, $a0 -; CHECK-NEXT: xvsrli.d $xr4, $xr0, 8 -; CHECK-NEXT: xvand.v $xr4, $xr4, $xr3 -; CHECK-NEXT: xvor.v $xr2, $xr4, $xr2 -; CHECK-NEXT: lu12i.w $a0, 15 -; CHECK-NEXT: ori $a0, $a0, 3840 -; CHECK-NEXT: xvreplgr2vr.d $xr4, $a0 -; CHECK-NEXT: xvsrli.d $xr5, $xr0, 40 -; CHECK-NEXT: xvand.v $xr5, $xr5, $xr4 -; CHECK-NEXT: xvsrli.d $xr6, $xr0, 56 -; CHECK-NEXT: xvor.v $xr5, $xr5, $xr6 -; CHECK-NEXT: xvor.v $xr2, $xr2, $xr5 -; CHECK-NEXT: xvand.v $xr1, $xr0, $xr1 -; CHECK-NEXT: xvslli.d $xr1, $xr1, 24 -; CHECK-NEXT: xvand.v $xr3, $xr0, $xr3 -; CHECK-NEXT: xvslli.d 
$xr3, $xr3, 8 -; CHECK-NEXT: xvor.v $xr1, $xr1, $xr3 -; CHECK-NEXT: xvand.v $xr3, $xr0, $xr4 -; CHECK-NEXT: xvslli.d $xr3, $xr3, 40 -; CHECK-NEXT: xvslli.d $xr0, $xr0, 56 -; CHECK-NEXT: xvor.v $xr0, $xr0, $xr3 -; CHECK-NEXT: xvor.v $xr0, $xr0, $xr1 -; CHECK-NEXT: xvor.v $xr0, $xr0, $xr2 +; CHECK-NEXT: xvshuf4i.b $xr0, $xr0, 27 +; CHECK-NEXT: xvshuf4i.w $xr0, $xr0, 177 ; CHECK-NEXT: xvst $xr0, $a1, 0 ; CHECK-NEXT: ret %v = load <4 x i64>, ptr %src diff --git a/llvm/test/CodeGen/LoongArch/lsx/bswap.ll b/llvm/test/CodeGen/LoongArch/lsx/bswap.ll index ce7af9d33f150..8172e21eae34d 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/bswap.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/bswap.ll @@ -5,9 +5,7 @@ define void @bswap_v8i16(ptr %src, ptr %dst) nounwind { ; CHECK-LABEL: bswap_v8i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vld $vr0, $a0, 0 -; CHECK-NEXT: vsrli.h $vr1, $vr0, 8 -; CHECK-NEXT: vslli.h $vr0, $vr0, 8 -; CHECK-NEXT: vor.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vshuf4i.b $vr0, $vr0, 177 ; CHECK-NEXT: vst $vr0, $a1, 0 ; CHECK-NEXT: ret %v = load <8 x i16>, ptr %src @@ -20,18 +18,7 @@ define void @bswap_v4i32(ptr %src, ptr %dst) nounwind { ; CHECK-LABEL: bswap_v4i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vld $vr0, $a0, 0 -; CHECK-NEXT: lu12i.w $a0, 15 -; CHECK-NEXT: ori $a0, $a0, 3840 -; CHECK-NEXT: vreplgr2vr.w $vr1, $a0 -; CHECK-NEXT: vsrli.w $vr2, $vr0, 8 -; CHECK-NEXT: vand.v $vr2, $vr2, $vr1 -; CHECK-NEXT: vsrli.w $vr3, $vr0, 24 -; CHECK-NEXT: vor.v $vr2, $vr2, $vr3 -; CHECK-NEXT: vand.v $vr1, $vr0, $vr1 -; CHECK-NEXT: vslli.w $vr1, $vr1, 8 -; CHECK-NEXT: vslli.w $vr0, $vr0, 24 -; CHECK-NEXT: vor.v $vr0, $vr0, $vr1 -; CHECK-NEXT: vor.v $vr0, $vr0, $vr2 +; CHECK-NEXT: vshuf4i.b $vr0, $vr0, 27 ; CHECK-NEXT: vst $vr0, $a1, 0 ; CHECK-NEXT: ret %v = load <4 x i32>, ptr %src @@ -44,35 +31,8 @@ define void @bswap_v2i64(ptr %src, ptr %dst) nounwind { ; CHECK-LABEL: bswap_v2i64: ; CHECK: # %bb.0: ; CHECK-NEXT: vld $vr0, $a0, 0 -; CHECK-NEXT: lu12i.w $a0, 4080 -; CHECK-NEXT: vreplgr2vr.d $vr1, 
$a0 -; CHECK-NEXT: vsrli.d $vr2, $vr0, 24 -; CHECK-NEXT: vand.v $vr2, $vr2, $vr1 -; CHECK-NEXT: lu12i.w $a0, -4096 -; CHECK-NEXT: lu32i.d $a0, 0 -; CHECK-NEXT: vreplgr2vr.d $vr3, $a0 -; CHECK-NEXT: vsrli.d $vr4, $vr0, 8 -; CHECK-NEXT: vand.v $vr4, $vr4, $vr3 -; CHECK-NEXT: vor.v $vr2, $vr4, $vr2 -; CHECK-NEXT: lu12i.w $a0, 15 -; CHECK-NEXT: ori $a0, $a0, 3840 -; CHECK-NEXT: vreplgr2vr.d $vr4, $a0 -; CHECK-NEXT: vsrli.d $vr5, $vr0, 40 -; CHECK-NEXT: vand.v $vr5, $vr5, $vr4 -; CHECK-NEXT: vsrli.d $vr6, $vr0, 56 -; CHECK-NEXT: vor.v $vr5, $vr5, $vr6 -; CHECK-NEXT: vor.v $vr2, $vr2, $vr5 -; CHECK-NEXT: vand.v $vr1, $vr0, $vr1 -; CHECK-NEXT: vslli.d $vr1, $vr1, 24 -; CHECK-NEXT: vand.v $vr3, $vr0, $vr3 -; CHECK-NEXT: vslli.d $vr3, $vr3, 8 -; CHECK-NEXT: vor.v $vr1, $vr1, $vr3 -; CHECK-NEXT: vand.v $vr3, $vr0, $vr4 -; CHECK-NEXT: vslli.d $vr3, $vr3, 40 -; CHECK-NEXT: vslli.d $vr0, $vr0, 56 -; CHECK-NEXT: vor.v $vr0, $vr0, $vr3 -; CHECK-NEXT: vor.v $vr0, $vr0, $vr1 -; CHECK-NEXT: vor.v $vr0, $vr0, $vr2 +; CHECK-NEXT: vshuf4i.b $vr0, $vr0, 27 +; CHECK-NEXT: vshuf4i.w $vr0, $vr0, 177 ; CHECK-NEXT: vst $vr0, $a1, 0 ; CHECK-NEXT: ret %v = load <2 x i64>, ptr %src