diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp index 0e43345bad6f1..70b032ac83d72 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp @@ -85,6 +85,69 @@ static mlir::Value getMaskVecValue(CIRGenBuilderTy &builder, mlir::Location loc, return maskVec; } +// Builds the VecShuffleOp for pshuflw and pshufhw x86 builtins. +// +// The vector is split into lanes of 8 word elements (16 bits). The lower or +// upper half of each lane, controlled by `isLow`, is shuffled in the following +// way: The immediate is truncated to 8 bits, separated into 4 2-bit fields. The +// i-th field's value represents the resulting index of the i-th element in the +// half lane after shuffling. The other half of the lane remains unchanged. +static cir::VecShuffleOp emitPshufWord(CIRGenFunction &cgf, + CIRGenBuilderTy &builder, + const mlir::Value vec, + const mlir::Value immediate, + const CallExpr *expr, const bool isLow) { + uint32_t imm = cgf.getZExtIntValueFromConstOp(immediate); + + auto vecTy = cast(vec.getType()); + unsigned numElts = vecTy.getSize(); + + unsigned firstHalfStart = isLow ? 0 : 4; + unsigned secondHalfStart = 4 - firstHalfStart; + + // Splat the 8-bits of immediate 4 times to help the loop wrap around. + imm = (imm & 0xff) * 0x01010101; + + int64_t indices[32]; + for (unsigned l = 0; l != numElts; l += 8) { + for (unsigned i = firstHalfStart; i != firstHalfStart + 4; ++i) { + indices[l + i] = l + (imm & 3) + firstHalfStart; + imm >>= 2; + } + for (unsigned i = secondHalfStart; i != secondHalfStart + 4; ++i) + indices[l + i] = l + i; + } + + return builder.createVecShuffle(cgf.getLoc(expr->getExprLoc()), vec, + ArrayRef(indices, numElts)); +} + +// Builds the shuffle mask for pshufd and shufpd/shufps x86 builtins. +static llvm::SmallVector +computeFullLaneShuffleMask(CIRGenFunction &cgf, const mlir::Value vec, + uint32_t imm, const bool isShufP) { + auto vecTy = cast(vec.getType()); + unsigned numElts = vecTy.getSize(); + unsigned numLanes = cgf.cgm.getDataLayout().getTypeSizeInBits(vecTy) / 128; + unsigned numLaneElts = numElts / numLanes; + + // Splat the 8-bits of immediate 4 times to help the loop wrap around. + imm = (imm & 0xff) * 0x01010101; + + llvm::SmallVector indices(numElts); + for (unsigned l = 0; l != numElts; l += numLaneElts) { + for (unsigned i = 0; i != numLaneElts; ++i) { + uint32_t idx = imm % numLaneElts; + imm /= numLaneElts; + if (isShufP && i >= (numLaneElts / 2)) + idx += numElts; + indices[l + i] = l + idx; + } + } + + return indices; +} + mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, const CallExpr *expr) { if (builtinID == Builtin::BI__builtin_cpu_is) { @@ -187,9 +250,7 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, case X86::BI__builtin_ia32_vec_ext_v4di: { unsigned numElts = cast(ops[0].getType()).getSize(); - uint64_t index = - ops[1].getDefiningOp().getIntValue().getZExtValue(); - + uint64_t index = getZExtIntValueFromConstOp(ops[1]); index &= numElts - 1; cir::ConstantOp indexVal = @@ -547,12 +608,20 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, case X86::BI__builtin_ia32_pblendw256: case X86::BI__builtin_ia32_pblendd128: case X86::BI__builtin_ia32_pblendd256: + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented X86 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return {}; case X86::BI__builtin_ia32_pshuflw: case X86::BI__builtin_ia32_pshuflw256: - case X86::BI__builtin_ia32_pshuflw512: + case X86::BI__builtin_ia32_pshuflw512: { + return emitPshufWord(*this, builder, ops[0], ops[1], expr, true); + } case X86::BI__builtin_ia32_pshufhw: case X86::BI__builtin_ia32_pshufhw256: - case X86::BI__builtin_ia32_pshufhw512: + case X86::BI__builtin_ia32_pshufhw512: { + return emitPshufWord(*this, builder, ops[0], ops[1], expr, false); + } case X86::BI__builtin_ia32_pshufd: case X86::BI__builtin_ia32_pshufd256: case X86::BI__builtin_ia32_pshufd512: @@ -561,13 +630,26 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, case X86::BI__builtin_ia32_vpermilpd256: case X86::BI__builtin_ia32_vpermilps256: case X86::BI__builtin_ia32_vpermilpd512: - case X86::BI__builtin_ia32_vpermilps512: + case X86::BI__builtin_ia32_vpermilps512: { + const uint32_t imm = getSExtIntValueFromConstOp(ops[1]); + const llvm::SmallVector mask = + computeFullLaneShuffleMask(*this, ops[0], imm, false); + + return builder.createVecShuffle(getLoc(expr->getExprLoc()), ops[0], mask); + } case X86::BI__builtin_ia32_shufpd: case X86::BI__builtin_ia32_shufpd256: case X86::BI__builtin_ia32_shufpd512: case X86::BI__builtin_ia32_shufps: case X86::BI__builtin_ia32_shufps256: - case X86::BI__builtin_ia32_shufps512: + case X86::BI__builtin_ia32_shufps512: { + const uint32_t imm = getZExtIntValueFromConstOp(ops[2]); + const llvm::SmallVector mask = + computeFullLaneShuffleMask(*this, ops[0], imm, true); + + return builder.createVecShuffle(getLoc(expr->getExprLoc()), ops[0], ops[1], + mask); + } case X86::BI__builtin_ia32_permdi256: case X86::BI__builtin_ia32_permdf256: case X86::BI__builtin_ia32_permdi512: diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h index b6926bb88ac85..ada16ff187430 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.h +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h @@ -203,6 +203,22 @@ class CIRGenFunction : public CIRGenTypeCache { return convertType(getContext().getTypeDeclType(t)); } + /// Get integer from a mlir::Value that is an int constant or a constant op. + static int64_t getSExtIntValueFromConstOp(mlir::Value val) { + auto constOp = val.getDefiningOp(); + assert(constOp && "getIntValueFromConstOp call with non ConstantOp"); + return constOp.getIntValue().getSExtValue(); + } + + /// Get zero-extended integer from a mlir::Value that is an int constant or a + /// constant op. + static int64_t getZExtIntValueFromConstOp(mlir::Value val) { + auto constOp = val.getDefiningOp(); + assert(constOp && + "getZeroExtendedIntValueFromConstOp call with non ConstantOp"); + return constOp.getIntValue().getZExtValue(); + } + /// Return the cir::TypeEvaluationKind of QualType \c type. static cir::TypeEvaluationKind getEvaluationKind(clang::QualType type); @@ -1816,7 +1832,7 @@ class CIRGenFunction : public CIRGenTypeCache { mlir::LogicalResult emitWhileStmt(const clang::WhileStmt &s); - mlir::Value emitX86BuiltinExpr(unsigned builtinID, const CallExpr *e); + mlir::Value emitX86BuiltinExpr(unsigned builtinID, const CallExpr *expr); /// Given an assignment `*lhs = rhs`, emit a test that checks if \p rhs is /// nonnull, if 1\p LHS is marked _Nonnull. diff --git a/clang/test/CIR/CodeGenBuiltins/X86/avx-builtins.c b/clang/test/CIR/CodeGenBuiltins/X86/avx-builtins.c index 82fa4358dc400..1a589b99e20f5 100644 --- a/clang/test/CIR/CodeGenBuiltins/X86/avx-builtins.c +++ b/clang/test/CIR/CodeGenBuiltins/X86/avx-builtins.c @@ -73,4 +73,28 @@ __m256i test_mm256_undefined_si256(void) { // OGCG-LABEL: test_mm256_undefined_si256 // OGCG: ret <4 x i64> zeroinitializer return _mm256_undefined_si256(); -} \ No newline at end of file +} + +__m256d test_mm256_shuffle_pd(__m256d A, __m256d B) { + // CIR-LABEL: test_mm256_shuffle_pd + // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !cir.double>) [#cir.int<0> : !s32i, #cir.int<4> : !s32i, #cir.int<2> : !s32i, #cir.int<6> : !s32i] : !cir.vector<4 x !cir.double> + + // CHECK-LABEL: test_mm256_shuffle_pd + // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> + + // OGCG-LABEL: test_mm256_shuffle_pd + // OGCG: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> + return _mm256_shuffle_pd(A, B, 0); +} + +__m256 test_mm256_shuffle_ps(__m256 A, __m256 B) { + // CIR-LABEL: test_mm256_shuffle_ps + // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !cir.float>) [#cir.int<0> : !s32i, #cir.int<0> : !s32i, #cir.int<8> : !s32i, #cir.int<8> : !s32i, #cir.int<4> : !s32i, #cir.int<4> : !s32i, #cir.int<12> : !s32i, #cir.int<12> : !s32i] : !cir.vector<8 x !cir.float> + + // CHECK-LABEL: test_mm256_shuffle_ps + // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> + + // OGCG-LABEL: test_mm256_shuffle_ps + // OGCG: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> + return _mm256_shuffle_ps(A, B, 0); +} diff --git a/clang/test/CIR/CodeGenBuiltins/X86/avx2-builtins.c b/clang/test/CIR/CodeGenBuiltins/X86/avx2-builtins.c new file mode 100644 index 0000000000000..b7497c2053b2d --- /dev/null +++ b/clang/test/CIR/CodeGenBuiltins/X86/avx2-builtins.c @@ -0,0 +1,53 @@ +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx2 -fclangir -emit-cir -o %t.cir -Wall -Werror +// RUN: FileCheck --check-prefixes=CIR --input-file=%t.cir %s +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx2 -fno-signed-char -fclangir -emit-cir -o %t.cir -Wall -Werror +// RUN: FileCheck --check-prefixes=CIR --input-file=%t.cir %s + +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx2 -fclangir -emit-llvm -o %t.ll -Wall -Werror +// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx2 -fno-signed-char -fclangir -emit-llvm -o %t.ll -Wall -Werror +// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s + +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx2 -fclangir -emit-cir -o %t.cir -Wall -Werror +// RUN: FileCheck --check-prefixes=CIR --input-file=%t.cir %s +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx2 -fno-signed-char -fclangir -emit-cir -o %t.cir -Wall -Werror +// RUN: FileCheck --check-prefixes=CIR --input-file=%t.cir %s + +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx2 -fclangir -emit-llvm -o %t.ll -Wall -Werror +// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx2 -fno-signed-char -fclangir -emit-llvm -o %t.ll -Wall -Werror +// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s + +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx2 -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=OGCG +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx2 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=OGCG +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx2 -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=OGCG +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx2 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=OGCG + +// This test mimics clang/test/CodeGen/X86/avx2-builtins.c, which eventually +// CIR shall be able to support fully. + +#include + +__m256i test_mm256_shufflelo_epi16(__m256i a) { + // CIR-LABEL: _mm256_shufflelo_epi16 + // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<16 x !s16i>) [#cir.int<3> : !s32i, #cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<1> : !s32i, #cir.int<4> : !s32i, #cir.int<5> : !s32i, #cir.int<6> : !s32i, #cir.int<7> : !s32i, #cir.int<11> : !s32i, #cir.int<8> : !s32i, #cir.int<9> : !s32i, #cir.int<9> : !s32i, #cir.int<12> : !s32i, #cir.int<13> : !s32i, #cir.int<14> : !s32i, #cir.int<15> : !s32i] : !cir.vector<16 x !s16i> + + // LLVM-LABEL: test_mm256_shufflelo_epi16 + // LLVM: shufflevector <16 x i16> %{{.*}}, <16 x i16> poison, <16 x i32> + + // OGCG-LABEL: test_mm256_shufflelo_epi16 + // OGCG: shufflevector <16 x i16> %{{.*}}, <16 x i16> poison, <16 x i32> + return _mm256_shufflelo_epi16(a, 83); +} + +__m256i test_mm256_shufflehi_epi16(__m256i a) { + // CIR-LABEL: _mm256_shufflehi_epi16 + // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<16 x !s16i>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i, #cir.int<7> : !s32i, #cir.int<6> : !s32i, #cir.int<6> : !s32i, #cir.int<5> : !s32i, #cir.int<8> : !s32i, #cir.int<9> : !s32i, #cir.int<10> : !s32i, #cir.int<11> : !s32i, #cir.int<15> : !s32i, #cir.int<14> : !s32i, #cir.int<14> : !s32i, #cir.int<13> : !s32i] : !cir.vector<16 x !s16i> + + // LLVM-LABEL: test_mm256_shufflehi_epi16 + // LLVM: shufflevector <16 x i16> %{{.*}}, <16 x i16> poison, <16 x i32> + + // OGCG-LABEL: test_mm256_shufflehi_epi16 + // OGCG: shufflevector <16 x i16> %{{.*}}, <16 x i16> poison, <16 x i32> + return _mm256_shufflehi_epi16(a, 107); +} diff --git a/clang/test/CIR/CodeGenBuiltins/X86/avx512bw-builtins.c b/clang/test/CIR/CodeGenBuiltins/X86/avx512bw-builtins.c index 3522e2c7e50bf..452721271de8c 100644 --- a/clang/test/CIR/CodeGenBuiltins/X86/avx512bw-builtins.c +++ b/clang/test/CIR/CodeGenBuiltins/X86/avx512bw-builtins.c @@ -1,15 +1,32 @@ -// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512bw -fclangir -emit-cir -o %t.cir -Wall -Werror +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512bw -fclangir -emit-cir -o %t.cir -Wall -Werror -Wsign-conversion +// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512bw -fno-signed-char -fclangir -emit-cir -o %t.cir -Wall -Werror -Wsign-conversion // RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s -// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512bw -fclangir -emit-llvm -o %t.ll -Wall -Werror -// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s -// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512bw -fno-signed-char -fclangir -emit-cir -o %t.cir -Wall -Werror +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512bw -fclangir -emit-cir -o %t.cir -Wall -Werror -Wsign-conversion +// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512bw -fno-signed-char -fclangir -emit-cir -o %t.cir -Wall -Werror -Wsign-conversion // RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s -// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512bw -fno-signed-char -fclangir -emit-llvm -o %t.ll -Wall -Werror -// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s -// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512bw -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=OGCG -// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512bw -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=OGCG +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512bw -fclangir -emit-llvm -o %t.ll -Wall -Werror -Wsign-conversion +// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512bw -fno-signed-char -fclangir -emit-llvm -o %t.ll -Wall -Werror -Wsign-conversion +// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s + +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512bw -fclangir -emit-llvm -o %t.ll -Wall -Werror -Wsign-conversion +// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512bw -fno-signed-char -fclangir -emit-llvm -o %t.ll -Wall -Werror -Wsign-conversion +// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s + +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512bw -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s --check-prefix=OGCG +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512bw -fno-signed-char -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s --check-prefix=OGCG +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512bw -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s --check-prefixes=OGCG +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512bw -fno-signed-char -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s --check-prefixes=OGCG + +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512bw -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s --check-prefix=OGCG +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512bw -fno-signed-char -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s --check-prefix=OGCG +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512bw -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s --check-prefixes=OGCG +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512bw -fno-signed-char -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s --check-prefixes=OGCG // This test mimics clang/test/CodeGen/X86/avx512bw-builtins.c, which eventually // CIR shall be able to support fully. @@ -115,3 +132,27 @@ __mmask32 test_kshiftri_mask32_out_of_range(__mmask32 A) { return _kshiftri_mask32(A, 33); } + +__m512i test_mm512_shufflelo_epi16(__m512i __A) { + // CIR-LABEL: _mm512_shufflelo_epi16 + // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<32 x !s16i>) [#cir.int<1> : !s32i, #cir.int<1> : !s32i, #cir.int<0> : !s32i, #cir.int<0> : !s32i, #cir.int<4> : !s32i, #cir.int<5> : !s32i, #cir.int<6> : !s32i, #cir.int<7> : !s32i, #cir.int<9> : !s32i, #cir.int<9> : !s32i, #cir.int<8> : !s32i, #cir.int<8> : !s32i, #cir.int<12> : !s32i, #cir.int<13> : !s32i, #cir.int<14> : !s32i, #cir.int<15> : !s32i, #cir.int<17> : !s32i, #cir.int<17> : !s32i, #cir.int<16> : !s32i, #cir.int<16> : !s32i, #cir.int<20> : !s32i, #cir.int<21> : !s32i, #cir.int<22> : !s32i, #cir.int<23> : !s32i, #cir.int<25> : !s32i, #cir.int<25> : !s32i, #cir.int<24> : !s32i, #cir.int<24> : !s32i, #cir.int<28> : !s32i, #cir.int<29> : !s32i, #cir.int<30> : !s32i, #cir.int<31> : !s32i] : !cir.vector<32 x !s16i> + + // LLVM-LABEL: test_mm512_shufflelo_epi16 + // LLVM: shufflevector <32 x i16> %{{.*}}, <32 x i16> poison, <32 x i32> + + // OGCG-LABEL: test_mm512_shufflelo_epi16 + // OGCG: shufflevector <32 x i16> %{{.*}}, <32 x i16> poison, <32 x i32> + return _mm512_shufflelo_epi16(__A, 5); +} + +__m512i test_mm512_shufflehi_epi16(__m512i __A) { + // CIR-LABEL: _mm512_shufflehi_epi16 + // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<32 x !s16i>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i, #cir.int<5> : !s32i, #cir.int<5> : !s32i, #cir.int<4> : !s32i, #cir.int<4> : !s32i, #cir.int<8> : !s32i, #cir.int<9> : !s32i, #cir.int<10> : !s32i, #cir.int<11> : !s32i, #cir.int<13> : !s32i, #cir.int<13> : !s32i, #cir.int<12> : !s32i, #cir.int<12> : !s32i, #cir.int<16> : !s32i, #cir.int<17> : !s32i, #cir.int<18> : !s32i, #cir.int<19> : !s32i, #cir.int<21> : !s32i, #cir.int<21> : !s32i, #cir.int<20> : !s32i, #cir.int<20> : !s32i, #cir.int<24> : !s32i, #cir.int<25> : !s32i, #cir.int<26> : !s32i, #cir.int<27> : !s32i, #cir.int<29> : !s32i, #cir.int<29> : !s32i, #cir.int<28> : !s32i, #cir.int<28> : !s32i] : !cir.vector<32 x !s16i> + + // LLVM-LABEL: test_mm512_shufflehi_epi16 + // LLVM: shufflevector <32 x i16> %{{.*}}, <32 x i16> poison, <32 x i32> + + // OGCG-LABEL: test_mm512_shufflehi_epi16 + // OGCG: shufflevector <32 x i16> %{{.*}}, <32 x i16> poison, <32 x i32> + return _mm512_shufflehi_epi16(__A, 5); +} diff --git a/clang/test/CIR/CodeGenBuiltins/X86/avx512f-builtins.c b/clang/test/CIR/CodeGenBuiltins/X86/avx512f-builtins.c index dc54a87856a7c..bac01671155f5 100644 --- a/clang/test/CIR/CodeGenBuiltins/X86/avx512f-builtins.c +++ b/clang/test/CIR/CodeGenBuiltins/X86/avx512f-builtins.c @@ -77,3 +77,27 @@ __m512i test_mm512_undefined_epi32(void) { // OGCG: ret <8 x i64> zeroinitializer return _mm512_undefined_epi32(); } + +__m512d test_mm512_shuffle_pd(__m512d __M, __m512d __V) { + // CIR-LABEL: test_mm512_shuffle_pd + // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !cir.double>) [#cir.int<0> : !s32i, #cir.int<8> : !s32i, #cir.int<3> : !s32i, #cir.int<10> : !s32i, #cir.int<4> : !s32i, #cir.int<12> : !s32i, #cir.int<6> : !s32i, #cir.int<14> : !s32i] : !cir.vector<8 x !cir.double> + + // LLVM-LABEL: test_mm512_shuffle_pd + // LLVM: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> + + // OGCG-LABEL: test_mm512_shuffle_pd + // OGCG: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> + return _mm512_shuffle_pd(__M, __V, 4); +} + +__m512 test_mm512_shuffle_ps(__m512 __M, __m512 __V) { + // CIR-LABEL: test_mm512_shuffle_ps + // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<16 x !cir.float>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<16> : !s32i, #cir.int<16> : !s32i, #cir.int<4> : !s32i, #cir.int<5> : !s32i, #cir.int<20> : !s32i, #cir.int<20> : !s32i, #cir.int<8> : !s32i, #cir.int<9> : !s32i, #cir.int<24> : !s32i, #cir.int<24> : !s32i, #cir.int<12> : !s32i, #cir.int<13> : !s32i, #cir.int<28> : !s32i, #cir.int<28> : !s32i] : !cir.vector<16 x !cir.float> + + // LLVM-LABEL: test_mm512_shuffle_ps + // LLVM: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32> + + // OGCG-LABEL: test_mm512_shuffle_ps + // OGCG: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32> + return _mm512_shuffle_ps(__M, __V, 4); +} diff --git a/clang/test/CIR/CodeGenBuiltins/X86/builtin-x86-pshufd.cpp b/clang/test/CIR/CodeGenBuiltins/X86/builtin-x86-pshufd.cpp new file mode 100644 index 0000000000000..29b71f7877575 --- /dev/null +++ b/clang/test/CIR/CodeGenBuiltins/X86/builtin-x86-pshufd.cpp @@ -0,0 +1,113 @@ +// RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir +// RUN: FileCheck --input-file=%t.cir %s + +// Test that __builtin_ia32_pshufd and __builtin_ia32_vpermilp generates correct CIR vec.shuffle operations +// This verifies the fix for SIMD intrinsic support that was previously NYI + +typedef int __v4si __attribute__((__vector_size__(16))); +typedef float __v4sf __attribute__((__vector_size__(16))); +typedef double __v2df __attribute__((__vector_size__(16))); +typedef float __v8sf __attribute__((__vector_size__(32))); +typedef double __v4df __attribute__((__vector_size__(32))); +typedef float __v16sf __attribute__((__vector_size__(64))); +typedef double __v8df __attribute__((__vector_size__(64))); + +typedef __v4si __m128i; +typedef __v4sf __m128; +typedef __v2df __m128d; +typedef __v8sf __m256; +typedef __v4df __m256d; +typedef __v16sf __m512; +typedef __v8df __m512d; + +// CHECK-LABEL: @_Z11test_pshufdv +void test_pshufd() { + __m128i vec = {1, 2, 3, 4}; + // CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !s32i>) [#cir.int<2> : !s32i, #cir.int<3> : !s32i, #cir.int<0> : !s32i, #cir.int<1> : !s32i] : !cir.vector<4 x !s32i> + __m128i result = __builtin_ia32_pshufd(vec, 0x4E); +} + +// CHECK-LABEL: @_Z19test_different_maskv +void test_different_mask() { + __m128i vec = {10, 20, 30, 40}; + // Test different immediate value: 0x1B = 00011011 = [3,2,1,0] reversed + // CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !s32i>) [#cir.int<3> : !s32i, #cir.int<2> : !s32i, #cir.int<1> : !s32i, #cir.int<0> : !s32i] : !cir.vector<4 x !s32i> + __m128i result = __builtin_ia32_pshufd(vec, 0x1B); +} + +// CHECK-LABEL: @_Z9test_casev +void test_case() { + __m128i p0 = {1, 2, 3, 4}; + + // This reproduces the exact pattern from stb_image.h:2685 that was failing: + // _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p0, 0x4e)); + // Which expands to: __builtin_ia32_pshufd(p0, 0x4e) + // CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !s32i>) [#cir.int<2> : !s32i, #cir.int<3> : !s32i, #cir.int<0> : !s32i, #cir.int<1> : !s32i] : !cir.vector<4 x !s32i> + __m128i out_vec = __builtin_ia32_pshufd(p0, 0x4e); +} + +// CHECK-LABEL: @_Z15test_vpermilps4v +void test_vpermilps4() { + __m128 vec = {1.0f, 2.0f, 3.0f, 4.0f}; + // vpermilps with immediate 0x4E = 01001110 = [1,3,2,0] for 4 elements + // CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !cir.float>) [#cir.int<2> : !s32i, #cir.int<3> : !s32i, #cir.int<0> : !s32i, #cir.int<1> : !s32i] : !cir.vector<4 x !cir.float> + __m128 result = __builtin_ia32_vpermilps(vec, 0x4E); +} + +// CHECK-LABEL: @_Z15test_vpermilpd2v +void test_vpermilpd2() { + __m128d vec = {1.0, 2.0}; + // vpermilpd with immediate 0x1 = 01 = [1,0] for 2 elements + // CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x !cir.double>) [#cir.int<1> : !s32i, #cir.int<0> : !s32i] : !cir.vector<2 x !cir.double> + __m128d result = __builtin_ia32_vpermilpd(vec, 0x1); +} + +// CHECK-LABEL: @_Z17test_vpermilps256v +void test_vpermilps256() { + __m256 vec = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}; + // vpermilps256 with immediate 0x1B = 00011011 = [3,2,1,0] for each 128-bit lane + // CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !cir.float>) [#cir.int<3> : !s32i, #cir.int<2> : !s32i, #cir.int<1> : !s32i, #cir.int<0> : !s32i, #cir.int<7> : !s32i, #cir.int<6> : !s32i, #cir.int<5> : !s32i, #cir.int<4> : !s32i] : !cir.vector<8 x !cir.float> + __m256 result = __builtin_ia32_vpermilps256(vec, 0x1B); +} + +// CHECK-LABEL: @_Z17test_vpermilpd256v +void test_vpermilpd256() { + __m256d vec = {1.0, 2.0, 3.0, 4.0}; + // vpermilpd256 with immediate 0x5 = 0101 = [1,0,1,0] for 4 elements + // CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !cir.double>) [#cir.int<1> : !s32i, #cir.int<0> : !s32i, #cir.int<3> : !s32i, #cir.int<2> : !s32i] : !cir.vector<4 x !cir.double> + __m256d result = __builtin_ia32_vpermilpd256(vec, 0x5); +} + +// CHECK-LABEL: @_Z17test_vpermilps512v +void test_vpermilps512() { + __m512 vec = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, + 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f}; + // vpermilps512 with immediate 0x4E = 01001110 = [1,3,2,0] for each 128-bit lane + // CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<16 x !cir.float>) [#cir.int<2> : !s32i, #cir.int<3> : !s32i, #cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<6> : !s32i, #cir.int<7> : !s32i, #cir.int<4> : !s32i, #cir.int<5> : !s32i, #cir.int<10> : !s32i, #cir.int<11> : !s32i, #cir.int<8> : !s32i, #cir.int<9> : !s32i, #cir.int<14> : !s32i, #cir.int<15> : !s32i, #cir.int<12> : !s32i, #cir.int<13> : !s32i] : !cir.vector<16 x !cir.float> + __m512 result = __builtin_ia32_vpermilps512(vec, 0x4E); +} + +// CHECK-LABEL: @_Z17test_vpermilpd512v +void test_vpermilpd512() { + __m512d vec = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}; + // vpermilpd512 with immediate 0x55 = 01010101 = [1,0,1,0,1,0,1,0] for 8 elements + // CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !cir.double>) [#cir.int<1> : !s32i, #cir.int<0> : !s32i, #cir.int<3> : !s32i, #cir.int<2> : !s32i, #cir.int<5> : !s32i, #cir.int<4> : !s32i, #cir.int<7> : !s32i, #cir.int<6> : !s32i] : !cir.vector<8 x !cir.double> + __m512d result = __builtin_ia32_vpermilpd512(vec, 0x55); +} + +// Test different immediate values +// CHECK-LABEL: @_Z24test_vpermilps_differentv +void test_vpermilps_different() { + __m128 vec = {10.0f, 20.0f, 30.0f, 40.0f}; + // Test different immediate value: 0x1B = 00011011 = [3,2,1,0] reversed + // CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !cir.float>) [#cir.int<3> : !s32i, #cir.int<2> : !s32i, #cir.int<1> : !s32i, #cir.int<0> : !s32i] : !cir.vector<4 x !cir.float> + __m128 result = __builtin_ia32_vpermilps(vec, 0x1B); +} + +// CHECK-LABEL: @_Z24test_vpermilpd_differentv +void test_vpermilpd_different() { + __m128d vec = {100.0, 200.0}; + // Test immediate 0x0 = 00 = [0,0] - duplicate first element + // CHECK: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x !cir.double>) [#cir.int<0> : !s32i, #cir.int<0> : !s32i] : !cir.vector<2 x !cir.double> + __m128d result = __builtin_ia32_vpermilpd(vec, 0x0); +} diff --git a/clang/test/CIR/CodeGenBuiltins/X86/sse-builtins.c b/clang/test/CIR/CodeGenBuiltins/X86/sse-builtins.c index c893859b297cc..a2a5b1849d727 100644 --- a/clang/test/CIR/CodeGenBuiltins/X86/sse-builtins.c +++ b/clang/test/CIR/CodeGenBuiltins/X86/sse-builtins.c @@ -71,3 +71,15 @@ __m128 test_mm_undefined_ps(void) { // OGCG: ret <4 x float> zeroinitializer return _mm_undefined_ps(); } + +__m128 test_mm_shuffle_ps(__m128 A, __m128 B) { + // CIR-LABEL: _mm_shuffle_ps + // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !cir.float>) [#cir.int<0> : !s32i, #cir.int<0> : !s32i, #cir.int<4> : !s32i, #cir.int<4> : !s32i] : !cir.vector<4 x !cir.float> + + // CHECK-LABEL: test_mm_shuffle_ps + // CHECK: shufflevector <4 x float> {{.*}}, <4 x float> {{.*}}, <4 x i32> + + // OGCG-LABEL: test_mm_shuffle_ps + // OGCG: shufflevector <4 x float> {{.*}}, <4 x float> {{.*}}, <4 x i32> + return _mm_shuffle_ps(A, B, 0); +} diff --git a/clang/test/CIR/CodeGenBuiltins/X86/sse2-builtins.c b/clang/test/CIR/CodeGenBuiltins/X86/sse2-builtins.c index f5e07cdc28ccd..a205600c8c1b5 100644 --- a/clang/test/CIR/CodeGenBuiltins/X86/sse2-builtins.c +++ b/clang/test/CIR/CodeGenBuiltins/X86/sse2-builtins.c @@ -8,8 +8,11 @@ // RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +sse2 -fno-signed-char -fclangir -emit-llvm -o %t.ll -Wall -Werror // RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s -// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +sse -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=OGCG -// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +sse -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=OGCG +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse2 -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=OGCG +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse2 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=OGCG + +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse2 -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=OGCG +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse2 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=OGCG // This test mimics clang/test/CodeGen/X86/sse2-builtins.c, which eventually // CIR shall be able to support fully. @@ -108,3 +111,39 @@ void test_mm_pause(void) { // LLVM: call void @llvm.x86.sse2.pause() // OGCG: call void @llvm.x86.sse2.pause() } + +__m128i test_mm_shufflelo_epi16(__m128i A) { + // CIR-LABEL: _mm_shufflelo_epi16 + // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !s16i>) [#cir.int<0> : !s32i, #cir.int<0> : !s32i, #cir.int<0> : !s32i, #cir.int<0> : !s32i, #cir.int<4> : !s32i, #cir.int<5> : !s32i, #cir.int<6> : !s32i, #cir.int<7> : !s32i] : !cir.vector<8 x !s16i> + + // LLVM-LABEL: test_mm_shufflelo_epi16 + // LLVM: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> + + // OGCG-LABEL: test_mm_shufflelo_epi16 + // OGCG: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> + return _mm_shufflelo_epi16(A, 0); +} + +__m128i test_mm_shufflehi_epi16(__m128i A) { + // CIR-LABEL: _mm_shufflehi_epi16 + // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !s16i>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i, #cir.int<4> : !s32i, #cir.int<4> : !s32i, #cir.int<4> : !s32i, #cir.int<4> : !s32i] : !cir.vector<8 x !s16i> + + // LLVM-LABEL: test_mm_shufflehi_epi16 + // LLVM: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> + + // OGCG-LABEL: test_mm_shufflehi_epi16 + // OGCG: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> + return _mm_shufflehi_epi16(A, 0); +} + +__m128d test_mm_shuffle_pd(__m128d A, __m128d B) { + // CIR-LABEL: test_mm_shuffle_pd + // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x !cir.double>) [#cir.int<1> : !s32i, #cir.int<2> : !s32i] : !cir.vector<2 x !cir.double> + + // CHECK-LABEL: test_mm_shuffle_pd + // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> + + // OGCG-LABEL: test_mm_shuffle_pd + // OGCG: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> + return _mm_shuffle_pd(A, B, 1); +}