From 32bbbb9b28718f839e6005fe591a8727838f3684 Mon Sep 17 00:00:00 2001
From: Kazu Hirata
Date: Sun, 12 Jan 2025 14:42:34 -0800
Subject: [PATCH 001/102] [CodeGen] Migrate away from PointerUnion::dyn_cast
 (NFC)

Note that PointerUnion::dyn_cast has been soft deprecated in
PointerUnion.h:

  // FIXME: Replace the uses of is(), get() and dyn_cast() with
  //        isa<T>, cast<T> and the llvm::dyn_cast<T>

Literal migration would result in dyn_cast_if_present (see the
definition of PointerUnion::dyn_cast), but this patch uses dyn_cast
because we expect Prototype.P to be nonnull.
---
 clang/lib/CodeGen/CGCall.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index 0fde4d8ee296b..a71af0141709f 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -4507,7 +4507,7 @@ void CodeGenFunction::EmitCallArgs(
   // First, if a prototype was provided, use those argument types.
   bool IsVariadic = false;
   if (Prototype.P) {
-    const auto *MD = Prototype.P.dyn_cast<const ObjCMethodDecl *>();
+    const auto *MD = dyn_cast_if_present<const ObjCMethodDecl *>(Prototype.P);
     if (MD) {
       IsVariadic = MD->isVariadic();
       ExplicitCC = getCallingConventionForDecl(

From 8a2dad6e6bbd596164a7edf7c5e15e070fcfa375 Mon Sep 17 00:00:00 2001
From: Justin Bogner
Date: Sun, 12 Jan 2025 18:52:20 -0700
Subject: [PATCH 002/102] [DirectX] Implement the resource.store.rawbuffer
 intrinsic (#121282)

This introduces `@llvm.dx.resource.store.rawbuffer` and generalizes the
buffer store docs under DirectX/DXILResources.

Fixes #106188
---
 llvm/docs/DirectX/DXILResources.rst           | 114 ++++++++++++--
 llvm/include/llvm/IR/IntrinsicsDirectX.td     |   4 +
 llvm/lib/Target/DirectX/DXIL.td               |  20 +++
 llvm/lib/Target/DirectX/DXILOpLowering.cpp    |  82 ++++++----
 llvm/test/CodeGen/DirectX/BufferStore-sm61.ll | 126 +++++++++++++++
 .../CodeGen/DirectX/RawBufferStore-error64.ll |  20 +++
 llvm/test/CodeGen/DirectX/RawBufferStore.ll   | 144 ++++++++++++++++++
 7 files changed, 469 insertions(+), 41 deletions(-)
 create mode 100644 llvm/test/CodeGen/DirectX/BufferStore-sm61.ll
 create mode 100644 llvm/test/CodeGen/DirectX/RawBufferStore-error64.ll
 create mode 100644 llvm/test/CodeGen/DirectX/RawBufferStore.ll

diff --git a/llvm/docs/DirectX/DXILResources.rst b/llvm/docs/DirectX/DXILResources.rst
index 857d29e48363b..80e3c2c11153d 100644
--- a/llvm/docs/DirectX/DXILResources.rst
+++ b/llvm/docs/DirectX/DXILResources.rst
@@ -491,26 +491,28 @@ Examples:
          i32 %byte_offset, i32 0)
 
-Texture and Typed Buffer Stores
--------------------------------
+Stores
+------
 
-*relevant types: Textures and TypedBuffer*
+*relevant types: Textures and Buffer*
 
-The `TextureStore`_ and `BufferStore`_ DXIL operations always write all four
-32-bit components to a texture or a typed buffer. While both operations include
-a mask parameter, it is specified that the mask must cover all components when
-used with these types.
+The `TextureStore`_, `BufferStore`_, and `RawBufferStore`_ DXIL operations
+write four components to a texture or a buffer. These include a mask argument
+that is used when fewer than 4 components are written, but notably this only
+takes on the contiguous x, xy, xyz, and xyzw values.
 
-The store operations that we define as intrinsics behave similarly, and will
-only accept writes to the whole of the contained type. This differs from the
-loads above, but this makes sense to do from a semantics preserving point of
-view. 
Thus, texture and buffer stores may only operate on 4-element vectors of -types that are 32-bits or fewer, such as ``<4 x i32>``, ``<4 x float>``, and -``<4 x half>``, and 2 element vectors of 64-bit types like ``<2 x double>`` and -``<2 x i64>``. +We define the LLVM store intrinsics to accept vectors when storing multiple +components rather than using `undef` and a mask, but otherwise match the DXIL +ops fairly closely. -.. _BufferStore: https://github.com/microsoft/DirectXShaderCompiler/blob/main/docs/DXIL.rst#bufferstore .. _TextureStore: https://github.com/microsoft/DirectXShaderCompiler/blob/main/docs/DXIL.rst#texturestore +.. _BufferStore: https://github.com/microsoft/DirectXShaderCompiler/blob/main/docs/DXIL.rst#bufferstore +.. _RawBufferStore: https://github.com/microsoft/DirectXShaderCompiler/blob/main/docs/DXIL.rst#rawbufferstore + +For TypedBuffer, we only need one coordinate, and we must always write a vector +since partial writes aren't possible. Similarly to the load operations +described above, we handle 64-bit types specially and only handle 2-element +vectors rather than 4. Examples: @@ -548,3 +550,85 @@ Examples: target("dx.TypedBuffer", f16, 1, 0) %buf, i32 %index, <4 x f16> %data) call void @llvm.dx.resource.store.typedbuffer.tdx.Buffer_v2f64_1_0_0t( target("dx.TypedBuffer", f64, 1, 0) %buf, i32 %index, <2 x f64> %data) + +For RawBuffer, we need two indices and we accept scalars and vectors of 4 or +fewer elements. Note that we do allow vectors of 4 64-bit elements here. + +Examples: + +.. list-table:: ``@llvm.dx.resource.store.rawbuffer`` + :header-rows: 1 + + * - Argument + - + - Type + - Description + * - Return value + - + - ``void`` + - + * - ``%buffer`` + - 0 + - ``target(dx.RawBuffer, ...)`` + - The buffer to store into + * - ``%index`` + - 1 + - ``i32`` + - Index into the buffer + * - ``%offset`` + - 2 + - ``i32`` + - Byte offset into structured buffer elements + * - ``%data`` + - 3 + - Scalar or vector + - The data to store + +Examples: + +.. 
code-block:: llvm + + ; float + call void @llvm.dx.resource.store.rawbuffer.tdx.RawBuffer_f32_1_0_0t.f32( + target("dx.RawBuffer", float, 1, 0, 0) %buffer, + i32 %index, i32 0, float %data) + call void @llvm.dx.resource.store.rawbuffer.tdx.RawBuffer_i8_1_0_0t.f32( + target("dx.RawBuffer", i8, 1, 0, 0) %buffer, + i32 %index, i32 0, float %data) + + ; float4 + call void @llvm.dx.resource.store.rawbuffer.tdx.RawBuffer_v4f32_1_0_0t.v4f32( + target("dx.RawBuffer", <4 x float>, 1, 0, 0) %buffer, + i32 %index, i32 0, <4 x float> %data) + call void @llvm.dx.resource.store.rawbuffer.tdx.RawBuffer_i8_1_0_0t.v4f32( + target("dx.RawBuffer", i8, 1, 0, 0) %buffer, + i32 %index, i32 0, <4 x float> %data) + + ; struct S0 { float4 f; int4 i; } + call void @llvm.dx.resource.store.rawbuffer.v4f32( + target("dx.RawBuffer", { <4 x float>, <4 x i32> }, 1, 0, 0) %buffer, + i32 %index, i32 0, <4 x float> %data0) + call void @llvm.dx.resource.store.rawbuffer.v4i32( + target("dx.RawBuffer", { <4 x float>, <4 x i32> }, 1, 0, 0) %buffer, + i32 %index, i32 16, <4 x i32> %data1) + + ; struct Q { float4 f; int3 i; } + ; struct R { int z; S x; } + call void @llvm.dx.resource.store.rawbuffer.i32( + target("dx.RawBuffer", {i32, {<4 x float>, <3 x half>}}, 1, 0, 0) + %buffer, + i32 %index, i32 0, i32 %data0) + call void @llvm.dx.resource.store.rawbuffer.v4f32( + target("dx.RawBuffer", {i32, {<4 x float>, <3 x half>}}, 1, 0, 0) + %buffer, + i32 %index, i32 4, <4 x float> %data1) + call void @llvm.dx.resource.store.rawbuffer.v3f16( + target("dx.RawBuffer", {i32, {<4 x float>, <3 x half>}}, 1, 0, 0) + %buffer, + i32 %index, i32 20, <3 x half> %data2) + + ; byteaddressbuf.Store + call void @llvm.dx.resource.store.rawbuffer.tdx.RawBuffer_i8_1_0_0t.v4f64( + target("dx.RawBuffer", i8, 1, 0, 0) %buffer, + i32 %index, i32 0, <4 x double> %data) + diff --git a/llvm/include/llvm/IR/IntrinsicsDirectX.td b/llvm/include/llvm/IR/IntrinsicsDirectX.td index ef48af5b42dbf..2a56ba78ce88e 100644 --- a/llvm/include/llvm/IR/IntrinsicsDirectX.td +++ b/llvm/include/llvm/IR/IntrinsicsDirectX.td @@ -40,6 +40,10 @@ def int_dx_resource_load_rawbuffer : DefaultAttrsIntrinsic<[llvm_any_ty, llvm_i1_ty], [llvm_any_ty, llvm_i32_ty, llvm_i32_ty], [IntrReadMem]>; +def int_dx_resource_store_rawbuffer + : DefaultAttrsIntrinsic< + [], [llvm_any_ty, llvm_i32_ty, llvm_i32_ty, llvm_any_ty], + [IntrWriteMem]>; def int_dx_resource_updatecounter : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_any_ty, llvm_i8_ty], diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td index 62b5b704e99eb..6fdd83c4dc877 100644 --- a/llvm/lib/Target/DirectX/DXIL.td +++ b/llvm/lib/Target/DirectX/DXIL.td @@ -909,6 +909,26 @@ def RawBufferLoad : DXILOp<139, rawBufferLoad> { let stages = [Stages]; } +def RawBufferStore : DXILOp<140, rawBufferStore> { + let Doc = "writes to a RWByteAddressBuffer or RWStructuredBuffer"; + // Handle, Coord0, Coord1, Val0, Val1, Val2, Val3, Mask, Alignment + let arguments = [ + HandleTy, Int32Ty, Int32Ty, OverloadTy, OverloadTy, OverloadTy, OverloadTy, + Int8Ty, Int32Ty + ]; + let result = VoidTy; + let overloads = [ + Overloads, + Overloads + ]; + let stages = [Stages]; +} + def Dot4AddI8Packed : DXILOp<163, dot4AddPacked> { let Doc = "signed dot product of 4 x i8 vectors packed into i32, with " "accumulate to i32"; diff --git a/llvm/lib/Target/DirectX/DXILOpLowering.cpp b/llvm/lib/Target/DirectX/DXILOpLowering.cpp index f43815bf21166..0c245c1a43d31 100644 --- a/llvm/lib/Target/DirectX/DXILOpLowering.cpp +++ 
b/llvm/lib/Target/DirectX/DXILOpLowering.cpp
@@ -616,7 +616,10 @@ class OpLowerer {
     return false;
   }
 
-  [[nodiscard]] bool lowerTypedBufferStore(Function &F) {
+  [[nodiscard]] bool lowerBufferStore(Function &F, bool IsRaw) {
+    Triple TT(Triple(M.getTargetTriple()));
+    VersionTuple DXILVersion = TT.getDXILVersion();
+    const DataLayout &DL = F.getDataLayout();
     IRBuilder<> &IRB = OpBuilder.getIRB();
     Type *Int8Ty = IRB.getInt8Ty();
     Type *Int32Ty = IRB.getInt32Ty();
@@ -627,51 +630,75 @@ class OpLowerer {
       Value *Handle =
           createTmpHandleCast(CI->getArgOperand(0), OpBuilder.getHandleType());
       Value *Index0 = CI->getArgOperand(1);
-      Value *Index1 = UndefValue::get(Int32Ty);
-      // For typed stores, the mask must always cover all four elements.
-      Constant *Mask = ConstantInt::get(Int8Ty, 0xF);
+      Value *Index1 = IsRaw ? CI->getArgOperand(2) : UndefValue::get(Int32Ty);
+
+      Value *Data = CI->getArgOperand(IsRaw ? 3 : 2);
+      Type *DataTy = Data->getType();
+      Type *ScalarTy = DataTy->getScalarType();
 
-      Value *Data = CI->getArgOperand(2);
-      auto *DataTy = dyn_cast<FixedVectorType>(Data->getType());
-      if (!DataTy || DataTy->getNumElements() != 4)
+      uint64_t NumElements =
+          DL.getTypeSizeInBits(DataTy) / DL.getTypeSizeInBits(ScalarTy);
+      Value *Mask = ConstantInt::get(Int8Ty, ~(~0U << NumElements));
+
+      // TODO: check that we only have vector or scalar...
+      if (!IsRaw && NumElements != 4)
        return make_error<StringError>(
            "typedBufferStore data must be a vector of 4 elements",
            inconvertibleErrorCode());
+      else if (NumElements > 4)
+        return make_error<StringError>(
+            "rawBufferStore data must have at most 4 elements",
+            inconvertibleErrorCode());
 
-      // Since we're post-scalarizer, we likely have a vector that's constructed
-      // solely for the argument of the store. If so, just use the scalar values
-      // from before they're inserted into the temporary.
       std::array<Value *, 4> DataElements{nullptr, nullptr, nullptr, nullptr};
-      auto *IEI = dyn_cast<InsertElementInst>(Data);
-      while (IEI) {
-        auto *IndexOp = dyn_cast<ConstantInt>(IEI->getOperand(2));
-        if (!IndexOp)
-          break;
-        size_t IndexVal = IndexOp->getZExtValue();
-        assert(IndexVal < 4 && "Too many elements for buffer store");
-        DataElements[IndexVal] = IEI->getOperand(1);
-        IEI = dyn_cast<InsertElementInst>(IEI->getOperand(0));
+      if (DataTy == ScalarTy)
+        DataElements[0] = Data;
+      else {
+        // Since we're post-scalarizer, if we see a vector here it's likely
+        // constructed solely for the argument of the store. Just use the scalar
+        // values from before they're inserted into the temporary.
+        auto *IEI = dyn_cast<InsertElementInst>(Data);
+        while (IEI) {
+          auto *IndexOp = dyn_cast<ConstantInt>(IEI->getOperand(2));
+          if (!IndexOp)
+            break;
+          size_t IndexVal = IndexOp->getZExtValue();
+          assert(IndexVal < 4 && "Too many elements for buffer store");
+          DataElements[IndexVal] = IEI->getOperand(1);
+          IEI = dyn_cast<InsertElementInst>(IEI->getOperand(0));
+        }
      }
 
       // If for some reason we weren't able to forward the arguments from the
-      // scalarizer artifact, then we need to actually extract elements from the
-      // vector.
-      for (int I = 0, E = 4; I != E; ++I)
+      // scalarizer artifact, then we may need to actually extract elements from
+      // the vector.
+      for (int I = 0, E = NumElements; I < E; ++I)
        if (DataElements[I] == nullptr)
          DataElements[I] =
              IRB.CreateExtractElement(Data, ConstantInt::get(Int32Ty, I));
+      // For any elements beyond the length of the vector, fill up with undef.
+      for (int I = NumElements, E = 4; I < E; ++I)
+        if (DataElements[I] == nullptr)
+          DataElements[I] = UndefValue::get(ScalarTy);
 
-      std::array<Value *, 8> Args{
+      dxil::OpCode Op = OpCode::BufferStore;
+      SmallVector<Value *, 9> Args{
           Handle,          Index0,          Index1,          DataElements[0],
           DataElements[1], DataElements[2], DataElements[3], Mask};
+      if (IsRaw && DXILVersion >= VersionTuple(1, 2)) {
+        Op = OpCode::RawBufferStore;
+        // RawBufferStore requires the alignment
+        Args.push_back(
+            ConstantInt::get(Int32Ty, DL.getPrefTypeAlign(ScalarTy).value()));
+      }
       Expected<CallInst *> OpCall =
-          OpBuilder.tryCreateOp(OpCode::BufferStore, Args, CI->getName());
+          OpBuilder.tryCreateOp(Op, Args, CI->getName());
       if (Error E = OpCall.takeError())
         return E;
 
       CI->eraseFromParent();
       // Clean up any leftover `insertelement`s
-      IEI = dyn_cast<InsertElementInst>(Data);
+      auto *IEI = dyn_cast<InsertElementInst>(Data);
       while (IEI && IEI->use_empty()) {
         InsertElementInst *Tmp = IEI;
         IEI = dyn_cast<InsertElementInst>(IEI->getOperand(0));
@@ -776,11 +803,14 @@ class OpLowerer {
         HasErrors |= lowerTypedBufferLoad(F, /*HasCheckBit=*/true);
         break;
       case Intrinsic::dx_resource_store_typedbuffer:
-        HasErrors |= lowerTypedBufferStore(F);
+        HasErrors |= lowerBufferStore(F, /*IsRaw=*/false);
         break;
       case Intrinsic::dx_resource_load_rawbuffer:
         HasErrors |= lowerRawBufferLoad(F);
         break;
+      case Intrinsic::dx_resource_store_rawbuffer:
+        HasErrors |= lowerBufferStore(F, /*IsRaw=*/true);
+        break;
       case Intrinsic::dx_resource_updatecounter:
         HasErrors |= lowerUpdateCounter(F);
         break;
diff --git a/llvm/test/CodeGen/DirectX/BufferStore-sm61.ll b/llvm/test/CodeGen/DirectX/BufferStore-sm61.ll
new file mode 100644
index 0000000000000..1916cdf374455
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/BufferStore-sm61.ll
@@ -0,0 +1,126 @@
+; RUN: opt -S -dxil-op-lower %s | FileCheck %s
+; Before SM6.2 ByteAddressBuffer and StructuredBuffer lower to bufferStore.
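+; rawBufferStore is only available from DXIL 1.2 (Shader Model 6.2), so on
+; this sm6.1 target the stores lower to the older bufferStore op instead:
+; opcode 69 in the checks below, with a write mask but no alignment argument.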
+ +target triple = "dxil-pc-shadermodel6.1-compute" + +; CHECK-LABEL: define void @storef32_struct +define void @storef32_struct(i32 %index, float %data) { + %buffer = call target("dx.RawBuffer", float, 1, 0, 0) + @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_f32_0_0_0( + i32 0, i32 0, i32 1, i32 0, i1 false) + + ; CHECK: call void @dx.op.bufferStore.f32(i32 69, %dx.types.Handle %{{.*}}, i32 %index, i32 0, float %data, float undef, float undef, float undef, i8 1) + call void @llvm.dx.resource.store.rawbuffer.f32( + target("dx.RawBuffer", float, 1, 0, 0) %buffer, + i32 %index, i32 0, float %data) + + ret void +} + +; CHECK-LABEL: define void @storef32_byte +define void @storef32_byte(i32 %offset, float %data) { + %buffer = call target("dx.RawBuffer", i8, 1, 0, 0) + @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_i8_0_0_0( + i32 0, i32 0, i32 1, i32 0, i1 false) + + ; CHECK: call void @dx.op.bufferStore.f32(i32 69, %dx.types.Handle %{{.*}}, i32 %offset, i32 0, float %data, float undef, float undef, float undef, i8 1) + call void @llvm.dx.resource.store.rawbuffer.f32( + target("dx.RawBuffer", i8, 1, 0, 0) %buffer, + i32 %offset, i32 0, float %data) + + ret void +} + +; CHECK-LABEL: define void @storev4f32_struct +define void @storev4f32_struct(i32 %index, <4 x float> %data) { + %buffer = call target("dx.RawBuffer", <4 x float>, 1, 0, 0) + @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_v4f32_0_0_0( + i32 0, i32 0, i32 1, i32 0, i1 false) + + ; CHECK: [[DATA0:%.*]] = extractelement <4 x float> %data, i32 0 + ; CHECK: [[DATA1:%.*]] = extractelement <4 x float> %data, i32 1 + ; CHECK: [[DATA2:%.*]] = extractelement <4 x float> %data, i32 2 + ; CHECK: [[DATA3:%.*]] = extractelement <4 x float> %data, i32 3 + ; CHECK: call void @dx.op.bufferStore.f32(i32 69, %dx.types.Handle %{{.*}}, i32 %index, i32 0, float [[DATA0]], float [[DATA1]], float [[DATA2]], float [[DATA3]], i8 15) + call void @llvm.dx.resource.store.rawbuffer.v4f32( + target("dx.RawBuffer", <4 x float>, 1, 0, 0) %buffer, + i32 %index, i32 0, <4 x float> %data) + + ret void +} + +; CHECK-LABEL: define void @storev4f32_byte +define void @storev4f32_byte(i32 %offset, <4 x float> %data) { + %buffer = call target("dx.RawBuffer", i8, 1, 0, 0) + @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_i8_0_0_0( + i32 0, i32 0, i32 1, i32 0, i1 false) + + ; CHECK: [[DATA0:%.*]] = extractelement <4 x float> %data, i32 0 + ; CHECK: [[DATA1:%.*]] = extractelement <4 x float> %data, i32 1 + ; CHECK: [[DATA2:%.*]] = extractelement <4 x float> %data, i32 2 + ; CHECK: [[DATA3:%.*]] = extractelement <4 x float> %data, i32 3 + ; CHECK: call void @dx.op.bufferStore.f32(i32 69, %dx.types.Handle %{{.*}}, i32 %offset, i32 0, float [[DATA0]], float [[DATA1]], float [[DATA2]], float [[DATA3]], i8 15) + call void @llvm.dx.resource.store.rawbuffer.v4f32( + target("dx.RawBuffer", i8, 1, 0, 0) %buffer, + i32 %offset, i32 0, <4 x float> %data) + + ret void +} + +; CHECK-LABEL: define void @storeelements +define void @storeelements(i32 %index, <4 x float> %data0, <4 x i32> %data1) { + %buffer = call target("dx.RawBuffer", {<4 x float>, <4 x i32>}, 1, 0, 0) + @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_sl_v4f32v4i32s_0_0_0( + i32 0, i32 0, i32 1, i32 0, i1 false) + + ; CHECK: [[DATA0_0:%.*]] = extractelement <4 x float> %data0, i32 0 + ; CHECK: [[DATA0_1:%.*]] = extractelement <4 x float> %data0, i32 1 + ; CHECK: [[DATA0_2:%.*]] = extractelement <4 x float> %data0, i32 2 + ; CHECK: [[DATA0_3:%.*]] = extractelement <4 x float> %data0, i32 3 + ; CHECK: call 
void @dx.op.bufferStore.f32(i32 69, %dx.types.Handle %{{.*}}, i32 %index, i32 0, float [[DATA0_0]], float [[DATA0_1]], float [[DATA0_2]], float [[DATA0_3]], i8 15) + call void @llvm.dx.resource.store.rawbuffer.v4f32( + target("dx.RawBuffer", {<4 x float>, <4 x i32>}, 1, 0, 0) %buffer, + i32 %index, i32 0, <4 x float> %data0) + + ; CHECK: [[DATA1_0:%.*]] = extractelement <4 x i32> %data1, i32 0 + ; CHECK: [[DATA1_1:%.*]] = extractelement <4 x i32> %data1, i32 1 + ; CHECK: [[DATA1_2:%.*]] = extractelement <4 x i32> %data1, i32 2 + ; CHECK: [[DATA1_3:%.*]] = extractelement <4 x i32> %data1, i32 3 + ; CHECK: call void @dx.op.bufferStore.i32(i32 69, %dx.types.Handle %{{.*}}, i32 %index, i32 16, i32 [[DATA1_0]], i32 [[DATA1_1]], i32 [[DATA1_2]], i32 [[DATA1_3]], i8 15) + call void @llvm.dx.resource.store.rawbuffer.v4i32( + target("dx.RawBuffer", {<4 x float>, <4 x i32>}, 1, 0, 0) %buffer, + i32 %index, i32 16, <4 x i32> %data1) + + ret void +} + +; CHECK-LABEL: define void @storenested +define void @storenested(i32 %index, i32 %data0, <4 x float> %data1, <3 x half> %data2) { + %buffer = call + target("dx.RawBuffer", {i32, {<4 x float>, <3 x half>}}, 1, 0, 0) + @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, i1 false) + + ; CHECK: call void @dx.op.bufferStore.i32(i32 69, %dx.types.Handle %{{.*}}, i32 %index, i32 0, i32 %data0, i32 undef, i32 undef, i32 undef, i8 1) + call void @llvm.dx.resource.store.rawbuffer.i32( + target("dx.RawBuffer", {i32, {<4 x float>, <3 x half>}}, 1, 0, 0) %buffer, + i32 %index, i32 0, i32 %data0) + + ; CHECK: [[DATA1_0:%.*]] = extractelement <4 x float> %data1, i32 0 + ; CHECK: [[DATA1_1:%.*]] = extractelement <4 x float> %data1, i32 1 + ; CHECK: [[DATA1_2:%.*]] = extractelement <4 x float> %data1, i32 2 + ; CHECK: [[DATA1_3:%.*]] = extractelement <4 x float> %data1, i32 3 + ; CHECK: call void @dx.op.bufferStore.f32(i32 69, %dx.types.Handle %{{.*}}, i32 %index, i32 4, float [[DATA1_0]], float [[DATA1_1]], float [[DATA1_2]], float [[DATA1_3]], i8 15) + call void @llvm.dx.resource.store.rawbuffer.v4f32( + target("dx.RawBuffer", {i32, {<4 x float>, <3 x half>}}, 1, 0, 0) %buffer, + i32 %index, i32 4, <4 x float> %data1) + + ; CHECK: [[DATA2_0:%.*]] = extractelement <3 x half> %data2, i32 0 + ; CHECK: [[DATA2_1:%.*]] = extractelement <3 x half> %data2, i32 1 + ; CHECK: [[DATA2_2:%.*]] = extractelement <3 x half> %data2, i32 2 + ; CHECK: call void @dx.op.bufferStore.f16(i32 69, %dx.types.Handle %{{.*}}, i32 %index, i32 20, half [[DATA2_0]], half [[DATA2_1]], half [[DATA2_2]], half undef, i8 7) + call void @llvm.dx.resource.store.rawbuffer.v3f16( + target("dx.RawBuffer", {i32, {<4 x float>, <3 x half>}}, 1, 0, 0) %buffer, + i32 %index, i32 20, <3 x half> %data2) + + ret void +} diff --git a/llvm/test/CodeGen/DirectX/RawBufferStore-error64.ll b/llvm/test/CodeGen/DirectX/RawBufferStore-error64.ll new file mode 100644 index 0000000000000..a883a0bbc29fd --- /dev/null +++ b/llvm/test/CodeGen/DirectX/RawBufferStore-error64.ll @@ -0,0 +1,20 @@ +; We use llc for this test so that we don't abort after the first error. 
+; RUN: not llc %s -o /dev/null 2>&1 | FileCheck %s + +target triple = "dxil-pc-shadermodel6.2-compute" + +; Can't store 64 bit types directly until SM6.3 (byteaddressbuf.Store) +; CHECK: error: +; CHECK-SAME: in function storev4f64_byte +; CHECK-SAME: Cannot create RawBufferStore operation: Invalid overload type +define void @storev4f64_byte(i32 %offset, <4 x double> %data) "hlsl.export" { + %buffer = call target("dx.RawBuffer", i8, 1, 0, 0) + @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_i8_0_0_0( + i32 0, i32 0, i32 1, i32 0, i1 false) + + call void @llvm.dx.resource.store.rawbuffer.v4i64( + target("dx.RawBuffer", i8, 1, 0, 0) %buffer, + i32 %offset, i32 0, <4 x double> %data) + + ret void +} diff --git a/llvm/test/CodeGen/DirectX/RawBufferStore.ll b/llvm/test/CodeGen/DirectX/RawBufferStore.ll new file mode 100644 index 0000000000000..96824d5ee5a4a --- /dev/null +++ b/llvm/test/CodeGen/DirectX/RawBufferStore.ll @@ -0,0 +1,144 @@ +; RUN: opt -S -dxil-op-lower %s | FileCheck %s + +target triple = "dxil-pc-shadermodel6.6-compute" + +; CHECK-LABEL: define void @storef32_struct +define void @storef32_struct(i32 %index, float %data) { + %buffer = call target("dx.RawBuffer", float, 1, 0, 0) + @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_f32_0_0_0( + i32 0, i32 0, i32 1, i32 0, i1 false) + + ; CHECK: call void @dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %buffer_annot, i32 %index, i32 0, float %data, float undef, float undef, float undef, i8 1, i32 4) + call void @llvm.dx.resource.store.rawbuffer.f32( + target("dx.RawBuffer", float, 1, 0, 0) %buffer, + i32 %index, i32 0, float %data) + + ret void +} + +; CHECK-LABEL: define void @storef32_byte +define void @storef32_byte(i32 %offset, float %data) { + %buffer = call target("dx.RawBuffer", i8, 1, 0, 0) + @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_i8_0_0_0( + i32 0, i32 0, i32 1, i32 0, i1 false) + + ; CHECK: call void @dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %buffer_annot, i32 %offset, i32 0, float %data, float undef, float undef, float undef, i8 1, i32 4) + call void @llvm.dx.resource.store.rawbuffer.f32( + target("dx.RawBuffer", i8, 1, 0, 0) %buffer, + i32 %offset, i32 0, float %data) + + ret void +} + +; CHECK-LABEL: define void @storev4f32_struct +define void @storev4f32_struct(i32 %index, <4 x float> %data) { + %buffer = call target("dx.RawBuffer", <4 x float>, 1, 0, 0) + @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_v4f32_0_0_0( + i32 0, i32 0, i32 1, i32 0, i1 false) + + ; CHECK: [[DATA0:%.*]] = extractelement <4 x float> %data, i32 0 + ; CHECK: [[DATA1:%.*]] = extractelement <4 x float> %data, i32 1 + ; CHECK: [[DATA2:%.*]] = extractelement <4 x float> %data, i32 2 + ; CHECK: [[DATA3:%.*]] = extractelement <4 x float> %data, i32 3 + ; CHECK: call void @dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %buffer_annot, i32 %index, i32 0, float [[DATA0]], float [[DATA1]], float [[DATA2]], float [[DATA3]], i8 15, i32 4) + call void @llvm.dx.resource.store.rawbuffer.v4f32( + target("dx.RawBuffer", <4 x float>, 1, 0, 0) %buffer, + i32 %index, i32 0, <4 x float> %data) + + ret void +} + +; CHECK-LABEL: define void @storev4f32_byte +define void @storev4f32_byte(i32 %offset, <4 x float> %data) { + %buffer = call target("dx.RawBuffer", i8, 1, 0, 0) + @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_i8_0_0_0( + i32 0, i32 0, i32 1, i32 0, i1 false) + + ; CHECK: [[DATA0:%.*]] = extractelement <4 x float> %data, i32 0 + ; CHECK: [[DATA1:%.*]] = extractelement <4 x float> %data, i32 1 + ; CHECK: [[DATA2:%.*]] = 
extractelement <4 x float> %data, i32 2 + ; CHECK: [[DATA3:%.*]] = extractelement <4 x float> %data, i32 3 + ; CHECK: call void @dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %buffer_annot, i32 %offset, i32 0, float [[DATA0]], float [[DATA1]], float [[DATA2]], float [[DATA3]], i8 15, i32 4) + call void @llvm.dx.resource.store.rawbuffer.v4f32( + target("dx.RawBuffer", i8, 1, 0, 0) %buffer, + i32 %offset, i32 0, <4 x float> %data) + + ret void +} + +; CHECK-LABEL: define void @storeelements +define void @storeelements(i32 %index, <4 x float> %data0, <4 x i32> %data1) { + %buffer = call target("dx.RawBuffer", {<4 x float>, <4 x i32>}, 1, 0, 0) + @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_sl_v4f32v4i32s_0_0_0( + i32 0, i32 0, i32 1, i32 0, i1 false) + + ; CHECK: [[DATA0_0:%.*]] = extractelement <4 x float> %data0, i32 0 + ; CHECK: [[DATA0_1:%.*]] = extractelement <4 x float> %data0, i32 1 + ; CHECK: [[DATA0_2:%.*]] = extractelement <4 x float> %data0, i32 2 + ; CHECK: [[DATA0_3:%.*]] = extractelement <4 x float> %data0, i32 3 + ; CHECK: call void @dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %buffer_annot, i32 %index, i32 0, float [[DATA0_0]], float [[DATA0_1]], float [[DATA0_2]], float [[DATA0_3]], i8 15, i32 4) + call void @llvm.dx.resource.store.rawbuffer.v4f32( + target("dx.RawBuffer", {<4 x float>, <4 x i32>}, 1, 0, 0) %buffer, + i32 %index, i32 0, <4 x float> %data0) + + ; CHECK: [[DATA1_0:%.*]] = extractelement <4 x i32> %data1, i32 0 + ; CHECK: [[DATA1_1:%.*]] = extractelement <4 x i32> %data1, i32 1 + ; CHECK: [[DATA1_2:%.*]] = extractelement <4 x i32> %data1, i32 2 + ; CHECK: [[DATA1_3:%.*]] = extractelement <4 x i32> %data1, i32 3 + ; CHECK: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %buffer_annot, i32 %index, i32 16, i32 [[DATA1_0]], i32 [[DATA1_1]], i32 [[DATA1_2]], i32 [[DATA1_3]], i8 15, i32 4) + call void @llvm.dx.resource.store.rawbuffer.v4i32( + target("dx.RawBuffer", {<4 x float>, <4 x i32>}, 1, 0, 0) %buffer, + i32 %index, i32 16, <4 x i32> %data1) + + ret void +} + +; CHECK-LABEL: define void @storenested +define void @storenested(i32 %index, i32 %data0, <4 x float> %data1, <3 x half> %data2) { + %buffer = call + target("dx.RawBuffer", {i32, {<4 x float>, <3 x half>}}, 1, 0, 0) + @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, i1 false) + + ; CHECK: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %buffer_annot, i32 %index, i32 0, i32 %data0, i32 undef, i32 undef, i32 undef, i8 1, i32 4) + call void @llvm.dx.resource.store.rawbuffer.i32( + target("dx.RawBuffer", {i32, {<4 x float>, <3 x half>}}, 1, 0, 0) %buffer, + i32 %index, i32 0, i32 %data0) + + ; CHECK: [[DATA1_0:%.*]] = extractelement <4 x float> %data1, i32 0 + ; CHECK: [[DATA1_1:%.*]] = extractelement <4 x float> %data1, i32 1 + ; CHECK: [[DATA1_2:%.*]] = extractelement <4 x float> %data1, i32 2 + ; CHECK: [[DATA1_3:%.*]] = extractelement <4 x float> %data1, i32 3 + ; CHECK: call void @dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %buffer_annot, i32 %index, i32 4, float [[DATA1_0]], float [[DATA1_1]], float [[DATA1_2]], float [[DATA1_3]], i8 15, i32 4) + call void @llvm.dx.resource.store.rawbuffer.v4f32( + target("dx.RawBuffer", {i32, {<4 x float>, <3 x half>}}, 1, 0, 0) %buffer, + i32 %index, i32 4, <4 x float> %data1) + + ; CHECK: [[DATA2_0:%.*]] = extractelement <3 x half> %data2, i32 0 + ; CHECK: [[DATA2_1:%.*]] = extractelement <3 x half> %data2, i32 1 + ; CHECK: [[DATA2_2:%.*]] = extractelement <3 x half> %data2, i32 2 + ; CHECK: call void 
@dx.op.rawBufferStore.f16(i32 140, %dx.types.Handle %buffer_annot, i32 %index, i32 20, half [[DATA2_0]], half [[DATA2_1]], half [[DATA2_2]], half undef, i8 7, i32 2) + call void @llvm.dx.resource.store.rawbuffer.v3f16( + target("dx.RawBuffer", {i32, {<4 x float>, <3 x half>}}, 1, 0, 0) %buffer, + i32 %index, i32 20, <3 x half> %data2) + + ret void +} + +; byteaddressbuf.Store +; CHECK-LABEL: define void @storev4f64_byte +define void @storev4f64_byte(i32 %offset, <4 x double> %data) { + %buffer = call target("dx.RawBuffer", i8, 1, 0, 0) + @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_i8_0_0_0( + i32 0, i32 0, i32 1, i32 0, i1 false) + + ; CHECK: [[DATA0:%.*]] = extractelement <4 x double> %data, i32 0 + ; CHECK: [[DATA1:%.*]] = extractelement <4 x double> %data, i32 1 + ; CHECK: [[DATA2:%.*]] = extractelement <4 x double> %data, i32 2 + ; CHECK: [[DATA3:%.*]] = extractelement <4 x double> %data, i32 3 + ; CHECK: call void @dx.op.rawBufferStore.f64(i32 140, %dx.types.Handle %buffer_annot, i32 %offset, i32 0, double [[DATA0]], double [[DATA1]], double [[DATA2]], double [[DATA3]], i8 15, i32 8) + call void @llvm.dx.resource.store.rawbuffer.v4i64( + target("dx.RawBuffer", i8, 1, 0, 0) %buffer, + i32 %offset, i32 0, <4 x double> %data) + + ret void +} From a60a59053f05b3912f22df3e0607195c364c4317 Mon Sep 17 00:00:00 2001 From: Bill Hoffman Date: Sun, 12 Jan 2025 21:20:20 -0500 Subject: [PATCH 003/102] Fix print module manifest file for macos (#122370) This commit fixes -print-library-module-manifest-path on macos. Currently, this only works on linux systems. This is because on macos systems the library and header files are installed in a different location. The module manifest is next to the libraries and the search function was not looking in both places. There is also a test included. 
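To make the search order concrete, here is a small standalone sketch of the
lookup after this change. This is illustrative only: the helper name
findModuleManifest and the use of std::filesystem in place of llvm::sys are
stand-ins, not the actual Driver code, which is in the diff below.

  // Sketch of the manifest lookup order (hypothetical helper).
  #include <filesystem>
  #include <optional>
  #include <vector>

  namespace fs = std::filesystem;

  std::optional<fs::path>
  findModuleManifest(const std::vector<fs::path> &ToolChainFilePaths,
                     const fs::path &ResourceDir) {
    const char *Name = "libc++.modules.json";

    // 1) Existing behavior: search the toolchain's library paths, which
    //    covers the Linux layout, e.g. /usr/lib/x86_64-linux-gnu.
    for (const fs::path &Dir : ToolChainFilePaths)
      if (fs::exists(Dir / Name))
        return Dir / Name;

    // 2) New fallback: in the macOS layout the manifest sits next to the
    //    libraries two levels above the resource directory, e.g.
    //    /usr/lib/clang/20 + "../.." -> /usr/lib/libc++.modules.json.
    fs::path Candidate = ResourceDir / ".." / ".." / Name;
    if (fs::exists(Candidate))
      return Candidate.lexically_normal();

    return std::nullopt;
  }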
--- clang/lib/Driver/Driver.cpp | 5 ++++ ...les-print-library-module-manifest-path.cpp | 26 +++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index 10df730744b08..9a947f32283c3 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -6398,6 +6398,11 @@ std::string Driver::GetFilePath(StringRef Name, const ToolChain &TC) const { if (auto P = SearchPaths(TC.getFilePaths())) return *P; + SmallString<128> R2(ResourceDir); + llvm::sys::path::append(R2, "..", "..", Name); + if (llvm::sys::fs::exists(Twine(R2))) + return std::string(R2); + return std::string(Name); } diff --git a/clang/test/Driver/modules-print-library-module-manifest-path.cpp b/clang/test/Driver/modules-print-library-module-manifest-path.cpp index 3ba2709ad95cc..8d17fe1549e34 100644 --- a/clang/test/Driver/modules-print-library-module-manifest-path.cpp +++ b/clang/test/Driver/modules-print-library-module-manifest-path.cpp @@ -18,6 +18,28 @@ // RUN: --target=x86_64-linux-gnu 2>&1 \ // RUN: | FileCheck libcxx.cpp +// for macos there is a different directory structure +// where the library and libc++.modules.json file are in lib +// directly but headers are in clang/ver directory which +// is the resource directory +// RUN: mkdir -p %t/Inputs/usr/lib/clang/20 +// RUN: touch %t/Inputs/usr/lib/libc++.so +// RUN: touch %t/Inputs/usr/lib/libc++.modules.json +// RUN: %clang -print-library-module-manifest-path \ +// RUN: -stdlib=libc++ \ +// RUN: -resource-dir=%t/Inputs/usr/lib/clang/20 \ +// RUN: --target=arm64-apple-darwin24.1.0 2>&1 \ +// RUN: | FileCheck libcxx.cpp.macos + +// RUN: rm %t/Inputs/usr/lib/libc++.so +// RUN: touch %t/Inputs/usr/lib/libc++.a +// RUN: touch %t/Inputs/usr/lib/libc++.modules.json +// RUN: %clang -print-library-module-manifest-path \ +// RUN: -stdlib=libc++ \ +// RUN: -resource-dir=%t/Inputs/usr/lib/clang/20 \ +// RUN: --target=arm64-apple-darwin24.1.0 2>&1 \ +// RUN: | FileCheck libcxx.cpp.macos + // RUN: rm %t/Inputs/usr/lib/x86_64-linux-gnu/libc++.so // RUN: touch %t/Inputs/usr/lib/x86_64-linux-gnu/libc++.a // RUN: %clang -print-library-module-manifest-path \ @@ -40,6 +62,10 @@ // CHECK: {{.*}}/Inputs/usr/lib/x86_64-linux-gnu{{/|\\}}libc++.modules.json +//--- libcxx.cpp.macos + +// CHECK: {{.*}}libc++.modules.json + //--- libcxx-no-shared-lib.cpp // Note this might find a different path depending whether search path From 5b11c97a333c7e1ffffc2088f9a32e7d5c9743d0 Mon Sep 17 00:00:00 2001 From: Pengcheng Wang Date: Mon, 13 Jan 2025 11:28:24 +0800 Subject: [PATCH 004/102] [RISCV] Rework memcpy test (#120364) Use descriptive names and add more cases. 
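For reference, the reworked tests all follow one pattern: an aligned_ or
unaligned_ prefix naming the pointer alignment plus the copy size in bytes,
each wrapping a single fixed-size llvm.memcpy. A representative case,
mirroring the functions in the diff below:

  declare void @llvm.memcpy.p0.p0.i64(ptr, ptr, i64, i1)

  ; An 8-byte copy with both pointers 8-byte aligned; small fixed-size
  ; copies like this are expanded inline rather than calling memcpy.
  define void @aligned_memcpy8(ptr nocapture %dest, ptr %src) nounwind {
  entry:
    tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 8, i1 false)
    ret void
  }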
--- llvm/test/CodeGen/RISCV/memcpy.ll | 1180 +++++++++++++++++++++-------- 1 file changed, 878 insertions(+), 302 deletions(-) diff --git a/llvm/test/CodeGen/RISCV/memcpy.ll b/llvm/test/CodeGen/RISCV/memcpy.ll index 1ab3722080f70..ce47476de9ce8 100644 --- a/llvm/test/CodeGen/RISCV/memcpy.ll +++ b/llvm/test/CodeGen/RISCV/memcpy.ll @@ -7,406 +7,935 @@ ; RUN: | FileCheck %s --check-prefixes=RV32-BOTH,RV32-FAST ; RUN: llc < %s -mtriple=riscv64 -mattr=+unaligned-scalar-mem \ ; RUN: | FileCheck %s --check-prefixes=RV64-BOTH,RV64-FAST -%struct.x = type { i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 } -@src = external dso_local global %struct.x -@dst = external dso_local global %struct.x +; ---------------------------------------------------------------------- +; Fully unaligned cases -@.str1 = private unnamed_addr constant [31 x i8] c"DHRYSTONE PROGRAM, SOME STRING\00", align 1 -@.str2 = private unnamed_addr constant [36 x i8] c"DHRYSTONE PROGRAM, SOME STRING BLAH\00", align 1 -@.str3 = private unnamed_addr constant [24 x i8] c"DHRYSTONE PROGRAM, SOME\00", align 1 -@.str4 = private unnamed_addr constant [18 x i8] c"DHRYSTONE PROGR \00", align 1 -@.str5 = private unnamed_addr constant [7 x i8] c"DHRYST\00", align 1 -@.str6 = private unnamed_addr constant [14 x i8] c"/tmp/rmXXXXXX\00", align 1 -@spool.splbuf = internal global [512 x i8] zeroinitializer, align 16 +define void @unaligned_memcpy0(ptr nocapture %dest, ptr %src) nounwind { +; RV32-BOTH-LABEL: unaligned_memcpy0: +; RV32-BOTH: # %bb.0: # %entry +; RV32-BOTH-NEXT: ret +; +; RV64-BOTH-LABEL: unaligned_memcpy0: +; RV64-BOTH: # %bb.0: # %entry +; RV64-BOTH-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 0, i1 false) + ret void +} -define i32 @t0() { -; RV32-LABEL: t0: +define void @unaligned_memcpy1(ptr nocapture %dest, ptr %src) nounwind { +; RV32-BOTH-LABEL: unaligned_memcpy1: +; RV32-BOTH: # %bb.0: # %entry +; RV32-BOTH-NEXT: lbu a1, 0(a1) +; RV32-BOTH-NEXT: sb a1, 0(a0) +; RV32-BOTH-NEXT: ret +; +; RV64-BOTH-LABEL: unaligned_memcpy1: +; RV64-BOTH: # %bb.0: # %entry +; RV64-BOTH-NEXT: lbu a1, 0(a1) +; RV64-BOTH-NEXT: sb a1, 0(a0) +; RV64-BOTH-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 1, i1 false) + ret void +} + +define void @unaligned_memcpy2(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: unaligned_memcpy2: ; RV32: # %bb.0: # %entry -; RV32-NEXT: lui a0, %hi(src) -; RV32-NEXT: lw a1, %lo(src)(a0) -; RV32-NEXT: lui a2, %hi(dst) -; RV32-NEXT: addi a0, a0, %lo(src) -; RV32-NEXT: sw a1, %lo(dst)(a2) -; RV32-NEXT: lw a1, 4(a0) -; RV32-NEXT: lh a3, 8(a0) -; RV32-NEXT: lbu a0, 10(a0) -; RV32-NEXT: addi a2, a2, %lo(dst) -; RV32-NEXT: sw a1, 4(a2) -; RV32-NEXT: sh a3, 8(a2) -; RV32-NEXT: sb a0, 10(a2) -; RV32-NEXT: li a0, 0 +; RV32-NEXT: lbu a2, 1(a1) +; RV32-NEXT: sb a2, 1(a0) +; RV32-NEXT: lbu a1, 0(a1) +; RV32-NEXT: sb a1, 0(a0) ; RV32-NEXT: ret ; -; RV64-LABEL: t0: +; RV64-LABEL: unaligned_memcpy2: ; RV64: # %bb.0: # %entry -; RV64-NEXT: lui a0, %hi(src) -; RV64-NEXT: lui a1, %hi(dst) -; RV64-NEXT: ld a2, %lo(src)(a0) -; RV64-NEXT: addi a0, a0, %lo(src) -; RV64-NEXT: lh a3, 8(a0) -; RV64-NEXT: lbu a0, 10(a0) -; RV64-NEXT: sd a2, %lo(dst)(a1) -; RV64-NEXT: addi a1, a1, %lo(dst) -; RV64-NEXT: sh a3, 8(a1) -; RV64-NEXT: sb a0, 10(a1) -; RV64-NEXT: li a0, 0 +; RV64-NEXT: lbu a2, 1(a1) +; RV64-NEXT: sb a2, 1(a0) +; RV64-NEXT: lbu a1, 0(a1) +; RV64-NEXT: sb a1, 0(a0) ; RV64-NEXT: ret ; -; RV32-FAST-LABEL: t0: +; RV32-FAST-LABEL: unaligned_memcpy2: ; RV32-FAST: # 
%bb.0: # %entry -; RV32-FAST-NEXT: lui a0, %hi(src) -; RV32-FAST-NEXT: lw a1, %lo(src)(a0) -; RV32-FAST-NEXT: addi a0, a0, %lo(src) -; RV32-FAST-NEXT: lw a2, 4(a0) -; RV32-FAST-NEXT: lw a0, 7(a0) -; RV32-FAST-NEXT: lui a3, %hi(dst) -; RV32-FAST-NEXT: sw a1, %lo(dst)(a3) -; RV32-FAST-NEXT: addi a1, a3, %lo(dst) -; RV32-FAST-NEXT: sw a0, 7(a1) -; RV32-FAST-NEXT: sw a2, 4(a1) -; RV32-FAST-NEXT: li a0, 0 +; RV32-FAST-NEXT: lh a1, 0(a1) +; RV32-FAST-NEXT: sh a1, 0(a0) ; RV32-FAST-NEXT: ret ; -; RV64-FAST-LABEL: t0: +; RV64-FAST-LABEL: unaligned_memcpy2: ; RV64-FAST: # %bb.0: # %entry -; RV64-FAST-NEXT: lui a0, %hi(src) -; RV64-FAST-NEXT: ld a1, %lo(src)(a0) -; RV64-FAST-NEXT: addi a0, a0, %lo(src) -; RV64-FAST-NEXT: lw a0, 7(a0) -; RV64-FAST-NEXT: lui a2, %hi(dst) -; RV64-FAST-NEXT: sd a1, %lo(dst)(a2) -; RV64-FAST-NEXT: addi a1, a2, %lo(dst) -; RV64-FAST-NEXT: sw a0, 7(a1) -; RV64-FAST-NEXT: li a0, 0 +; RV64-FAST-NEXT: lh a1, 0(a1) +; RV64-FAST-NEXT: sh a1, 0(a0) ; RV64-FAST-NEXT: ret entry: - call void @llvm.memcpy.p0.p0.i32(ptr align 8 @dst, ptr align 8 @src, i32 11, i1 false) - ret i32 0 + tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 2, i1 false) + ret void } -define void @t1(ptr nocapture %C) nounwind { -; RV32-LABEL: t1: +define void @unaligned_memcpy3(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: unaligned_memcpy3: ; RV32: # %bb.0: # %entry -; RV32-NEXT: lui a1, %hi(.L.str1) -; RV32-NEXT: addi a1, a1, %lo(.L.str1) -; RV32-NEXT: li a2, 31 -; RV32-NEXT: tail memcpy +; RV32-NEXT: lbu a2, 2(a1) +; RV32-NEXT: sb a2, 2(a0) +; RV32-NEXT: lbu a2, 1(a1) +; RV32-NEXT: sb a2, 1(a0) +; RV32-NEXT: lbu a1, 0(a1) +; RV32-NEXT: sb a1, 0(a0) +; RV32-NEXT: ret ; -; RV64-LABEL: t1: +; RV64-LABEL: unaligned_memcpy3: ; RV64: # %bb.0: # %entry -; RV64-NEXT: lui a1, %hi(.L.str1) -; RV64-NEXT: addi a1, a1, %lo(.L.str1) -; RV64-NEXT: li a2, 31 -; RV64-NEXT: tail memcpy +; RV64-NEXT: lbu a2, 2(a1) +; RV64-NEXT: sb a2, 2(a0) +; RV64-NEXT: lbu a2, 1(a1) +; RV64-NEXT: sb a2, 1(a0) +; RV64-NEXT: lbu a1, 0(a1) +; RV64-NEXT: sb a1, 0(a0) +; RV64-NEXT: ret ; -; RV32-FAST-LABEL: t1: +; RV32-FAST-LABEL: unaligned_memcpy3: ; RV32-FAST: # %bb.0: # %entry -; RV32-FAST-NEXT: lui a1, 1141 -; RV32-FAST-NEXT: lui a2, 300325 -; RV32-FAST-NEXT: lui a3, 132181 -; RV32-FAST-NEXT: lui a4, 340483 -; RV32-FAST-NEXT: lui a5, 267556 -; RV32-FAST-NEXT: lui a6, 337154 -; RV32-FAST-NEXT: addi a1, a1, -439 -; RV32-FAST-NEXT: sw a1, 27(a0) -; RV32-FAST-NEXT: lui a1, 320757 -; RV32-FAST-NEXT: addi a2, a2, 1107 -; RV32-FAST-NEXT: addi a3, a3, -689 -; RV32-FAST-NEXT: addi a4, a4, -947 -; RV32-FAST-NEXT: sw a4, 16(a0) -; RV32-FAST-NEXT: sw a3, 20(a0) -; RV32-FAST-NEXT: sw a2, 24(a0) -; RV32-FAST-NEXT: lui a2, 365861 -; RV32-FAST-NEXT: addi a3, a5, 1871 -; RV32-FAST-NEXT: addi a4, a6, 69 -; RV32-FAST-NEXT: addi a1, a1, 1107 -; RV32-FAST-NEXT: addi a2, a2, -1980 -; RV32-FAST-NEXT: sw a2, 0(a0) -; RV32-FAST-NEXT: sw a1, 4(a0) -; RV32-FAST-NEXT: sw a4, 8(a0) -; RV32-FAST-NEXT: sw a3, 12(a0) +; RV32-FAST-NEXT: lbu a2, 2(a1) +; RV32-FAST-NEXT: sb a2, 2(a0) +; RV32-FAST-NEXT: lh a1, 0(a1) +; RV32-FAST-NEXT: sh a1, 0(a0) ; RV32-FAST-NEXT: ret ; -; RV64-FAST-LABEL: t1: +; RV64-FAST-LABEL: unaligned_memcpy3: ; RV64-FAST: # %bb.0: # %entry -; RV64-FAST-NEXT: lui a1, %hi(.L.str1) -; RV64-FAST-NEXT: addi a2, a1, %lo(.L.str1) -; RV64-FAST-NEXT: ld a3, 23(a2) -; RV64-FAST-NEXT: ld a1, %lo(.L.str1)(a1) -; RV64-FAST-NEXT: ld a4, 8(a2) -; RV64-FAST-NEXT: ld a2, 16(a2) -; RV64-FAST-NEXT: sd a3, 23(a0) -; RV64-FAST-NEXT: sd a1, 0(a0) -; 
RV64-FAST-NEXT: sd a4, 8(a0) -; RV64-FAST-NEXT: sd a2, 16(a0) +; RV64-FAST-NEXT: lbu a2, 2(a1) +; RV64-FAST-NEXT: sb a2, 2(a0) +; RV64-FAST-NEXT: lh a1, 0(a1) +; RV64-FAST-NEXT: sh a1, 0(a0) ; RV64-FAST-NEXT: ret entry: - tail call void @llvm.memcpy.p0.p0.i64(ptr %C, ptr @.str1, i64 31, i1 false) + tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 3, i1 false) ret void } -define void @t2(ptr nocapture %C) nounwind { -; RV32-BOTH-LABEL: t2: -; RV32-BOTH: # %bb.0: # %entry -; RV32-BOTH-NEXT: lui a1, %hi(.L.str2) -; RV32-BOTH-NEXT: addi a1, a1, %lo(.L.str2) -; RV32-BOTH-NEXT: li a2, 36 -; RV32-BOTH-NEXT: tail memcpy +define void @unaligned_memcpy4(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: unaligned_memcpy4: +; RV32: # %bb.0: # %entry +; RV32-NEXT: lbu a2, 3(a1) +; RV32-NEXT: sb a2, 3(a0) +; RV32-NEXT: lbu a2, 2(a1) +; RV32-NEXT: sb a2, 2(a0) +; RV32-NEXT: lbu a2, 1(a1) +; RV32-NEXT: sb a2, 1(a0) +; RV32-NEXT: lbu a1, 0(a1) +; RV32-NEXT: sb a1, 0(a0) +; RV32-NEXT: ret ; -; RV64-LABEL: t2: +; RV64-LABEL: unaligned_memcpy4: ; RV64: # %bb.0: # %entry -; RV64-NEXT: lui a1, %hi(.L.str2) -; RV64-NEXT: addi a1, a1, %lo(.L.str2) -; RV64-NEXT: li a2, 36 -; RV64-NEXT: tail memcpy +; RV64-NEXT: lbu a2, 3(a1) +; RV64-NEXT: sb a2, 3(a0) +; RV64-NEXT: lbu a2, 2(a1) +; RV64-NEXT: sb a2, 2(a0) +; RV64-NEXT: lbu a2, 1(a1) +; RV64-NEXT: sb a2, 1(a0) +; RV64-NEXT: lbu a1, 0(a1) +; RV64-NEXT: sb a1, 0(a0) +; RV64-NEXT: ret +; +; RV32-FAST-LABEL: unaligned_memcpy4: +; RV32-FAST: # %bb.0: # %entry +; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a1, 0(a0) +; RV32-FAST-NEXT: ret ; -; RV64-FAST-LABEL: t2: +; RV64-FAST-LABEL: unaligned_memcpy4: ; RV64-FAST: # %bb.0: # %entry -; RV64-FAST-NEXT: lui a1, %hi(.L.str2) -; RV64-FAST-NEXT: lui a2, 1156 -; RV64-FAST-NEXT: ld a3, %lo(.L.str2)(a1) -; RV64-FAST-NEXT: addi a2, a2, 332 -; RV64-FAST-NEXT: addi a1, a1, %lo(.L.str2) -; RV64-FAST-NEXT: sw a2, 32(a0) -; RV64-FAST-NEXT: ld a2, 8(a1) -; RV64-FAST-NEXT: ld a4, 16(a1) -; RV64-FAST-NEXT: ld a1, 24(a1) -; RV64-FAST-NEXT: sd a3, 0(a0) -; RV64-FAST-NEXT: sd a2, 8(a0) -; RV64-FAST-NEXT: sd a4, 16(a0) -; RV64-FAST-NEXT: sd a1, 24(a0) +; RV64-FAST-NEXT: lw a1, 0(a1) +; RV64-FAST-NEXT: sw a1, 0(a0) ; RV64-FAST-NEXT: ret entry: - tail call void @llvm.memcpy.p0.p0.i64(ptr %C, ptr @.str2, i64 36, i1 false) + tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 4, i1 false) ret void } -define void @t3(ptr nocapture %C) nounwind { -; RV32-LABEL: t3: +define void @unaligned_memcpy7(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: unaligned_memcpy7: ; RV32: # %bb.0: # %entry -; RV32-NEXT: lui a1, %hi(.L.str3) -; RV32-NEXT: addi a1, a1, %lo(.L.str3) -; RV32-NEXT: li a2, 24 -; RV32-NEXT: tail memcpy +; RV32-NEXT: lbu a2, 6(a1) +; RV32-NEXT: sb a2, 6(a0) +; RV32-NEXT: lbu a2, 5(a1) +; RV32-NEXT: sb a2, 5(a0) +; RV32-NEXT: lbu a2, 4(a1) +; RV32-NEXT: sb a2, 4(a0) +; RV32-NEXT: lbu a2, 3(a1) +; RV32-NEXT: sb a2, 3(a0) +; RV32-NEXT: lbu a2, 2(a1) +; RV32-NEXT: sb a2, 2(a0) +; RV32-NEXT: lbu a2, 1(a1) +; RV32-NEXT: sb a2, 1(a0) +; RV32-NEXT: lbu a1, 0(a1) +; RV32-NEXT: sb a1, 0(a0) +; RV32-NEXT: ret ; -; RV64-LABEL: t3: +; RV64-LABEL: unaligned_memcpy7: ; RV64: # %bb.0: # %entry -; RV64-NEXT: lui a1, %hi(.L.str3) -; RV64-NEXT: addi a1, a1, %lo(.L.str3) -; RV64-NEXT: li a2, 24 -; RV64-NEXT: tail memcpy +; RV64-NEXT: lbu a2, 6(a1) +; RV64-NEXT: sb a2, 6(a0) +; RV64-NEXT: lbu a2, 5(a1) +; RV64-NEXT: sb a2, 5(a0) +; RV64-NEXT: lbu a2, 4(a1) +; RV64-NEXT: sb a2, 4(a0) +; RV64-NEXT: lbu a2, 3(a1) +; 
RV64-NEXT: sb a2, 3(a0) +; RV64-NEXT: lbu a2, 2(a1) +; RV64-NEXT: sb a2, 2(a0) +; RV64-NEXT: lbu a2, 1(a1) +; RV64-NEXT: sb a2, 1(a0) +; RV64-NEXT: lbu a1, 0(a1) +; RV64-NEXT: sb a1, 0(a0) +; RV64-NEXT: ret ; -; RV32-FAST-LABEL: t3: +; RV32-FAST-LABEL: unaligned_memcpy7: ; RV32-FAST: # %bb.0: # %entry -; RV32-FAST-NEXT: lui a1, 1109 -; RV32-FAST-NEXT: lui a2, 340483 -; RV32-FAST-NEXT: lui a3, 267556 -; RV32-FAST-NEXT: lui a4, 337154 -; RV32-FAST-NEXT: lui a5, 320757 -; RV32-FAST-NEXT: addi a1, a1, -689 -; RV32-FAST-NEXT: addi a2, a2, -947 -; RV32-FAST-NEXT: sw a2, 16(a0) -; RV32-FAST-NEXT: sw a1, 20(a0) -; RV32-FAST-NEXT: lui a1, 365861 -; RV32-FAST-NEXT: addi a2, a3, 1871 -; RV32-FAST-NEXT: addi a3, a4, 69 -; RV32-FAST-NEXT: addi a4, a5, 1107 -; RV32-FAST-NEXT: addi a1, a1, -1980 +; RV32-FAST-NEXT: lw a2, 3(a1) +; RV32-FAST-NEXT: sw a2, 3(a0) +; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a1, 0(a0) +; RV32-FAST-NEXT: ret +; +; RV64-FAST-LABEL: unaligned_memcpy7: +; RV64-FAST: # %bb.0: # %entry +; RV64-FAST-NEXT: lw a2, 3(a1) +; RV64-FAST-NEXT: sw a2, 3(a0) +; RV64-FAST-NEXT: lw a1, 0(a1) +; RV64-FAST-NEXT: sw a1, 0(a0) +; RV64-FAST-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 7, i1 false) + ret void +} + +define void @unaligned_memcpy8(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: unaligned_memcpy8: +; RV32: # %bb.0: # %entry +; RV32-NEXT: lbu a2, 7(a1) +; RV32-NEXT: sb a2, 7(a0) +; RV32-NEXT: lbu a2, 6(a1) +; RV32-NEXT: sb a2, 6(a0) +; RV32-NEXT: lbu a2, 5(a1) +; RV32-NEXT: sb a2, 5(a0) +; RV32-NEXT: lbu a2, 4(a1) +; RV32-NEXT: sb a2, 4(a0) +; RV32-NEXT: lbu a2, 3(a1) +; RV32-NEXT: sb a2, 3(a0) +; RV32-NEXT: lbu a2, 2(a1) +; RV32-NEXT: sb a2, 2(a0) +; RV32-NEXT: lbu a2, 1(a1) +; RV32-NEXT: sb a2, 1(a0) +; RV32-NEXT: lbu a1, 0(a1) +; RV32-NEXT: sb a1, 0(a0) +; RV32-NEXT: ret +; +; RV64-LABEL: unaligned_memcpy8: +; RV64: # %bb.0: # %entry +; RV64-NEXT: lbu a2, 7(a1) +; RV64-NEXT: sb a2, 7(a0) +; RV64-NEXT: lbu a2, 6(a1) +; RV64-NEXT: sb a2, 6(a0) +; RV64-NEXT: lbu a2, 5(a1) +; RV64-NEXT: sb a2, 5(a0) +; RV64-NEXT: lbu a2, 4(a1) +; RV64-NEXT: sb a2, 4(a0) +; RV64-NEXT: lbu a2, 3(a1) +; RV64-NEXT: sb a2, 3(a0) +; RV64-NEXT: lbu a2, 2(a1) +; RV64-NEXT: sb a2, 2(a0) +; RV64-NEXT: lbu a2, 1(a1) +; RV64-NEXT: sb a2, 1(a0) +; RV64-NEXT: lbu a1, 0(a1) +; RV64-NEXT: sb a1, 0(a0) +; RV64-NEXT: ret +; +; RV32-FAST-LABEL: unaligned_memcpy8: +; RV32-FAST: # %bb.0: # %entry +; RV32-FAST-NEXT: lw a2, 4(a1) +; RV32-FAST-NEXT: sw a2, 4(a0) +; RV32-FAST-NEXT: lw a1, 0(a1) ; RV32-FAST-NEXT: sw a1, 0(a0) -; RV32-FAST-NEXT: sw a4, 4(a0) -; RV32-FAST-NEXT: sw a3, 8(a0) -; RV32-FAST-NEXT: sw a2, 12(a0) ; RV32-FAST-NEXT: ret ; -; RV64-FAST-LABEL: t3: +; RV64-FAST-LABEL: unaligned_memcpy8: ; RV64-FAST: # %bb.0: # %entry -; RV64-FAST-NEXT: lui a1, %hi(.L.str3) -; RV64-FAST-NEXT: ld a2, %lo(.L.str3)(a1) -; RV64-FAST-NEXT: addi a1, a1, %lo(.L.str3) -; RV64-FAST-NEXT: ld a3, 8(a1) -; RV64-FAST-NEXT: ld a1, 16(a1) -; RV64-FAST-NEXT: sd a2, 0(a0) -; RV64-FAST-NEXT: sd a3, 8(a0) -; RV64-FAST-NEXT: sd a1, 16(a0) +; RV64-FAST-NEXT: ld a1, 0(a1) +; RV64-FAST-NEXT: sd a1, 0(a0) ; RV64-FAST-NEXT: ret entry: - tail call void @llvm.memcpy.p0.p0.i64(ptr %C, ptr @.str3, i64 24, i1 false) + tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 8, i1 false) ret void } -define void @t4(ptr nocapture %C) nounwind { -; RV32-LABEL: t4: +define void @unaligned_memcpy15(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: unaligned_memcpy15: ; RV32: # %bb.0: # %entry -; 
RV32-NEXT: lui a1, %hi(.L.str4) -; RV32-NEXT: addi a1, a1, %lo(.L.str4) -; RV32-NEXT: li a2, 18 -; RV32-NEXT: tail memcpy +; RV32-NEXT: lbu a2, 14(a1) +; RV32-NEXT: sb a2, 14(a0) +; RV32-NEXT: lbu a2, 13(a1) +; RV32-NEXT: sb a2, 13(a0) +; RV32-NEXT: lbu a2, 12(a1) +; RV32-NEXT: sb a2, 12(a0) +; RV32-NEXT: lbu a2, 11(a1) +; RV32-NEXT: sb a2, 11(a0) +; RV32-NEXT: lbu a2, 10(a1) +; RV32-NEXT: sb a2, 10(a0) +; RV32-NEXT: lbu a2, 9(a1) +; RV32-NEXT: sb a2, 9(a0) +; RV32-NEXT: lbu a2, 8(a1) +; RV32-NEXT: sb a2, 8(a0) +; RV32-NEXT: lbu a2, 7(a1) +; RV32-NEXT: sb a2, 7(a0) +; RV32-NEXT: lbu a2, 6(a1) +; RV32-NEXT: sb a2, 6(a0) +; RV32-NEXT: lbu a2, 5(a1) +; RV32-NEXT: sb a2, 5(a0) +; RV32-NEXT: lbu a2, 4(a1) +; RV32-NEXT: sb a2, 4(a0) +; RV32-NEXT: lbu a2, 3(a1) +; RV32-NEXT: sb a2, 3(a0) +; RV32-NEXT: lbu a2, 2(a1) +; RV32-NEXT: sb a2, 2(a0) +; RV32-NEXT: lbu a2, 1(a1) +; RV32-NEXT: sb a2, 1(a0) +; RV32-NEXT: lbu a1, 0(a1) +; RV32-NEXT: sb a1, 0(a0) +; RV32-NEXT: ret ; -; RV64-LABEL: t4: +; RV64-LABEL: unaligned_memcpy15: ; RV64: # %bb.0: # %entry -; RV64-NEXT: lui a1, %hi(.L.str4) -; RV64-NEXT: addi a1, a1, %lo(.L.str4) -; RV64-NEXT: li a2, 18 -; RV64-NEXT: tail memcpy +; RV64-NEXT: lbu a2, 14(a1) +; RV64-NEXT: sb a2, 14(a0) +; RV64-NEXT: lbu a2, 13(a1) +; RV64-NEXT: sb a2, 13(a0) +; RV64-NEXT: lbu a2, 12(a1) +; RV64-NEXT: sb a2, 12(a0) +; RV64-NEXT: lbu a2, 11(a1) +; RV64-NEXT: sb a2, 11(a0) +; RV64-NEXT: lbu a2, 10(a1) +; RV64-NEXT: sb a2, 10(a0) +; RV64-NEXT: lbu a2, 9(a1) +; RV64-NEXT: sb a2, 9(a0) +; RV64-NEXT: lbu a2, 8(a1) +; RV64-NEXT: sb a2, 8(a0) +; RV64-NEXT: lbu a2, 7(a1) +; RV64-NEXT: sb a2, 7(a0) +; RV64-NEXT: lbu a2, 6(a1) +; RV64-NEXT: sb a2, 6(a0) +; RV64-NEXT: lbu a2, 5(a1) +; RV64-NEXT: sb a2, 5(a0) +; RV64-NEXT: lbu a2, 4(a1) +; RV64-NEXT: sb a2, 4(a0) +; RV64-NEXT: lbu a2, 3(a1) +; RV64-NEXT: sb a2, 3(a0) +; RV64-NEXT: lbu a2, 2(a1) +; RV64-NEXT: sb a2, 2(a0) +; RV64-NEXT: lbu a2, 1(a1) +; RV64-NEXT: sb a2, 1(a0) +; RV64-NEXT: lbu a1, 0(a1) +; RV64-NEXT: sb a1, 0(a0) +; RV64-NEXT: ret ; -; RV32-FAST-LABEL: t4: +; RV32-FAST-LABEL: unaligned_memcpy15: ; RV32-FAST: # %bb.0: # %entry -; RV32-FAST-NEXT: li a1, 32 -; RV32-FAST-NEXT: lui a2, 132388 -; RV32-FAST-NEXT: lui a3, 337154 -; RV32-FAST-NEXT: lui a4, 320757 -; RV32-FAST-NEXT: sh a1, 16(a0) -; RV32-FAST-NEXT: lui a1, 365861 -; RV32-FAST-NEXT: addi a2, a2, 1871 -; RV32-FAST-NEXT: addi a3, a3, 69 -; RV32-FAST-NEXT: addi a4, a4, 1107 -; RV32-FAST-NEXT: addi a1, a1, -1980 +; RV32-FAST-NEXT: lw a2, 11(a1) +; RV32-FAST-NEXT: sw a2, 11(a0) +; RV32-FAST-NEXT: lw a2, 8(a1) +; RV32-FAST-NEXT: sw a2, 8(a0) +; RV32-FAST-NEXT: lw a2, 4(a1) +; RV32-FAST-NEXT: sw a2, 4(a0) +; RV32-FAST-NEXT: lw a1, 0(a1) ; RV32-FAST-NEXT: sw a1, 0(a0) -; RV32-FAST-NEXT: sw a4, 4(a0) -; RV32-FAST-NEXT: sw a3, 8(a0) +; RV32-FAST-NEXT: ret +; +; RV64-FAST-LABEL: unaligned_memcpy15: +; RV64-FAST: # %bb.0: # %entry +; RV64-FAST-NEXT: ld a2, 7(a1) +; RV64-FAST-NEXT: sd a2, 7(a0) +; RV64-FAST-NEXT: ld a1, 0(a1) +; RV64-FAST-NEXT: sd a1, 0(a0) +; RV64-FAST-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 15, i1 false) + ret void +} + +define void @unaligned_memcpy16(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: unaligned_memcpy16: +; RV32: # %bb.0: # %entry +; RV32-NEXT: lbu a2, 15(a1) +; RV32-NEXT: sb a2, 15(a0) +; RV32-NEXT: lbu a2, 14(a1) +; RV32-NEXT: sb a2, 14(a0) +; RV32-NEXT: lbu a2, 13(a1) +; RV32-NEXT: sb a2, 13(a0) +; RV32-NEXT: lbu a2, 12(a1) +; RV32-NEXT: sb a2, 12(a0) +; RV32-NEXT: lbu a2, 11(a1) +; 
RV32-NEXT: sb a2, 11(a0) +; RV32-NEXT: lbu a2, 10(a1) +; RV32-NEXT: sb a2, 10(a0) +; RV32-NEXT: lbu a2, 9(a1) +; RV32-NEXT: sb a2, 9(a0) +; RV32-NEXT: lbu a2, 8(a1) +; RV32-NEXT: sb a2, 8(a0) +; RV32-NEXT: lbu a2, 7(a1) +; RV32-NEXT: sb a2, 7(a0) +; RV32-NEXT: lbu a2, 6(a1) +; RV32-NEXT: sb a2, 6(a0) +; RV32-NEXT: lbu a2, 5(a1) +; RV32-NEXT: sb a2, 5(a0) +; RV32-NEXT: lbu a2, 4(a1) +; RV32-NEXT: sb a2, 4(a0) +; RV32-NEXT: lbu a2, 3(a1) +; RV32-NEXT: sb a2, 3(a0) +; RV32-NEXT: lbu a2, 2(a1) +; RV32-NEXT: sb a2, 2(a0) +; RV32-NEXT: lbu a2, 1(a1) +; RV32-NEXT: sb a2, 1(a0) +; RV32-NEXT: lbu a1, 0(a1) +; RV32-NEXT: sb a1, 0(a0) +; RV32-NEXT: ret +; +; RV64-LABEL: unaligned_memcpy16: +; RV64: # %bb.0: # %entry +; RV64-NEXT: lbu a2, 15(a1) +; RV64-NEXT: sb a2, 15(a0) +; RV64-NEXT: lbu a2, 14(a1) +; RV64-NEXT: sb a2, 14(a0) +; RV64-NEXT: lbu a2, 13(a1) +; RV64-NEXT: sb a2, 13(a0) +; RV64-NEXT: lbu a2, 12(a1) +; RV64-NEXT: sb a2, 12(a0) +; RV64-NEXT: lbu a2, 11(a1) +; RV64-NEXT: sb a2, 11(a0) +; RV64-NEXT: lbu a2, 10(a1) +; RV64-NEXT: sb a2, 10(a0) +; RV64-NEXT: lbu a2, 9(a1) +; RV64-NEXT: sb a2, 9(a0) +; RV64-NEXT: lbu a2, 8(a1) +; RV64-NEXT: sb a2, 8(a0) +; RV64-NEXT: lbu a2, 7(a1) +; RV64-NEXT: sb a2, 7(a0) +; RV64-NEXT: lbu a2, 6(a1) +; RV64-NEXT: sb a2, 6(a0) +; RV64-NEXT: lbu a2, 5(a1) +; RV64-NEXT: sb a2, 5(a0) +; RV64-NEXT: lbu a2, 4(a1) +; RV64-NEXT: sb a2, 4(a0) +; RV64-NEXT: lbu a2, 3(a1) +; RV64-NEXT: sb a2, 3(a0) +; RV64-NEXT: lbu a2, 2(a1) +; RV64-NEXT: sb a2, 2(a0) +; RV64-NEXT: lbu a2, 1(a1) +; RV64-NEXT: sb a2, 1(a0) +; RV64-NEXT: lbu a1, 0(a1) +; RV64-NEXT: sb a1, 0(a0) +; RV64-NEXT: ret +; +; RV32-FAST-LABEL: unaligned_memcpy16: +; RV32-FAST: # %bb.0: # %entry +; RV32-FAST-NEXT: lw a2, 12(a1) ; RV32-FAST-NEXT: sw a2, 12(a0) +; RV32-FAST-NEXT: lw a2, 8(a1) +; RV32-FAST-NEXT: sw a2, 8(a0) +; RV32-FAST-NEXT: lw a2, 4(a1) +; RV32-FAST-NEXT: sw a2, 4(a0) +; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a1, 0(a0) ; RV32-FAST-NEXT: ret ; -; RV64-FAST-LABEL: t4: +; RV64-FAST-LABEL: unaligned_memcpy16: ; RV64-FAST: # %bb.0: # %entry -; RV64-FAST-NEXT: lui a1, %hi(.L.str4) -; RV64-FAST-NEXT: ld a2, %lo(.L.str4)(a1) -; RV64-FAST-NEXT: addi a1, a1, %lo(.L.str4) -; RV64-FAST-NEXT: ld a1, 8(a1) -; RV64-FAST-NEXT: li a3, 32 -; RV64-FAST-NEXT: sd a2, 0(a0) -; RV64-FAST-NEXT: sd a1, 8(a0) -; RV64-FAST-NEXT: sh a3, 16(a0) +; RV64-FAST-NEXT: ld a2, 8(a1) +; RV64-FAST-NEXT: sd a2, 8(a0) +; RV64-FAST-NEXT: ld a1, 0(a1) +; RV64-FAST-NEXT: sd a1, 0(a0) ; RV64-FAST-NEXT: ret entry: - tail call void @llvm.memcpy.p0.p0.i64(ptr %C, ptr @.str4, i64 18, i1 false) + tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 16, i1 false) ret void } -define void @t5(ptr nocapture %C) nounwind { -; RV32-LABEL: t5: +define void @unaligned_memcpy31(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: unaligned_memcpy31: ; RV32: # %bb.0: # %entry -; RV32-NEXT: li a1, 84 -; RV32-NEXT: li a2, 83 -; RV32-NEXT: li a3, 89 -; RV32-NEXT: li a4, 82 -; RV32-NEXT: li a5, 72 -; RV32-NEXT: li a6, 68 +; RV32-NEXT: lbu a2, 30(a1) +; RV32-NEXT: sb a2, 30(a0) +; RV32-NEXT: lbu a2, 29(a1) +; RV32-NEXT: sb a2, 29(a0) +; RV32-NEXT: lbu a2, 28(a1) +; RV32-NEXT: sb a2, 28(a0) +; RV32-NEXT: lbu a2, 27(a1) +; RV32-NEXT: sb a2, 27(a0) +; RV32-NEXT: lbu a2, 26(a1) +; RV32-NEXT: sb a2, 26(a0) +; RV32-NEXT: lbu a2, 25(a1) +; RV32-NEXT: sb a2, 25(a0) +; RV32-NEXT: lbu a2, 24(a1) +; RV32-NEXT: sb a2, 24(a0) +; RV32-NEXT: lbu a2, 23(a1) +; RV32-NEXT: sb a2, 23(a0) +; RV32-NEXT: lbu a2, 22(a1) +; RV32-NEXT: sb a2, 22(a0) +; 
RV32-NEXT: lbu a2, 21(a1) +; RV32-NEXT: sb a2, 21(a0) +; RV32-NEXT: lbu a2, 20(a1) +; RV32-NEXT: sb a2, 20(a0) +; RV32-NEXT: lbu a2, 19(a1) +; RV32-NEXT: sb a2, 19(a0) +; RV32-NEXT: lbu a2, 18(a1) +; RV32-NEXT: sb a2, 18(a0) +; RV32-NEXT: lbu a2, 17(a1) +; RV32-NEXT: sb a2, 17(a0) +; RV32-NEXT: lbu a2, 16(a1) +; RV32-NEXT: sb a2, 16(a0) +; RV32-NEXT: lbu a2, 15(a1) +; RV32-NEXT: sb a2, 15(a0) +; RV32-NEXT: lbu a2, 14(a1) +; RV32-NEXT: sb a2, 14(a0) +; RV32-NEXT: lbu a2, 13(a1) +; RV32-NEXT: sb a2, 13(a0) +; RV32-NEXT: lbu a2, 12(a1) +; RV32-NEXT: sb a2, 12(a0) +; RV32-NEXT: lbu a2, 11(a1) +; RV32-NEXT: sb a2, 11(a0) +; RV32-NEXT: lbu a2, 10(a1) +; RV32-NEXT: sb a2, 10(a0) +; RV32-NEXT: lbu a2, 9(a1) +; RV32-NEXT: sb a2, 9(a0) +; RV32-NEXT: lbu a2, 8(a1) +; RV32-NEXT: sb a2, 8(a0) +; RV32-NEXT: lbu a2, 7(a1) +; RV32-NEXT: sb a2, 7(a0) +; RV32-NEXT: lbu a2, 6(a1) +; RV32-NEXT: sb a2, 6(a0) +; RV32-NEXT: lbu a2, 5(a1) +; RV32-NEXT: sb a2, 5(a0) +; RV32-NEXT: lbu a2, 4(a1) ; RV32-NEXT: sb a2, 4(a0) -; RV32-NEXT: sb a1, 5(a0) -; RV32-NEXT: sb zero, 6(a0) -; RV32-NEXT: sb a6, 0(a0) -; RV32-NEXT: sb a5, 1(a0) -; RV32-NEXT: sb a4, 2(a0) -; RV32-NEXT: sb a3, 3(a0) +; RV32-NEXT: lbu a2, 3(a1) +; RV32-NEXT: sb a2, 3(a0) +; RV32-NEXT: lbu a2, 2(a1) +; RV32-NEXT: sb a2, 2(a0) +; RV32-NEXT: lbu a2, 1(a1) +; RV32-NEXT: sb a2, 1(a0) +; RV32-NEXT: lbu a1, 0(a1) +; RV32-NEXT: sb a1, 0(a0) ; RV32-NEXT: ret ; -; RV64-LABEL: t5: +; RV64-LABEL: unaligned_memcpy31: ; RV64: # %bb.0: # %entry -; RV64-NEXT: li a1, 84 -; RV64-NEXT: li a2, 83 -; RV64-NEXT: li a3, 89 -; RV64-NEXT: li a4, 82 -; RV64-NEXT: li a5, 72 -; RV64-NEXT: li a6, 68 +; RV64-NEXT: lbu a2, 30(a1) +; RV64-NEXT: sb a2, 30(a0) +; RV64-NEXT: lbu a2, 29(a1) +; RV64-NEXT: sb a2, 29(a0) +; RV64-NEXT: lbu a2, 28(a1) +; RV64-NEXT: sb a2, 28(a0) +; RV64-NEXT: lbu a2, 27(a1) +; RV64-NEXT: sb a2, 27(a0) +; RV64-NEXT: lbu a2, 26(a1) +; RV64-NEXT: sb a2, 26(a0) +; RV64-NEXT: lbu a2, 25(a1) +; RV64-NEXT: sb a2, 25(a0) +; RV64-NEXT: lbu a2, 24(a1) +; RV64-NEXT: sb a2, 24(a0) +; RV64-NEXT: lbu a2, 23(a1) +; RV64-NEXT: sb a2, 23(a0) +; RV64-NEXT: lbu a2, 22(a1) +; RV64-NEXT: sb a2, 22(a0) +; RV64-NEXT: lbu a2, 21(a1) +; RV64-NEXT: sb a2, 21(a0) +; RV64-NEXT: lbu a2, 20(a1) +; RV64-NEXT: sb a2, 20(a0) +; RV64-NEXT: lbu a2, 19(a1) +; RV64-NEXT: sb a2, 19(a0) +; RV64-NEXT: lbu a2, 18(a1) +; RV64-NEXT: sb a2, 18(a0) +; RV64-NEXT: lbu a2, 17(a1) +; RV64-NEXT: sb a2, 17(a0) +; RV64-NEXT: lbu a2, 16(a1) +; RV64-NEXT: sb a2, 16(a0) +; RV64-NEXT: lbu a2, 15(a1) +; RV64-NEXT: sb a2, 15(a0) +; RV64-NEXT: lbu a2, 14(a1) +; RV64-NEXT: sb a2, 14(a0) +; RV64-NEXT: lbu a2, 13(a1) +; RV64-NEXT: sb a2, 13(a0) +; RV64-NEXT: lbu a2, 12(a1) +; RV64-NEXT: sb a2, 12(a0) +; RV64-NEXT: lbu a2, 11(a1) +; RV64-NEXT: sb a2, 11(a0) +; RV64-NEXT: lbu a2, 10(a1) +; RV64-NEXT: sb a2, 10(a0) +; RV64-NEXT: lbu a2, 9(a1) +; RV64-NEXT: sb a2, 9(a0) +; RV64-NEXT: lbu a2, 8(a1) +; RV64-NEXT: sb a2, 8(a0) +; RV64-NEXT: lbu a2, 7(a1) +; RV64-NEXT: sb a2, 7(a0) +; RV64-NEXT: lbu a2, 6(a1) +; RV64-NEXT: sb a2, 6(a0) +; RV64-NEXT: lbu a2, 5(a1) +; RV64-NEXT: sb a2, 5(a0) +; RV64-NEXT: lbu a2, 4(a1) ; RV64-NEXT: sb a2, 4(a0) -; RV64-NEXT: sb a1, 5(a0) -; RV64-NEXT: sb zero, 6(a0) -; RV64-NEXT: sb a6, 0(a0) -; RV64-NEXT: sb a5, 1(a0) -; RV64-NEXT: sb a4, 2(a0) -; RV64-NEXT: sb a3, 3(a0) +; RV64-NEXT: lbu a2, 3(a1) +; RV64-NEXT: sb a2, 3(a0) +; RV64-NEXT: lbu a2, 2(a1) +; RV64-NEXT: sb a2, 2(a0) +; RV64-NEXT: lbu a2, 1(a1) +; RV64-NEXT: sb a2, 1(a0) +; RV64-NEXT: lbu a1, 0(a1) +; RV64-NEXT: sb a1, 0(a0) +; 
RV64-NEXT: ret +; +; RV32-FAST-LABEL: unaligned_memcpy31: +; RV32-FAST: # %bb.0: # %entry +; RV32-FAST-NEXT: lw a2, 27(a1) +; RV32-FAST-NEXT: sw a2, 27(a0) +; RV32-FAST-NEXT: lw a2, 24(a1) +; RV32-FAST-NEXT: sw a2, 24(a0) +; RV32-FAST-NEXT: lw a2, 20(a1) +; RV32-FAST-NEXT: sw a2, 20(a0) +; RV32-FAST-NEXT: lw a2, 16(a1) +; RV32-FAST-NEXT: sw a2, 16(a0) +; RV32-FAST-NEXT: lw a2, 12(a1) +; RV32-FAST-NEXT: sw a2, 12(a0) +; RV32-FAST-NEXT: lw a2, 8(a1) +; RV32-FAST-NEXT: sw a2, 8(a0) +; RV32-FAST-NEXT: lw a2, 4(a1) +; RV32-FAST-NEXT: sw a2, 4(a0) +; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a1, 0(a0) +; RV32-FAST-NEXT: ret +; +; RV64-FAST-LABEL: unaligned_memcpy31: +; RV64-FAST: # %bb.0: # %entry +; RV64-FAST-NEXT: ld a2, 23(a1) +; RV64-FAST-NEXT: sd a2, 23(a0) +; RV64-FAST-NEXT: ld a2, 16(a1) +; RV64-FAST-NEXT: sd a2, 16(a0) +; RV64-FAST-NEXT: ld a2, 8(a1) +; RV64-FAST-NEXT: sd a2, 8(a0) +; RV64-FAST-NEXT: ld a1, 0(a1) +; RV64-FAST-NEXT: sd a1, 0(a0) +; RV64-FAST-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 31, i1 false) + ret void +} + +; ---------------------------------------------------------------------- +; Fully aligned cases + +define void @aligned_memcpy0(ptr nocapture %dest, ptr %src) nounwind { +; RV32-BOTH-LABEL: aligned_memcpy0: +; RV32-BOTH: # %bb.0: # %entry +; RV32-BOTH-NEXT: ret +; +; RV64-BOTH-LABEL: aligned_memcpy0: +; RV64-BOTH: # %bb.0: # %entry +; RV64-BOTH-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 0, i1 false) + ret void +} + +define void @aligned_memcpy1(ptr nocapture %dest, ptr %src) nounwind { +; RV32-BOTH-LABEL: aligned_memcpy1: +; RV32-BOTH: # %bb.0: # %entry +; RV32-BOTH-NEXT: lbu a1, 0(a1) +; RV32-BOTH-NEXT: sb a1, 0(a0) +; RV32-BOTH-NEXT: ret +; +; RV64-BOTH-LABEL: aligned_memcpy1: +; RV64-BOTH: # %bb.0: # %entry +; RV64-BOTH-NEXT: lbu a1, 0(a1) +; RV64-BOTH-NEXT: sb a1, 0(a0) +; RV64-BOTH-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 1, i1 false) + ret void +} + +define void @aligned_memcpy2(ptr nocapture %dest, ptr %src) nounwind { +; RV32-BOTH-LABEL: aligned_memcpy2: +; RV32-BOTH: # %bb.0: # %entry +; RV32-BOTH-NEXT: lh a1, 0(a1) +; RV32-BOTH-NEXT: sh a1, 0(a0) +; RV32-BOTH-NEXT: ret +; +; RV64-BOTH-LABEL: aligned_memcpy2: +; RV64-BOTH: # %bb.0: # %entry +; RV64-BOTH-NEXT: lh a1, 0(a1) +; RV64-BOTH-NEXT: sh a1, 0(a0) +; RV64-BOTH-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 2, i1 false) + ret void +} + +define void @aligned_memcpy3(ptr nocapture %dest, ptr %src) nounwind { +; RV32-BOTH-LABEL: aligned_memcpy3: +; RV32-BOTH: # %bb.0: # %entry +; RV32-BOTH-NEXT: lbu a2, 2(a1) +; RV32-BOTH-NEXT: sb a2, 2(a0) +; RV32-BOTH-NEXT: lh a1, 0(a1) +; RV32-BOTH-NEXT: sh a1, 0(a0) +; RV32-BOTH-NEXT: ret +; +; RV64-BOTH-LABEL: aligned_memcpy3: +; RV64-BOTH: # %bb.0: # %entry +; RV64-BOTH-NEXT: lbu a2, 2(a1) +; RV64-BOTH-NEXT: sb a2, 2(a0) +; RV64-BOTH-NEXT: lh a1, 0(a1) +; RV64-BOTH-NEXT: sh a1, 0(a0) +; RV64-BOTH-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 3, i1 false) + ret void +} + +define void @aligned_memcpy4(ptr nocapture %dest, ptr %src) nounwind { +; RV32-BOTH-LABEL: aligned_memcpy4: +; RV32-BOTH: # %bb.0: # %entry +; RV32-BOTH-NEXT: lw a1, 0(a1) +; RV32-BOTH-NEXT: sw a1, 0(a0) +; RV32-BOTH-NEXT: ret +; +; RV64-BOTH-LABEL: aligned_memcpy4: +; RV64-BOTH: # %bb.0: # %entry +; RV64-BOTH-NEXT: lw a1, 0(a1) +; 
RV64-BOTH-NEXT: sw a1, 0(a0) +; RV64-BOTH-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 4, i1 false) + ret void +} + +define void @aligned_memcpy7(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: aligned_memcpy7: +; RV32: # %bb.0: # %entry +; RV32-NEXT: lbu a2, 6(a1) +; RV32-NEXT: sb a2, 6(a0) +; RV32-NEXT: lh a2, 4(a1) +; RV32-NEXT: sh a2, 4(a0) +; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: sw a1, 0(a0) +; RV32-NEXT: ret +; +; RV64-LABEL: aligned_memcpy7: +; RV64: # %bb.0: # %entry +; RV64-NEXT: lbu a2, 6(a1) +; RV64-NEXT: sb a2, 6(a0) +; RV64-NEXT: lh a2, 4(a1) +; RV64-NEXT: sh a2, 4(a0) +; RV64-NEXT: lw a1, 0(a1) +; RV64-NEXT: sw a1, 0(a0) ; RV64-NEXT: ret ; -; RV32-FAST-LABEL: t5: +; RV32-FAST-LABEL: aligned_memcpy7: ; RV32-FAST: # %bb.0: # %entry -; RV32-FAST-NEXT: lui a1, 1349 -; RV32-FAST-NEXT: addi a1, a1, 857 -; RV32-FAST-NEXT: sw a1, 3(a0) -; RV32-FAST-NEXT: lui a1, 365861 -; RV32-FAST-NEXT: addi a1, a1, -1980 +; RV32-FAST-NEXT: lw a2, 3(a1) +; RV32-FAST-NEXT: sw a2, 3(a0) +; RV32-FAST-NEXT: lw a1, 0(a1) ; RV32-FAST-NEXT: sw a1, 0(a0) ; RV32-FAST-NEXT: ret ; -; RV64-FAST-LABEL: t5: +; RV64-FAST-LABEL: aligned_memcpy7: ; RV64-FAST: # %bb.0: # %entry -; RV64-FAST-NEXT: lui a1, 1349 -; RV64-FAST-NEXT: addi a1, a1, 857 -; RV64-FAST-NEXT: sw a1, 3(a0) -; RV64-FAST-NEXT: lui a1, 365861 -; RV64-FAST-NEXT: addi a1, a1, -1980 +; RV64-FAST-NEXT: lw a2, 3(a1) +; RV64-FAST-NEXT: sw a2, 3(a0) +; RV64-FAST-NEXT: lw a1, 0(a1) ; RV64-FAST-NEXT: sw a1, 0(a0) ; RV64-FAST-NEXT: ret entry: - tail call void @llvm.memcpy.p0.p0.i64(ptr %C, ptr @.str5, i64 7, i1 false) + tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 7, i1 false) + ret void +} + +define void @aligned_memcpy8(ptr nocapture %dest, ptr %src) nounwind { +; RV32-BOTH-LABEL: aligned_memcpy8: +; RV32-BOTH: # %bb.0: # %entry +; RV32-BOTH-NEXT: lw a2, 4(a1) +; RV32-BOTH-NEXT: sw a2, 4(a0) +; RV32-BOTH-NEXT: lw a1, 0(a1) +; RV32-BOTH-NEXT: sw a1, 0(a0) +; RV32-BOTH-NEXT: ret +; +; RV64-BOTH-LABEL: aligned_memcpy8: +; RV64-BOTH: # %bb.0: # %entry +; RV64-BOTH-NEXT: ld a1, 0(a1) +; RV64-BOTH-NEXT: sd a1, 0(a0) +; RV64-BOTH-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 8, i1 false) + ret void +} + +define void @aligned_memcpy15(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: aligned_memcpy15: +; RV32: # %bb.0: # %entry +; RV32-NEXT: lbu a2, 14(a1) +; RV32-NEXT: sb a2, 14(a0) +; RV32-NEXT: lh a2, 12(a1) +; RV32-NEXT: sh a2, 12(a0) +; RV32-NEXT: lw a2, 8(a1) +; RV32-NEXT: sw a2, 8(a0) +; RV32-NEXT: lw a2, 4(a1) +; RV32-NEXT: sw a2, 4(a0) +; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: sw a1, 0(a0) +; RV32-NEXT: ret +; +; RV64-LABEL: aligned_memcpy15: +; RV64: # %bb.0: # %entry +; RV64-NEXT: lbu a2, 14(a1) +; RV64-NEXT: sb a2, 14(a0) +; RV64-NEXT: lh a2, 12(a1) +; RV64-NEXT: sh a2, 12(a0) +; RV64-NEXT: lw a2, 8(a1) +; RV64-NEXT: sw a2, 8(a0) +; RV64-NEXT: ld a1, 0(a1) +; RV64-NEXT: sd a1, 0(a0) +; RV64-NEXT: ret +; +; RV32-FAST-LABEL: aligned_memcpy15: +; RV32-FAST: # %bb.0: # %entry +; RV32-FAST-NEXT: lw a2, 11(a1) +; RV32-FAST-NEXT: sw a2, 11(a0) +; RV32-FAST-NEXT: lw a2, 8(a1) +; RV32-FAST-NEXT: sw a2, 8(a0) +; RV32-FAST-NEXT: lw a2, 4(a1) +; RV32-FAST-NEXT: sw a2, 4(a0) +; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a1, 0(a0) +; RV32-FAST-NEXT: ret +; +; RV64-FAST-LABEL: aligned_memcpy15: +; RV64-FAST: # %bb.0: # %entry +; RV64-FAST-NEXT: ld a2, 7(a1) +; RV64-FAST-NEXT: sd a2, 7(a0) +; RV64-FAST-NEXT: 
ld a1, 0(a1) +; RV64-FAST-NEXT: sd a1, 0(a0) +; RV64-FAST-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 15, i1 false) ret void } -define void @t6() nounwind { -; RV32-LABEL: t6: +define void @aligned_memcpy16(ptr nocapture %dest, ptr %src) nounwind { +; RV32-BOTH-LABEL: aligned_memcpy16: +; RV32-BOTH: # %bb.0: # %entry +; RV32-BOTH-NEXT: lw a2, 12(a1) +; RV32-BOTH-NEXT: sw a2, 12(a0) +; RV32-BOTH-NEXT: lw a2, 8(a1) +; RV32-BOTH-NEXT: sw a2, 8(a0) +; RV32-BOTH-NEXT: lw a2, 4(a1) +; RV32-BOTH-NEXT: sw a2, 4(a0) +; RV32-BOTH-NEXT: lw a1, 0(a1) +; RV32-BOTH-NEXT: sw a1, 0(a0) +; RV32-BOTH-NEXT: ret +; +; RV64-BOTH-LABEL: aligned_memcpy16: +; RV64-BOTH: # %bb.0: # %entry +; RV64-BOTH-NEXT: ld a2, 8(a1) +; RV64-BOTH-NEXT: sd a2, 8(a0) +; RV64-BOTH-NEXT: ld a1, 0(a1) +; RV64-BOTH-NEXT: sd a1, 0(a0) +; RV64-BOTH-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 16, i1 false) + ret void +} + +define void @aligned_memcpy31(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: aligned_memcpy31: ; RV32: # %bb.0: # %entry -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32-NEXT: lui a0, %hi(spool.splbuf) -; RV32-NEXT: addi a0, a0, %lo(spool.splbuf) -; RV32-NEXT: lui a1, %hi(.L.str6) -; RV32-NEXT: addi a1, a1, %lo(.L.str6) -; RV32-NEXT: li a2, 14 -; RV32-NEXT: call memcpy -; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: lbu a2, 30(a1) +; RV32-NEXT: sb a2, 30(a0) +; RV32-NEXT: lh a2, 28(a1) +; RV32-NEXT: sh a2, 28(a0) +; RV32-NEXT: lw a2, 24(a1) +; RV32-NEXT: sw a2, 24(a0) +; RV32-NEXT: lw a2, 20(a1) +; RV32-NEXT: sw a2, 20(a0) +; RV32-NEXT: lw a2, 16(a1) +; RV32-NEXT: sw a2, 16(a0) +; RV32-NEXT: lw a2, 12(a1) +; RV32-NEXT: sw a2, 12(a0) +; RV32-NEXT: lw a2, 8(a1) +; RV32-NEXT: sw a2, 8(a0) +; RV32-NEXT: lw a2, 4(a1) +; RV32-NEXT: sw a2, 4(a0) +; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: sw a1, 0(a0) ; RV32-NEXT: ret ; -; RV64-LABEL: t6: +; RV64-LABEL: aligned_memcpy31: ; RV64: # %bb.0: # %entry -; RV64-NEXT: addi sp, sp, -16 -; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64-NEXT: lui a0, %hi(spool.splbuf) -; RV64-NEXT: addi a0, a0, %lo(spool.splbuf) -; RV64-NEXT: lui a1, %hi(.L.str6) -; RV64-NEXT: addi a1, a1, %lo(.L.str6) -; RV64-NEXT: li a2, 14 -; RV64-NEXT: call memcpy -; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: lbu a2, 30(a1) +; RV64-NEXT: sb a2, 30(a0) +; RV64-NEXT: lh a2, 28(a1) +; RV64-NEXT: sh a2, 28(a0) +; RV64-NEXT: lw a2, 24(a1) +; RV64-NEXT: sw a2, 24(a0) +; RV64-NEXT: ld a2, 16(a1) +; RV64-NEXT: sd a2, 16(a0) +; RV64-NEXT: ld a2, 8(a1) +; RV64-NEXT: sd a2, 8(a0) +; RV64-NEXT: ld a1, 0(a1) +; RV64-NEXT: sd a1, 0(a0) ; RV64-NEXT: ret ; -; RV32-FAST-LABEL: t6: +; RV32-FAST-LABEL: aligned_memcpy31: ; RV32-FAST: # %bb.0: # %entry -; RV32-FAST-NEXT: lui a0, %hi(spool.splbuf) -; RV32-FAST-NEXT: li a1, 88 -; RV32-FAST-NEXT: sh a1, %lo(spool.splbuf+12)(a0) -; RV32-FAST-NEXT: lui a1, 361862 -; RV32-FAST-NEXT: addi a1, a1, -1960 -; RV32-FAST-NEXT: sw a1, %lo(spool.splbuf+8)(a0) -; RV32-FAST-NEXT: lui a1, 362199 -; RV32-FAST-NEXT: addi a1, a1, 559 -; RV32-FAST-NEXT: sw a1, %lo(spool.splbuf+4)(a0) -; RV32-FAST-NEXT: lui a1, 460503 -; RV32-FAST-NEXT: addi a1, a1, 1071 -; RV32-FAST-NEXT: sw a1, %lo(spool.splbuf)(a0) +; RV32-FAST-NEXT: lw a2, 27(a1) +; RV32-FAST-NEXT: sw a2, 27(a0) +; RV32-FAST-NEXT: lw a2, 24(a1) +; RV32-FAST-NEXT: sw a2, 24(a0) +; RV32-FAST-NEXT: lw 
a2, 20(a1) +; RV32-FAST-NEXT: sw a2, 20(a0) +; RV32-FAST-NEXT: lw a2, 16(a1) +; RV32-FAST-NEXT: sw a2, 16(a0) +; RV32-FAST-NEXT: lw a2, 12(a1) +; RV32-FAST-NEXT: sw a2, 12(a0) +; RV32-FAST-NEXT: lw a2, 8(a1) +; RV32-FAST-NEXT: sw a2, 8(a0) +; RV32-FAST-NEXT: lw a2, 4(a1) +; RV32-FAST-NEXT: sw a2, 4(a0) +; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a1, 0(a0) ; RV32-FAST-NEXT: ret ; -; RV64-FAST-LABEL: t6: +; RV64-FAST-LABEL: aligned_memcpy31: ; RV64-FAST: # %bb.0: # %entry -; RV64-FAST-NEXT: lui a0, %hi(.L.str6) -; RV64-FAST-NEXT: ld a1, %lo(.L.str6)(a0) -; RV64-FAST-NEXT: addi a0, a0, %lo(.L.str6) -; RV64-FAST-NEXT: ld a0, 6(a0) -; RV64-FAST-NEXT: lui a2, %hi(spool.splbuf) -; RV64-FAST-NEXT: sd a1, %lo(spool.splbuf)(a2) -; RV64-FAST-NEXT: sd a0, %lo(spool.splbuf+6)(a2) +; RV64-FAST-NEXT: ld a2, 23(a1) +; RV64-FAST-NEXT: sd a2, 23(a0) +; RV64-FAST-NEXT: ld a2, 16(a1) +; RV64-FAST-NEXT: sd a2, 16(a0) +; RV64-FAST-NEXT: ld a2, 8(a1) +; RV64-FAST-NEXT: sd a2, 8(a0) +; RV64-FAST-NEXT: ld a1, 0(a1) +; RV64-FAST-NEXT: sd a1, 0(a0) ; RV64-FAST-NEXT: ret entry: - call void @llvm.memcpy.p0.p0.i64(ptr @spool.splbuf, ptr @.str6, i64 14, i1 false) + tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 31, i1 false) ret void } -%struct.Foo = type { i32, i32, i32, i32 } +; ------------------------------------------------------------------------ +; A few partially aligned cases -define void @t7(ptr nocapture %a, ptr nocapture %b) nounwind { -; RV32-BOTH-LABEL: t7: + +define void @memcpy16_align4(ptr nocapture %dest, ptr nocapture %src) nounwind { +; RV32-BOTH-LABEL: memcpy16_align4: ; RV32-BOTH: # %bb.0: # %entry ; RV32-BOTH-NEXT: lw a2, 12(a1) ; RV32-BOTH-NEXT: sw a2, 12(a0) @@ -418,7 +947,7 @@ define void @t7(ptr nocapture %a, ptr nocapture %b) nounwind { ; RV32-BOTH-NEXT: sw a1, 0(a0) ; RV32-BOTH-NEXT: ret ; -; RV64-LABEL: t7: +; RV64-LABEL: memcpy16_align4: ; RV64: # %bb.0: # %entry ; RV64-NEXT: lw a2, 12(a1) ; RV64-NEXT: sw a2, 12(a0) @@ -430,7 +959,7 @@ define void @t7(ptr nocapture %a, ptr nocapture %b) nounwind { ; RV64-NEXT: sw a1, 0(a0) ; RV64-NEXT: ret ; -; RV64-FAST-LABEL: t7: +; RV64-FAST-LABEL: memcpy16_align4: ; RV64-FAST: # %bb.0: # %entry ; RV64-FAST-NEXT: ld a2, 8(a1) ; RV64-FAST-NEXT: sd a2, 8(a0) @@ -438,11 +967,58 @@ define void @t7(ptr nocapture %a, ptr nocapture %b) nounwind { ; RV64-FAST-NEXT: sd a1, 0(a0) ; RV64-FAST-NEXT: ret entry: - tail call void @llvm.memcpy.p0.p0.i32(ptr align 4 %a, ptr align 4 %b, i32 16, i1 false) + tail call void @llvm.memcpy.p0.p0.i32(ptr align 4 %dest, ptr align 4 %src, i32 16, i1 false) ret void } +define i32 @memcpy11_align8(ptr nocapture %dest, ptr %src) { +; RV32-LABEL: memcpy11_align8: +; RV32: # %bb.0: # %entry +; RV32-NEXT: lbu a2, 10(a1) +; RV32-NEXT: sb a2, 10(a0) +; RV32-NEXT: lh a2, 8(a1) +; RV32-NEXT: sh a2, 8(a0) +; RV32-NEXT: lw a2, 4(a1) +; RV32-NEXT: sw a2, 4(a0) +; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: sw a1, 0(a0) +; RV32-NEXT: li a0, 0 +; RV32-NEXT: ret +; +; RV64-LABEL: memcpy11_align8: +; RV64: # %bb.0: # %entry +; RV64-NEXT: lbu a2, 10(a1) +; RV64-NEXT: sb a2, 10(a0) +; RV64-NEXT: lh a2, 8(a1) +; RV64-NEXT: sh a2, 8(a0) +; RV64-NEXT: ld a1, 0(a1) +; RV64-NEXT: sd a1, 0(a0) +; RV64-NEXT: li a0, 0 +; RV64-NEXT: ret +; +; RV32-FAST-LABEL: memcpy11_align8: +; RV32-FAST: # %bb.0: # %entry +; RV32-FAST-NEXT: lw a2, 7(a1) +; RV32-FAST-NEXT: sw a2, 7(a0) +; RV32-FAST-NEXT: lw a2, 4(a1) +; RV32-FAST-NEXT: sw a2, 4(a0) +; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a1, 0(a0) +; RV32-FAST-NEXT: li a0, 
0 +; RV32-FAST-NEXT: ret +; +; RV64-FAST-LABEL: memcpy11_align8: +; RV64-FAST: # %bb.0: # %entry +; RV64-FAST-NEXT: lw a2, 7(a1) +; RV64-FAST-NEXT: sw a2, 7(a0) +; RV64-FAST-NEXT: ld a1, 0(a1) +; RV64-FAST-NEXT: sd a1, 0(a0) +; RV64-FAST-NEXT: li a0, 0 +; RV64-FAST-NEXT: ret +entry: + call void @llvm.memcpy.p0.p0.i32(ptr align 8 %dest, ptr align 8 %src, i32 11, i1 false) + ret i32 0 +} + declare void @llvm.memcpy.p0.p0.i32(ptr nocapture, ptr nocapture, i32, i1) nounwind declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture, i64, i1) nounwind -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; RV64-BOTH: {{.*}} From db99d5b6de236367a93338bb7bf82d926d8bdc24 Mon Sep 17 00:00:00 2001 From: Pengcheng Wang Date: Mon, 13 Jan 2025 11:36:37 +0800 Subject: [PATCH 005/102] Revert "[RISCV] Rework memcpy test" (#122662) Reverts llvm/llvm-project#120364 The test should be updated due to some recent changes. --- llvm/test/CodeGen/RISCV/memcpy.ll | 1180 ++++++++--------------------- 1 file changed, 302 insertions(+), 878 deletions(-) diff --git a/llvm/test/CodeGen/RISCV/memcpy.ll b/llvm/test/CodeGen/RISCV/memcpy.ll index ce47476de9ce8..1ab3722080f70 100644 --- a/llvm/test/CodeGen/RISCV/memcpy.ll +++ b/llvm/test/CodeGen/RISCV/memcpy.ll @@ -7,935 +7,406 @@ ; RUN: | FileCheck %s --check-prefixes=RV32-BOTH,RV32-FAST ; RUN: llc < %s -mtriple=riscv64 -mattr=+unaligned-scalar-mem \ ; RUN: | FileCheck %s --check-prefixes=RV64-BOTH,RV64-FAST +%struct.x = type { i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 } -; ---------------------------------------------------------------------- -; Fully unaligned cases +@src = external dso_local global %struct.x +@dst = external dso_local global %struct.x -define void @unaligned_memcpy0(ptr nocapture %dest, ptr %src) nounwind { -; RV32-BOTH-LABEL: unaligned_memcpy0: -; RV32-BOTH: # %bb.0: # %entry -; RV32-BOTH-NEXT: ret -; -; RV64-BOTH-LABEL: unaligned_memcpy0: -; RV64-BOTH: # %bb.0: # %entry -; RV64-BOTH-NEXT: ret -entry: - tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 0, i1 false) - ret void -} - -define void @unaligned_memcpy1(ptr nocapture %dest, ptr %src) nounwind { -; RV32-BOTH-LABEL: unaligned_memcpy1: -; RV32-BOTH: # %bb.0: # %entry -; RV32-BOTH-NEXT: lbu a1, 0(a1) -; RV32-BOTH-NEXT: sb a1, 0(a0) -; RV32-BOTH-NEXT: ret -; -; RV64-BOTH-LABEL: unaligned_memcpy1: -; RV64-BOTH: # %bb.0: # %entry -; RV64-BOTH-NEXT: lbu a1, 0(a1) -; RV64-BOTH-NEXT: sb a1, 0(a0) -; RV64-BOTH-NEXT: ret -entry: - tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 1, i1 false) - ret void -} - -define void @unaligned_memcpy2(ptr nocapture %dest, ptr %src) nounwind { -; RV32-LABEL: unaligned_memcpy2: -; RV32: # %bb.0: # %entry -; RV32-NEXT: lbu a2, 1(a1) -; RV32-NEXT: sb a2, 1(a0) -; RV32-NEXT: lbu a1, 0(a1) -; RV32-NEXT: sb a1, 0(a0) -; RV32-NEXT: ret -; -; RV64-LABEL: unaligned_memcpy2: -; RV64: # %bb.0: # %entry -; RV64-NEXT: lbu a2, 1(a1) -; RV64-NEXT: sb a2, 1(a0) -; RV64-NEXT: lbu a1, 0(a1) -; RV64-NEXT: sb a1, 0(a0) -; RV64-NEXT: ret -; -; RV32-FAST-LABEL: unaligned_memcpy2: -; RV32-FAST: # %bb.0: # %entry -; RV32-FAST-NEXT: lh a1, 0(a1) -; RV32-FAST-NEXT: sh a1, 0(a0) -; RV32-FAST-NEXT: ret -; -; RV64-FAST-LABEL: unaligned_memcpy2: -; RV64-FAST: # %bb.0: # %entry -; RV64-FAST-NEXT: lh a1, 0(a1) -; RV64-FAST-NEXT: sh a1, 0(a0) -; RV64-FAST-NEXT: ret -entry: - tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 2, i1 false) - ret void -} - -define void @unaligned_memcpy3(ptr nocapture %dest, 
ptr %src) nounwind { -; RV32-LABEL: unaligned_memcpy3: -; RV32: # %bb.0: # %entry -; RV32-NEXT: lbu a2, 2(a1) -; RV32-NEXT: sb a2, 2(a0) -; RV32-NEXT: lbu a2, 1(a1) -; RV32-NEXT: sb a2, 1(a0) -; RV32-NEXT: lbu a1, 0(a1) -; RV32-NEXT: sb a1, 0(a0) -; RV32-NEXT: ret -; -; RV64-LABEL: unaligned_memcpy3: -; RV64: # %bb.0: # %entry -; RV64-NEXT: lbu a2, 2(a1) -; RV64-NEXT: sb a2, 2(a0) -; RV64-NEXT: lbu a2, 1(a1) -; RV64-NEXT: sb a2, 1(a0) -; RV64-NEXT: lbu a1, 0(a1) -; RV64-NEXT: sb a1, 0(a0) -; RV64-NEXT: ret -; -; RV32-FAST-LABEL: unaligned_memcpy3: -; RV32-FAST: # %bb.0: # %entry -; RV32-FAST-NEXT: lbu a2, 2(a1) -; RV32-FAST-NEXT: sb a2, 2(a0) -; RV32-FAST-NEXT: lh a1, 0(a1) -; RV32-FAST-NEXT: sh a1, 0(a0) -; RV32-FAST-NEXT: ret -; -; RV64-FAST-LABEL: unaligned_memcpy3: -; RV64-FAST: # %bb.0: # %entry -; RV64-FAST-NEXT: lbu a2, 2(a1) -; RV64-FAST-NEXT: sb a2, 2(a0) -; RV64-FAST-NEXT: lh a1, 0(a1) -; RV64-FAST-NEXT: sh a1, 0(a0) -; RV64-FAST-NEXT: ret -entry: - tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 3, i1 false) - ret void -} - -define void @unaligned_memcpy4(ptr nocapture %dest, ptr %src) nounwind { -; RV32-LABEL: unaligned_memcpy4: -; RV32: # %bb.0: # %entry -; RV32-NEXT: lbu a2, 3(a1) -; RV32-NEXT: sb a2, 3(a0) -; RV32-NEXT: lbu a2, 2(a1) -; RV32-NEXT: sb a2, 2(a0) -; RV32-NEXT: lbu a2, 1(a1) -; RV32-NEXT: sb a2, 1(a0) -; RV32-NEXT: lbu a1, 0(a1) -; RV32-NEXT: sb a1, 0(a0) -; RV32-NEXT: ret -; -; RV64-LABEL: unaligned_memcpy4: -; RV64: # %bb.0: # %entry -; RV64-NEXT: lbu a2, 3(a1) -; RV64-NEXT: sb a2, 3(a0) -; RV64-NEXT: lbu a2, 2(a1) -; RV64-NEXT: sb a2, 2(a0) -; RV64-NEXT: lbu a2, 1(a1) -; RV64-NEXT: sb a2, 1(a0) -; RV64-NEXT: lbu a1, 0(a1) -; RV64-NEXT: sb a1, 0(a0) -; RV64-NEXT: ret -; -; RV32-FAST-LABEL: unaligned_memcpy4: -; RV32-FAST: # %bb.0: # %entry -; RV32-FAST-NEXT: lw a1, 0(a1) -; RV32-FAST-NEXT: sw a1, 0(a0) -; RV32-FAST-NEXT: ret -; -; RV64-FAST-LABEL: unaligned_memcpy4: -; RV64-FAST: # %bb.0: # %entry -; RV64-FAST-NEXT: lw a1, 0(a1) -; RV64-FAST-NEXT: sw a1, 0(a0) -; RV64-FAST-NEXT: ret -entry: - tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 4, i1 false) - ret void -} +@.str1 = private unnamed_addr constant [31 x i8] c"DHRYSTONE PROGRAM, SOME STRING\00", align 1 +@.str2 = private unnamed_addr constant [36 x i8] c"DHRYSTONE PROGRAM, SOME STRING BLAH\00", align 1 +@.str3 = private unnamed_addr constant [24 x i8] c"DHRYSTONE PROGRAM, SOME\00", align 1 +@.str4 = private unnamed_addr constant [18 x i8] c"DHRYSTONE PROGR \00", align 1 +@.str5 = private unnamed_addr constant [7 x i8] c"DHRYST\00", align 1 +@.str6 = private unnamed_addr constant [14 x i8] c"/tmp/rmXXXXXX\00", align 1 +@spool.splbuf = internal global [512 x i8] zeroinitializer, align 16 -define void @unaligned_memcpy7(ptr nocapture %dest, ptr %src) nounwind { -; RV32-LABEL: unaligned_memcpy7: +define i32 @t0() { +; RV32-LABEL: t0: ; RV32: # %bb.0: # %entry -; RV32-NEXT: lbu a2, 6(a1) -; RV32-NEXT: sb a2, 6(a0) -; RV32-NEXT: lbu a2, 5(a1) -; RV32-NEXT: sb a2, 5(a0) -; RV32-NEXT: lbu a2, 4(a1) -; RV32-NEXT: sb a2, 4(a0) -; RV32-NEXT: lbu a2, 3(a1) -; RV32-NEXT: sb a2, 3(a0) -; RV32-NEXT: lbu a2, 2(a1) -; RV32-NEXT: sb a2, 2(a0) -; RV32-NEXT: lbu a2, 1(a1) -; RV32-NEXT: sb a2, 1(a0) -; RV32-NEXT: lbu a1, 0(a1) -; RV32-NEXT: sb a1, 0(a0) -; RV32-NEXT: ret -; -; RV64-LABEL: unaligned_memcpy7: -; RV64: # %bb.0: # %entry -; RV64-NEXT: lbu a2, 6(a1) -; RV64-NEXT: sb a2, 6(a0) -; RV64-NEXT: lbu a2, 5(a1) -; RV64-NEXT: sb a2, 5(a0) -; RV64-NEXT: lbu a2, 4(a1) -; RV64-NEXT: sb a2, 
4(a0) -; RV64-NEXT: lbu a2, 3(a1) -; RV64-NEXT: sb a2, 3(a0) -; RV64-NEXT: lbu a2, 2(a1) -; RV64-NEXT: sb a2, 2(a0) -; RV64-NEXT: lbu a2, 1(a1) -; RV64-NEXT: sb a2, 1(a0) -; RV64-NEXT: lbu a1, 0(a1) -; RV64-NEXT: sb a1, 0(a0) -; RV64-NEXT: ret -; -; RV32-FAST-LABEL: unaligned_memcpy7: -; RV32-FAST: # %bb.0: # %entry -; RV32-FAST-NEXT: lw a2, 3(a1) -; RV32-FAST-NEXT: sw a2, 3(a0) -; RV32-FAST-NEXT: lw a1, 0(a1) -; RV32-FAST-NEXT: sw a1, 0(a0) -; RV32-FAST-NEXT: ret -; -; RV64-FAST-LABEL: unaligned_memcpy7: -; RV64-FAST: # %bb.0: # %entry -; RV64-FAST-NEXT: lw a2, 3(a1) -; RV64-FAST-NEXT: sw a2, 3(a0) -; RV64-FAST-NEXT: lw a1, 0(a1) -; RV64-FAST-NEXT: sw a1, 0(a0) -; RV64-FAST-NEXT: ret -entry: - tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 7, i1 false) - ret void -} - -define void @unaligned_memcpy8(ptr nocapture %dest, ptr %src) nounwind { -; RV32-LABEL: unaligned_memcpy8: -; RV32: # %bb.0: # %entry -; RV32-NEXT: lbu a2, 7(a1) -; RV32-NEXT: sb a2, 7(a0) -; RV32-NEXT: lbu a2, 6(a1) -; RV32-NEXT: sb a2, 6(a0) -; RV32-NEXT: lbu a2, 5(a1) -; RV32-NEXT: sb a2, 5(a0) -; RV32-NEXT: lbu a2, 4(a1) -; RV32-NEXT: sb a2, 4(a0) -; RV32-NEXT: lbu a2, 3(a1) -; RV32-NEXT: sb a2, 3(a0) -; RV32-NEXT: lbu a2, 2(a1) -; RV32-NEXT: sb a2, 2(a0) -; RV32-NEXT: lbu a2, 1(a1) -; RV32-NEXT: sb a2, 1(a0) -; RV32-NEXT: lbu a1, 0(a1) -; RV32-NEXT: sb a1, 0(a0) +; RV32-NEXT: lui a0, %hi(src) +; RV32-NEXT: lw a1, %lo(src)(a0) +; RV32-NEXT: lui a2, %hi(dst) +; RV32-NEXT: addi a0, a0, %lo(src) +; RV32-NEXT: sw a1, %lo(dst)(a2) +; RV32-NEXT: lw a1, 4(a0) +; RV32-NEXT: lh a3, 8(a0) +; RV32-NEXT: lbu a0, 10(a0) +; RV32-NEXT: addi a2, a2, %lo(dst) +; RV32-NEXT: sw a1, 4(a2) +; RV32-NEXT: sh a3, 8(a2) +; RV32-NEXT: sb a0, 10(a2) +; RV32-NEXT: li a0, 0 ; RV32-NEXT: ret ; -; RV64-LABEL: unaligned_memcpy8: +; RV64-LABEL: t0: ; RV64: # %bb.0: # %entry -; RV64-NEXT: lbu a2, 7(a1) -; RV64-NEXT: sb a2, 7(a0) -; RV64-NEXT: lbu a2, 6(a1) -; RV64-NEXT: sb a2, 6(a0) -; RV64-NEXT: lbu a2, 5(a1) -; RV64-NEXT: sb a2, 5(a0) -; RV64-NEXT: lbu a2, 4(a1) -; RV64-NEXT: sb a2, 4(a0) -; RV64-NEXT: lbu a2, 3(a1) -; RV64-NEXT: sb a2, 3(a0) -; RV64-NEXT: lbu a2, 2(a1) -; RV64-NEXT: sb a2, 2(a0) -; RV64-NEXT: lbu a2, 1(a1) -; RV64-NEXT: sb a2, 1(a0) -; RV64-NEXT: lbu a1, 0(a1) -; RV64-NEXT: sb a1, 0(a0) +; RV64-NEXT: lui a0, %hi(src) +; RV64-NEXT: lui a1, %hi(dst) +; RV64-NEXT: ld a2, %lo(src)(a0) +; RV64-NEXT: addi a0, a0, %lo(src) +; RV64-NEXT: lh a3, 8(a0) +; RV64-NEXT: lbu a0, 10(a0) +; RV64-NEXT: sd a2, %lo(dst)(a1) +; RV64-NEXT: addi a1, a1, %lo(dst) +; RV64-NEXT: sh a3, 8(a1) +; RV64-NEXT: sb a0, 10(a1) +; RV64-NEXT: li a0, 0 ; RV64-NEXT: ret ; -; RV32-FAST-LABEL: unaligned_memcpy8: +; RV32-FAST-LABEL: t0: ; RV32-FAST: # %bb.0: # %entry -; RV32-FAST-NEXT: lw a2, 4(a1) -; RV32-FAST-NEXT: sw a2, 4(a0) -; RV32-FAST-NEXT: lw a1, 0(a1) -; RV32-FAST-NEXT: sw a1, 0(a0) +; RV32-FAST-NEXT: lui a0, %hi(src) +; RV32-FAST-NEXT: lw a1, %lo(src)(a0) +; RV32-FAST-NEXT: addi a0, a0, %lo(src) +; RV32-FAST-NEXT: lw a2, 4(a0) +; RV32-FAST-NEXT: lw a0, 7(a0) +; RV32-FAST-NEXT: lui a3, %hi(dst) +; RV32-FAST-NEXT: sw a1, %lo(dst)(a3) +; RV32-FAST-NEXT: addi a1, a3, %lo(dst) +; RV32-FAST-NEXT: sw a0, 7(a1) +; RV32-FAST-NEXT: sw a2, 4(a1) +; RV32-FAST-NEXT: li a0, 0 ; RV32-FAST-NEXT: ret ; -; RV64-FAST-LABEL: unaligned_memcpy8: +; RV64-FAST-LABEL: t0: ; RV64-FAST: # %bb.0: # %entry -; RV64-FAST-NEXT: ld a1, 0(a1) -; RV64-FAST-NEXT: sd a1, 0(a0) +; RV64-FAST-NEXT: lui a0, %hi(src) +; RV64-FAST-NEXT: ld a1, %lo(src)(a0) +; RV64-FAST-NEXT: addi a0, a0, 
%lo(src) +; RV64-FAST-NEXT: lw a0, 7(a0) +; RV64-FAST-NEXT: lui a2, %hi(dst) +; RV64-FAST-NEXT: sd a1, %lo(dst)(a2) +; RV64-FAST-NEXT: addi a1, a2, %lo(dst) +; RV64-FAST-NEXT: sw a0, 7(a1) +; RV64-FAST-NEXT: li a0, 0 ; RV64-FAST-NEXT: ret entry: - tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 8, i1 false) - ret void + call void @llvm.memcpy.p0.p0.i32(ptr align 8 @dst, ptr align 8 @src, i32 11, i1 false) + ret i32 0 } -define void @unaligned_memcpy15(ptr nocapture %dest, ptr %src) nounwind { -; RV32-LABEL: unaligned_memcpy15: +define void @t1(ptr nocapture %C) nounwind { +; RV32-LABEL: t1: ; RV32: # %bb.0: # %entry -; RV32-NEXT: lbu a2, 14(a1) -; RV32-NEXT: sb a2, 14(a0) -; RV32-NEXT: lbu a2, 13(a1) -; RV32-NEXT: sb a2, 13(a0) -; RV32-NEXT: lbu a2, 12(a1) -; RV32-NEXT: sb a2, 12(a0) -; RV32-NEXT: lbu a2, 11(a1) -; RV32-NEXT: sb a2, 11(a0) -; RV32-NEXT: lbu a2, 10(a1) -; RV32-NEXT: sb a2, 10(a0) -; RV32-NEXT: lbu a2, 9(a1) -; RV32-NEXT: sb a2, 9(a0) -; RV32-NEXT: lbu a2, 8(a1) -; RV32-NEXT: sb a2, 8(a0) -; RV32-NEXT: lbu a2, 7(a1) -; RV32-NEXT: sb a2, 7(a0) -; RV32-NEXT: lbu a2, 6(a1) -; RV32-NEXT: sb a2, 6(a0) -; RV32-NEXT: lbu a2, 5(a1) -; RV32-NEXT: sb a2, 5(a0) -; RV32-NEXT: lbu a2, 4(a1) -; RV32-NEXT: sb a2, 4(a0) -; RV32-NEXT: lbu a2, 3(a1) -; RV32-NEXT: sb a2, 3(a0) -; RV32-NEXT: lbu a2, 2(a1) -; RV32-NEXT: sb a2, 2(a0) -; RV32-NEXT: lbu a2, 1(a1) -; RV32-NEXT: sb a2, 1(a0) -; RV32-NEXT: lbu a1, 0(a1) -; RV32-NEXT: sb a1, 0(a0) -; RV32-NEXT: ret +; RV32-NEXT: lui a1, %hi(.L.str1) +; RV32-NEXT: addi a1, a1, %lo(.L.str1) +; RV32-NEXT: li a2, 31 +; RV32-NEXT: tail memcpy ; -; RV64-LABEL: unaligned_memcpy15: +; RV64-LABEL: t1: ; RV64: # %bb.0: # %entry -; RV64-NEXT: lbu a2, 14(a1) -; RV64-NEXT: sb a2, 14(a0) -; RV64-NEXT: lbu a2, 13(a1) -; RV64-NEXT: sb a2, 13(a0) -; RV64-NEXT: lbu a2, 12(a1) -; RV64-NEXT: sb a2, 12(a0) -; RV64-NEXT: lbu a2, 11(a1) -; RV64-NEXT: sb a2, 11(a0) -; RV64-NEXT: lbu a2, 10(a1) -; RV64-NEXT: sb a2, 10(a0) -; RV64-NEXT: lbu a2, 9(a1) -; RV64-NEXT: sb a2, 9(a0) -; RV64-NEXT: lbu a2, 8(a1) -; RV64-NEXT: sb a2, 8(a0) -; RV64-NEXT: lbu a2, 7(a1) -; RV64-NEXT: sb a2, 7(a0) -; RV64-NEXT: lbu a2, 6(a1) -; RV64-NEXT: sb a2, 6(a0) -; RV64-NEXT: lbu a2, 5(a1) -; RV64-NEXT: sb a2, 5(a0) -; RV64-NEXT: lbu a2, 4(a1) -; RV64-NEXT: sb a2, 4(a0) -; RV64-NEXT: lbu a2, 3(a1) -; RV64-NEXT: sb a2, 3(a0) -; RV64-NEXT: lbu a2, 2(a1) -; RV64-NEXT: sb a2, 2(a0) -; RV64-NEXT: lbu a2, 1(a1) -; RV64-NEXT: sb a2, 1(a0) -; RV64-NEXT: lbu a1, 0(a1) -; RV64-NEXT: sb a1, 0(a0) -; RV64-NEXT: ret +; RV64-NEXT: lui a1, %hi(.L.str1) +; RV64-NEXT: addi a1, a1, %lo(.L.str1) +; RV64-NEXT: li a2, 31 +; RV64-NEXT: tail memcpy ; -; RV32-FAST-LABEL: unaligned_memcpy15: +; RV32-FAST-LABEL: t1: ; RV32-FAST: # %bb.0: # %entry -; RV32-FAST-NEXT: lw a2, 11(a1) -; RV32-FAST-NEXT: sw a2, 11(a0) -; RV32-FAST-NEXT: lw a2, 8(a1) -; RV32-FAST-NEXT: sw a2, 8(a0) -; RV32-FAST-NEXT: lw a2, 4(a1) -; RV32-FAST-NEXT: sw a2, 4(a0) -; RV32-FAST-NEXT: lw a1, 0(a1) -; RV32-FAST-NEXT: sw a1, 0(a0) +; RV32-FAST-NEXT: lui a1, 1141 +; RV32-FAST-NEXT: lui a2, 300325 +; RV32-FAST-NEXT: lui a3, 132181 +; RV32-FAST-NEXT: lui a4, 340483 +; RV32-FAST-NEXT: lui a5, 267556 +; RV32-FAST-NEXT: lui a6, 337154 +; RV32-FAST-NEXT: addi a1, a1, -439 +; RV32-FAST-NEXT: sw a1, 27(a0) +; RV32-FAST-NEXT: lui a1, 320757 +; RV32-FAST-NEXT: addi a2, a2, 1107 +; RV32-FAST-NEXT: addi a3, a3, -689 +; RV32-FAST-NEXT: addi a4, a4, -947 +; RV32-FAST-NEXT: sw a4, 16(a0) +; RV32-FAST-NEXT: sw a3, 20(a0) +; RV32-FAST-NEXT: sw a2, 24(a0) +; 
RV32-FAST-NEXT: lui a2, 365861 +; RV32-FAST-NEXT: addi a3, a5, 1871 +; RV32-FAST-NEXT: addi a4, a6, 69 +; RV32-FAST-NEXT: addi a1, a1, 1107 +; RV32-FAST-NEXT: addi a2, a2, -1980 +; RV32-FAST-NEXT: sw a2, 0(a0) +; RV32-FAST-NEXT: sw a1, 4(a0) +; RV32-FAST-NEXT: sw a4, 8(a0) +; RV32-FAST-NEXT: sw a3, 12(a0) ; RV32-FAST-NEXT: ret ; -; RV64-FAST-LABEL: unaligned_memcpy15: +; RV64-FAST-LABEL: t1: ; RV64-FAST: # %bb.0: # %entry -; RV64-FAST-NEXT: ld a2, 7(a1) -; RV64-FAST-NEXT: sd a2, 7(a0) -; RV64-FAST-NEXT: ld a1, 0(a1) +; RV64-FAST-NEXT: lui a1, %hi(.L.str1) +; RV64-FAST-NEXT: addi a2, a1, %lo(.L.str1) +; RV64-FAST-NEXT: ld a3, 23(a2) +; RV64-FAST-NEXT: ld a1, %lo(.L.str1)(a1) +; RV64-FAST-NEXT: ld a4, 8(a2) +; RV64-FAST-NEXT: ld a2, 16(a2) +; RV64-FAST-NEXT: sd a3, 23(a0) ; RV64-FAST-NEXT: sd a1, 0(a0) +; RV64-FAST-NEXT: sd a4, 8(a0) +; RV64-FAST-NEXT: sd a2, 16(a0) ; RV64-FAST-NEXT: ret entry: - tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 15, i1 false) + tail call void @llvm.memcpy.p0.p0.i64(ptr %C, ptr @.str1, i64 31, i1 false) ret void } -define void @unaligned_memcpy16(ptr nocapture %dest, ptr %src) nounwind { -; RV32-LABEL: unaligned_memcpy16: -; RV32: # %bb.0: # %entry -; RV32-NEXT: lbu a2, 15(a1) -; RV32-NEXT: sb a2, 15(a0) -; RV32-NEXT: lbu a2, 14(a1) -; RV32-NEXT: sb a2, 14(a0) -; RV32-NEXT: lbu a2, 13(a1) -; RV32-NEXT: sb a2, 13(a0) -; RV32-NEXT: lbu a2, 12(a1) -; RV32-NEXT: sb a2, 12(a0) -; RV32-NEXT: lbu a2, 11(a1) -; RV32-NEXT: sb a2, 11(a0) -; RV32-NEXT: lbu a2, 10(a1) -; RV32-NEXT: sb a2, 10(a0) -; RV32-NEXT: lbu a2, 9(a1) -; RV32-NEXT: sb a2, 9(a0) -; RV32-NEXT: lbu a2, 8(a1) -; RV32-NEXT: sb a2, 8(a0) -; RV32-NEXT: lbu a2, 7(a1) -; RV32-NEXT: sb a2, 7(a0) -; RV32-NEXT: lbu a2, 6(a1) -; RV32-NEXT: sb a2, 6(a0) -; RV32-NEXT: lbu a2, 5(a1) -; RV32-NEXT: sb a2, 5(a0) -; RV32-NEXT: lbu a2, 4(a1) -; RV32-NEXT: sb a2, 4(a0) -; RV32-NEXT: lbu a2, 3(a1) -; RV32-NEXT: sb a2, 3(a0) -; RV32-NEXT: lbu a2, 2(a1) -; RV32-NEXT: sb a2, 2(a0) -; RV32-NEXT: lbu a2, 1(a1) -; RV32-NEXT: sb a2, 1(a0) -; RV32-NEXT: lbu a1, 0(a1) -; RV32-NEXT: sb a1, 0(a0) -; RV32-NEXT: ret +define void @t2(ptr nocapture %C) nounwind { +; RV32-BOTH-LABEL: t2: +; RV32-BOTH: # %bb.0: # %entry +; RV32-BOTH-NEXT: lui a1, %hi(.L.str2) +; RV32-BOTH-NEXT: addi a1, a1, %lo(.L.str2) +; RV32-BOTH-NEXT: li a2, 36 +; RV32-BOTH-NEXT: tail memcpy ; -; RV64-LABEL: unaligned_memcpy16: +; RV64-LABEL: t2: ; RV64: # %bb.0: # %entry -; RV64-NEXT: lbu a2, 15(a1) -; RV64-NEXT: sb a2, 15(a0) -; RV64-NEXT: lbu a2, 14(a1) -; RV64-NEXT: sb a2, 14(a0) -; RV64-NEXT: lbu a2, 13(a1) -; RV64-NEXT: sb a2, 13(a0) -; RV64-NEXT: lbu a2, 12(a1) -; RV64-NEXT: sb a2, 12(a0) -; RV64-NEXT: lbu a2, 11(a1) -; RV64-NEXT: sb a2, 11(a0) -; RV64-NEXT: lbu a2, 10(a1) -; RV64-NEXT: sb a2, 10(a0) -; RV64-NEXT: lbu a2, 9(a1) -; RV64-NEXT: sb a2, 9(a0) -; RV64-NEXT: lbu a2, 8(a1) -; RV64-NEXT: sb a2, 8(a0) -; RV64-NEXT: lbu a2, 7(a1) -; RV64-NEXT: sb a2, 7(a0) -; RV64-NEXT: lbu a2, 6(a1) -; RV64-NEXT: sb a2, 6(a0) -; RV64-NEXT: lbu a2, 5(a1) -; RV64-NEXT: sb a2, 5(a0) -; RV64-NEXT: lbu a2, 4(a1) -; RV64-NEXT: sb a2, 4(a0) -; RV64-NEXT: lbu a2, 3(a1) -; RV64-NEXT: sb a2, 3(a0) -; RV64-NEXT: lbu a2, 2(a1) -; RV64-NEXT: sb a2, 2(a0) -; RV64-NEXT: lbu a2, 1(a1) -; RV64-NEXT: sb a2, 1(a0) -; RV64-NEXT: lbu a1, 0(a1) -; RV64-NEXT: sb a1, 0(a0) -; RV64-NEXT: ret -; -; RV32-FAST-LABEL: unaligned_memcpy16: -; RV32-FAST: # %bb.0: # %entry -; RV32-FAST-NEXT: lw a2, 12(a1) -; RV32-FAST-NEXT: sw a2, 12(a0) -; RV32-FAST-NEXT: lw a2, 8(a1) -; RV32-FAST-NEXT: sw 
a2, 8(a0) -; RV32-FAST-NEXT: lw a2, 4(a1) -; RV32-FAST-NEXT: sw a2, 4(a0) -; RV32-FAST-NEXT: lw a1, 0(a1) -; RV32-FAST-NEXT: sw a1, 0(a0) -; RV32-FAST-NEXT: ret +; RV64-NEXT: lui a1, %hi(.L.str2) +; RV64-NEXT: addi a1, a1, %lo(.L.str2) +; RV64-NEXT: li a2, 36 +; RV64-NEXT: tail memcpy ; -; RV64-FAST-LABEL: unaligned_memcpy16: +; RV64-FAST-LABEL: t2: ; RV64-FAST: # %bb.0: # %entry +; RV64-FAST-NEXT: lui a1, %hi(.L.str2) +; RV64-FAST-NEXT: lui a2, 1156 +; RV64-FAST-NEXT: ld a3, %lo(.L.str2)(a1) +; RV64-FAST-NEXT: addi a2, a2, 332 +; RV64-FAST-NEXT: addi a1, a1, %lo(.L.str2) +; RV64-FAST-NEXT: sw a2, 32(a0) ; RV64-FAST-NEXT: ld a2, 8(a1) +; RV64-FAST-NEXT: ld a4, 16(a1) +; RV64-FAST-NEXT: ld a1, 24(a1) +; RV64-FAST-NEXT: sd a3, 0(a0) ; RV64-FAST-NEXT: sd a2, 8(a0) -; RV64-FAST-NEXT: ld a1, 0(a1) -; RV64-FAST-NEXT: sd a1, 0(a0) +; RV64-FAST-NEXT: sd a4, 16(a0) +; RV64-FAST-NEXT: sd a1, 24(a0) ; RV64-FAST-NEXT: ret entry: - tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 16, i1 false) + tail call void @llvm.memcpy.p0.p0.i64(ptr %C, ptr @.str2, i64 36, i1 false) ret void } -define void @unaligned_memcpy31(ptr nocapture %dest, ptr %src) nounwind { -; RV32-LABEL: unaligned_memcpy31: +define void @t3(ptr nocapture %C) nounwind { +; RV32-LABEL: t3: ; RV32: # %bb.0: # %entry -; RV32-NEXT: lbu a2, 30(a1) -; RV32-NEXT: sb a2, 30(a0) -; RV32-NEXT: lbu a2, 29(a1) -; RV32-NEXT: sb a2, 29(a0) -; RV32-NEXT: lbu a2, 28(a1) -; RV32-NEXT: sb a2, 28(a0) -; RV32-NEXT: lbu a2, 27(a1) -; RV32-NEXT: sb a2, 27(a0) -; RV32-NEXT: lbu a2, 26(a1) -; RV32-NEXT: sb a2, 26(a0) -; RV32-NEXT: lbu a2, 25(a1) -; RV32-NEXT: sb a2, 25(a0) -; RV32-NEXT: lbu a2, 24(a1) -; RV32-NEXT: sb a2, 24(a0) -; RV32-NEXT: lbu a2, 23(a1) -; RV32-NEXT: sb a2, 23(a0) -; RV32-NEXT: lbu a2, 22(a1) -; RV32-NEXT: sb a2, 22(a0) -; RV32-NEXT: lbu a2, 21(a1) -; RV32-NEXT: sb a2, 21(a0) -; RV32-NEXT: lbu a2, 20(a1) -; RV32-NEXT: sb a2, 20(a0) -; RV32-NEXT: lbu a2, 19(a1) -; RV32-NEXT: sb a2, 19(a0) -; RV32-NEXT: lbu a2, 18(a1) -; RV32-NEXT: sb a2, 18(a0) -; RV32-NEXT: lbu a2, 17(a1) -; RV32-NEXT: sb a2, 17(a0) -; RV32-NEXT: lbu a2, 16(a1) -; RV32-NEXT: sb a2, 16(a0) -; RV32-NEXT: lbu a2, 15(a1) -; RV32-NEXT: sb a2, 15(a0) -; RV32-NEXT: lbu a2, 14(a1) -; RV32-NEXT: sb a2, 14(a0) -; RV32-NEXT: lbu a2, 13(a1) -; RV32-NEXT: sb a2, 13(a0) -; RV32-NEXT: lbu a2, 12(a1) -; RV32-NEXT: sb a2, 12(a0) -; RV32-NEXT: lbu a2, 11(a1) -; RV32-NEXT: sb a2, 11(a0) -; RV32-NEXT: lbu a2, 10(a1) -; RV32-NEXT: sb a2, 10(a0) -; RV32-NEXT: lbu a2, 9(a1) -; RV32-NEXT: sb a2, 9(a0) -; RV32-NEXT: lbu a2, 8(a1) -; RV32-NEXT: sb a2, 8(a0) -; RV32-NEXT: lbu a2, 7(a1) -; RV32-NEXT: sb a2, 7(a0) -; RV32-NEXT: lbu a2, 6(a1) -; RV32-NEXT: sb a2, 6(a0) -; RV32-NEXT: lbu a2, 5(a1) -; RV32-NEXT: sb a2, 5(a0) -; RV32-NEXT: lbu a2, 4(a1) -; RV32-NEXT: sb a2, 4(a0) -; RV32-NEXT: lbu a2, 3(a1) -; RV32-NEXT: sb a2, 3(a0) -; RV32-NEXT: lbu a2, 2(a1) -; RV32-NEXT: sb a2, 2(a0) -; RV32-NEXT: lbu a2, 1(a1) -; RV32-NEXT: sb a2, 1(a0) -; RV32-NEXT: lbu a1, 0(a1) -; RV32-NEXT: sb a1, 0(a0) -; RV32-NEXT: ret +; RV32-NEXT: lui a1, %hi(.L.str3) +; RV32-NEXT: addi a1, a1, %lo(.L.str3) +; RV32-NEXT: li a2, 24 +; RV32-NEXT: tail memcpy ; -; RV64-LABEL: unaligned_memcpy31: +; RV64-LABEL: t3: ; RV64: # %bb.0: # %entry -; RV64-NEXT: lbu a2, 30(a1) -; RV64-NEXT: sb a2, 30(a0) -; RV64-NEXT: lbu a2, 29(a1) -; RV64-NEXT: sb a2, 29(a0) -; RV64-NEXT: lbu a2, 28(a1) -; RV64-NEXT: sb a2, 28(a0) -; RV64-NEXT: lbu a2, 27(a1) -; RV64-NEXT: sb a2, 27(a0) -; RV64-NEXT: lbu a2, 26(a1) -; RV64-NEXT: sb a2, 
26(a0) -; RV64-NEXT: lbu a2, 25(a1) -; RV64-NEXT: sb a2, 25(a0) -; RV64-NEXT: lbu a2, 24(a1) -; RV64-NEXT: sb a2, 24(a0) -; RV64-NEXT: lbu a2, 23(a1) -; RV64-NEXT: sb a2, 23(a0) -; RV64-NEXT: lbu a2, 22(a1) -; RV64-NEXT: sb a2, 22(a0) -; RV64-NEXT: lbu a2, 21(a1) -; RV64-NEXT: sb a2, 21(a0) -; RV64-NEXT: lbu a2, 20(a1) -; RV64-NEXT: sb a2, 20(a0) -; RV64-NEXT: lbu a2, 19(a1) -; RV64-NEXT: sb a2, 19(a0) -; RV64-NEXT: lbu a2, 18(a1) -; RV64-NEXT: sb a2, 18(a0) -; RV64-NEXT: lbu a2, 17(a1) -; RV64-NEXT: sb a2, 17(a0) -; RV64-NEXT: lbu a2, 16(a1) -; RV64-NEXT: sb a2, 16(a0) -; RV64-NEXT: lbu a2, 15(a1) -; RV64-NEXT: sb a2, 15(a0) -; RV64-NEXT: lbu a2, 14(a1) -; RV64-NEXT: sb a2, 14(a0) -; RV64-NEXT: lbu a2, 13(a1) -; RV64-NEXT: sb a2, 13(a0) -; RV64-NEXT: lbu a2, 12(a1) -; RV64-NEXT: sb a2, 12(a0) -; RV64-NEXT: lbu a2, 11(a1) -; RV64-NEXT: sb a2, 11(a0) -; RV64-NEXT: lbu a2, 10(a1) -; RV64-NEXT: sb a2, 10(a0) -; RV64-NEXT: lbu a2, 9(a1) -; RV64-NEXT: sb a2, 9(a0) -; RV64-NEXT: lbu a2, 8(a1) -; RV64-NEXT: sb a2, 8(a0) -; RV64-NEXT: lbu a2, 7(a1) -; RV64-NEXT: sb a2, 7(a0) -; RV64-NEXT: lbu a2, 6(a1) -; RV64-NEXT: sb a2, 6(a0) -; RV64-NEXT: lbu a2, 5(a1) -; RV64-NEXT: sb a2, 5(a0) -; RV64-NEXT: lbu a2, 4(a1) -; RV64-NEXT: sb a2, 4(a0) -; RV64-NEXT: lbu a2, 3(a1) -; RV64-NEXT: sb a2, 3(a0) -; RV64-NEXT: lbu a2, 2(a1) -; RV64-NEXT: sb a2, 2(a0) -; RV64-NEXT: lbu a2, 1(a1) -; RV64-NEXT: sb a2, 1(a0) -; RV64-NEXT: lbu a1, 0(a1) -; RV64-NEXT: sb a1, 0(a0) -; RV64-NEXT: ret +; RV64-NEXT: lui a1, %hi(.L.str3) +; RV64-NEXT: addi a1, a1, %lo(.L.str3) +; RV64-NEXT: li a2, 24 +; RV64-NEXT: tail memcpy ; -; RV32-FAST-LABEL: unaligned_memcpy31: +; RV32-FAST-LABEL: t3: ; RV32-FAST: # %bb.0: # %entry -; RV32-FAST-NEXT: lw a2, 27(a1) -; RV32-FAST-NEXT: sw a2, 27(a0) -; RV32-FAST-NEXT: lw a2, 24(a1) -; RV32-FAST-NEXT: sw a2, 24(a0) -; RV32-FAST-NEXT: lw a2, 20(a1) -; RV32-FAST-NEXT: sw a2, 20(a0) -; RV32-FAST-NEXT: lw a2, 16(a1) +; RV32-FAST-NEXT: lui a1, 1109 +; RV32-FAST-NEXT: lui a2, 340483 +; RV32-FAST-NEXT: lui a3, 267556 +; RV32-FAST-NEXT: lui a4, 337154 +; RV32-FAST-NEXT: lui a5, 320757 +; RV32-FAST-NEXT: addi a1, a1, -689 +; RV32-FAST-NEXT: addi a2, a2, -947 ; RV32-FAST-NEXT: sw a2, 16(a0) -; RV32-FAST-NEXT: lw a2, 12(a1) -; RV32-FAST-NEXT: sw a2, 12(a0) -; RV32-FAST-NEXT: lw a2, 8(a1) -; RV32-FAST-NEXT: sw a2, 8(a0) -; RV32-FAST-NEXT: lw a2, 4(a1) -; RV32-FAST-NEXT: sw a2, 4(a0) -; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a1, 20(a0) +; RV32-FAST-NEXT: lui a1, 365861 +; RV32-FAST-NEXT: addi a2, a3, 1871 +; RV32-FAST-NEXT: addi a3, a4, 69 +; RV32-FAST-NEXT: addi a4, a5, 1107 +; RV32-FAST-NEXT: addi a1, a1, -1980 ; RV32-FAST-NEXT: sw a1, 0(a0) +; RV32-FAST-NEXT: sw a4, 4(a0) +; RV32-FAST-NEXT: sw a3, 8(a0) +; RV32-FAST-NEXT: sw a2, 12(a0) ; RV32-FAST-NEXT: ret ; -; RV64-FAST-LABEL: unaligned_memcpy31: +; RV64-FAST-LABEL: t3: ; RV64-FAST: # %bb.0: # %entry -; RV64-FAST-NEXT: ld a2, 23(a1) -; RV64-FAST-NEXT: sd a2, 23(a0) -; RV64-FAST-NEXT: ld a2, 16(a1) -; RV64-FAST-NEXT: sd a2, 16(a0) -; RV64-FAST-NEXT: ld a2, 8(a1) -; RV64-FAST-NEXT: sd a2, 8(a0) -; RV64-FAST-NEXT: ld a1, 0(a1) -; RV64-FAST-NEXT: sd a1, 0(a0) +; RV64-FAST-NEXT: lui a1, %hi(.L.str3) +; RV64-FAST-NEXT: ld a2, %lo(.L.str3)(a1) +; RV64-FAST-NEXT: addi a1, a1, %lo(.L.str3) +; RV64-FAST-NEXT: ld a3, 8(a1) +; RV64-FAST-NEXT: ld a1, 16(a1) +; RV64-FAST-NEXT: sd a2, 0(a0) +; RV64-FAST-NEXT: sd a3, 8(a0) +; RV64-FAST-NEXT: sd a1, 16(a0) ; RV64-FAST-NEXT: ret entry: - tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 31, i1 
false) - ret void -} - -; ---------------------------------------------------------------------- -; Fully aligned cases - -define void @aligned_memcpy0(ptr nocapture %dest, ptr %src) nounwind { -; RV32-BOTH-LABEL: aligned_memcpy0: -; RV32-BOTH: # %bb.0: # %entry -; RV32-BOTH-NEXT: ret -; -; RV64-BOTH-LABEL: aligned_memcpy0: -; RV64-BOTH: # %bb.0: # %entry -; RV64-BOTH-NEXT: ret -entry: - tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 0, i1 false) - ret void -} - -define void @aligned_memcpy1(ptr nocapture %dest, ptr %src) nounwind { -; RV32-BOTH-LABEL: aligned_memcpy1: -; RV32-BOTH: # %bb.0: # %entry -; RV32-BOTH-NEXT: lbu a1, 0(a1) -; RV32-BOTH-NEXT: sb a1, 0(a0) -; RV32-BOTH-NEXT: ret -; -; RV64-BOTH-LABEL: aligned_memcpy1: -; RV64-BOTH: # %bb.0: # %entry -; RV64-BOTH-NEXT: lbu a1, 0(a1) -; RV64-BOTH-NEXT: sb a1, 0(a0) -; RV64-BOTH-NEXT: ret -entry: - tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 1, i1 false) - ret void -} - -define void @aligned_memcpy2(ptr nocapture %dest, ptr %src) nounwind { -; RV32-BOTH-LABEL: aligned_memcpy2: -; RV32-BOTH: # %bb.0: # %entry -; RV32-BOTH-NEXT: lh a1, 0(a1) -; RV32-BOTH-NEXT: sh a1, 0(a0) -; RV32-BOTH-NEXT: ret -; -; RV64-BOTH-LABEL: aligned_memcpy2: -; RV64-BOTH: # %bb.0: # %entry -; RV64-BOTH-NEXT: lh a1, 0(a1) -; RV64-BOTH-NEXT: sh a1, 0(a0) -; RV64-BOTH-NEXT: ret -entry: - tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 2, i1 false) + tail call void @llvm.memcpy.p0.p0.i64(ptr %C, ptr @.str3, i64 24, i1 false) ret void } -define void @aligned_memcpy3(ptr nocapture %dest, ptr %src) nounwind { -; RV32-BOTH-LABEL: aligned_memcpy3: -; RV32-BOTH: # %bb.0: # %entry -; RV32-BOTH-NEXT: lbu a2, 2(a1) -; RV32-BOTH-NEXT: sb a2, 2(a0) -; RV32-BOTH-NEXT: lh a1, 0(a1) -; RV32-BOTH-NEXT: sh a1, 0(a0) -; RV32-BOTH-NEXT: ret -; -; RV64-BOTH-LABEL: aligned_memcpy3: -; RV64-BOTH: # %bb.0: # %entry -; RV64-BOTH-NEXT: lbu a2, 2(a1) -; RV64-BOTH-NEXT: sb a2, 2(a0) -; RV64-BOTH-NEXT: lh a1, 0(a1) -; RV64-BOTH-NEXT: sh a1, 0(a0) -; RV64-BOTH-NEXT: ret -entry: - tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 3, i1 false) - ret void -} - -define void @aligned_memcpy4(ptr nocapture %dest, ptr %src) nounwind { -; RV32-BOTH-LABEL: aligned_memcpy4: -; RV32-BOTH: # %bb.0: # %entry -; RV32-BOTH-NEXT: lw a1, 0(a1) -; RV32-BOTH-NEXT: sw a1, 0(a0) -; RV32-BOTH-NEXT: ret -; -; RV64-BOTH-LABEL: aligned_memcpy4: -; RV64-BOTH: # %bb.0: # %entry -; RV64-BOTH-NEXT: lw a1, 0(a1) -; RV64-BOTH-NEXT: sw a1, 0(a0) -; RV64-BOTH-NEXT: ret -entry: - tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 4, i1 false) - ret void -} - -define void @aligned_memcpy7(ptr nocapture %dest, ptr %src) nounwind { -; RV32-LABEL: aligned_memcpy7: +define void @t4(ptr nocapture %C) nounwind { +; RV32-LABEL: t4: ; RV32: # %bb.0: # %entry -; RV32-NEXT: lbu a2, 6(a1) -; RV32-NEXT: sb a2, 6(a0) -; RV32-NEXT: lh a2, 4(a1) -; RV32-NEXT: sh a2, 4(a0) -; RV32-NEXT: lw a1, 0(a1) -; RV32-NEXT: sw a1, 0(a0) -; RV32-NEXT: ret +; RV32-NEXT: lui a1, %hi(.L.str4) +; RV32-NEXT: addi a1, a1, %lo(.L.str4) +; RV32-NEXT: li a2, 18 +; RV32-NEXT: tail memcpy ; -; RV64-LABEL: aligned_memcpy7: +; RV64-LABEL: t4: ; RV64: # %bb.0: # %entry -; RV64-NEXT: lbu a2, 6(a1) -; RV64-NEXT: sb a2, 6(a0) -; RV64-NEXT: lh a2, 4(a1) -; RV64-NEXT: sh a2, 4(a0) -; RV64-NEXT: lw a1, 0(a1) -; RV64-NEXT: sw a1, 0(a0) -; RV64-NEXT: ret +; RV64-NEXT: lui a1, %hi(.L.str4) +; RV64-NEXT: addi a1, a1, 
%lo(.L.str4) +; RV64-NEXT: li a2, 18 +; RV64-NEXT: tail memcpy ; -; RV32-FAST-LABEL: aligned_memcpy7: +; RV32-FAST-LABEL: t4: ; RV32-FAST: # %bb.0: # %entry -; RV32-FAST-NEXT: lw a2, 3(a1) -; RV32-FAST-NEXT: sw a2, 3(a0) -; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: li a1, 32 +; RV32-FAST-NEXT: lui a2, 132388 +; RV32-FAST-NEXT: lui a3, 337154 +; RV32-FAST-NEXT: lui a4, 320757 +; RV32-FAST-NEXT: sh a1, 16(a0) +; RV32-FAST-NEXT: lui a1, 365861 +; RV32-FAST-NEXT: addi a2, a2, 1871 +; RV32-FAST-NEXT: addi a3, a3, 69 +; RV32-FAST-NEXT: addi a4, a4, 1107 +; RV32-FAST-NEXT: addi a1, a1, -1980 ; RV32-FAST-NEXT: sw a1, 0(a0) +; RV32-FAST-NEXT: sw a4, 4(a0) +; RV32-FAST-NEXT: sw a3, 8(a0) +; RV32-FAST-NEXT: sw a2, 12(a0) ; RV32-FAST-NEXT: ret ; -; RV64-FAST-LABEL: aligned_memcpy7: +; RV64-FAST-LABEL: t4: ; RV64-FAST: # %bb.0: # %entry -; RV64-FAST-NEXT: lw a2, 3(a1) -; RV64-FAST-NEXT: sw a2, 3(a0) -; RV64-FAST-NEXT: lw a1, 0(a1) -; RV64-FAST-NEXT: sw a1, 0(a0) +; RV64-FAST-NEXT: lui a1, %hi(.L.str4) +; RV64-FAST-NEXT: ld a2, %lo(.L.str4)(a1) +; RV64-FAST-NEXT: addi a1, a1, %lo(.L.str4) +; RV64-FAST-NEXT: ld a1, 8(a1) +; RV64-FAST-NEXT: li a3, 32 +; RV64-FAST-NEXT: sd a2, 0(a0) +; RV64-FAST-NEXT: sd a1, 8(a0) +; RV64-FAST-NEXT: sh a3, 16(a0) ; RV64-FAST-NEXT: ret entry: - tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 7, i1 false) - ret void -} - -define void @aligned_memcpy8(ptr nocapture %dest, ptr %src) nounwind { -; RV32-BOTH-LABEL: aligned_memcpy8: -; RV32-BOTH: # %bb.0: # %entry -; RV32-BOTH-NEXT: lw a2, 4(a1) -; RV32-BOTH-NEXT: sw a2, 4(a0) -; RV32-BOTH-NEXT: lw a1, 0(a1) -; RV32-BOTH-NEXT: sw a1, 0(a0) -; RV32-BOTH-NEXT: ret -; -; RV64-BOTH-LABEL: aligned_memcpy8: -; RV64-BOTH: # %bb.0: # %entry -; RV64-BOTH-NEXT: ld a1, 0(a1) -; RV64-BOTH-NEXT: sd a1, 0(a0) -; RV64-BOTH-NEXT: ret -entry: - tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 8, i1 false) + tail call void @llvm.memcpy.p0.p0.i64(ptr %C, ptr @.str4, i64 18, i1 false) ret void } -define void @aligned_memcpy15(ptr nocapture %dest, ptr %src) nounwind { -; RV32-LABEL: aligned_memcpy15: +define void @t5(ptr nocapture %C) nounwind { +; RV32-LABEL: t5: ; RV32: # %bb.0: # %entry -; RV32-NEXT: lbu a2, 14(a1) -; RV32-NEXT: sb a2, 14(a0) -; RV32-NEXT: lh a2, 12(a1) -; RV32-NEXT: sh a2, 12(a0) -; RV32-NEXT: lw a2, 8(a1) -; RV32-NEXT: sw a2, 8(a0) -; RV32-NEXT: lw a2, 4(a1) -; RV32-NEXT: sw a2, 4(a0) -; RV32-NEXT: lw a1, 0(a1) -; RV32-NEXT: sw a1, 0(a0) +; RV32-NEXT: li a1, 84 +; RV32-NEXT: li a2, 83 +; RV32-NEXT: li a3, 89 +; RV32-NEXT: li a4, 82 +; RV32-NEXT: li a5, 72 +; RV32-NEXT: li a6, 68 +; RV32-NEXT: sb a2, 4(a0) +; RV32-NEXT: sb a1, 5(a0) +; RV32-NEXT: sb zero, 6(a0) +; RV32-NEXT: sb a6, 0(a0) +; RV32-NEXT: sb a5, 1(a0) +; RV32-NEXT: sb a4, 2(a0) +; RV32-NEXT: sb a3, 3(a0) ; RV32-NEXT: ret ; -; RV64-LABEL: aligned_memcpy15: +; RV64-LABEL: t5: ; RV64: # %bb.0: # %entry -; RV64-NEXT: lbu a2, 14(a1) -; RV64-NEXT: sb a2, 14(a0) -; RV64-NEXT: lh a2, 12(a1) -; RV64-NEXT: sh a2, 12(a0) -; RV64-NEXT: lw a2, 8(a1) -; RV64-NEXT: sw a2, 8(a0) -; RV64-NEXT: ld a1, 0(a1) -; RV64-NEXT: sd a1, 0(a0) +; RV64-NEXT: li a1, 84 +; RV64-NEXT: li a2, 83 +; RV64-NEXT: li a3, 89 +; RV64-NEXT: li a4, 82 +; RV64-NEXT: li a5, 72 +; RV64-NEXT: li a6, 68 +; RV64-NEXT: sb a2, 4(a0) +; RV64-NEXT: sb a1, 5(a0) +; RV64-NEXT: sb zero, 6(a0) +; RV64-NEXT: sb a6, 0(a0) +; RV64-NEXT: sb a5, 1(a0) +; RV64-NEXT: sb a4, 2(a0) +; RV64-NEXT: sb a3, 3(a0) ; RV64-NEXT: ret ; -; RV32-FAST-LABEL: aligned_memcpy15: 
+; RV32-FAST-LABEL: t5: ; RV32-FAST: # %bb.0: # %entry -; RV32-FAST-NEXT: lw a2, 11(a1) -; RV32-FAST-NEXT: sw a2, 11(a0) -; RV32-FAST-NEXT: lw a2, 8(a1) -; RV32-FAST-NEXT: sw a2, 8(a0) -; RV32-FAST-NEXT: lw a2, 4(a1) -; RV32-FAST-NEXT: sw a2, 4(a0) -; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: lui a1, 1349 +; RV32-FAST-NEXT: addi a1, a1, 857 +; RV32-FAST-NEXT: sw a1, 3(a0) +; RV32-FAST-NEXT: lui a1, 365861 +; RV32-FAST-NEXT: addi a1, a1, -1980 ; RV32-FAST-NEXT: sw a1, 0(a0) ; RV32-FAST-NEXT: ret ; -; RV64-FAST-LABEL: aligned_memcpy15: +; RV64-FAST-LABEL: t5: ; RV64-FAST: # %bb.0: # %entry -; RV64-FAST-NEXT: ld a2, 7(a1) -; RV64-FAST-NEXT: sd a2, 7(a0) -; RV64-FAST-NEXT: ld a1, 0(a1) -; RV64-FAST-NEXT: sd a1, 0(a0) +; RV64-FAST-NEXT: lui a1, 1349 +; RV64-FAST-NEXT: addi a1, a1, 857 +; RV64-FAST-NEXT: sw a1, 3(a0) +; RV64-FAST-NEXT: lui a1, 365861 +; RV64-FAST-NEXT: addi a1, a1, -1980 +; RV64-FAST-NEXT: sw a1, 0(a0) ; RV64-FAST-NEXT: ret entry: - tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 15, i1 false) + tail call void @llvm.memcpy.p0.p0.i64(ptr %C, ptr @.str5, i64 7, i1 false) ret void } -define void @aligned_memcpy16(ptr nocapture %dest, ptr %src) nounwind { -; RV32-BOTH-LABEL: aligned_memcpy16: -; RV32-BOTH: # %bb.0: # %entry -; RV32-BOTH-NEXT: lw a2, 12(a1) -; RV32-BOTH-NEXT: sw a2, 12(a0) -; RV32-BOTH-NEXT: lw a2, 8(a1) -; RV32-BOTH-NEXT: sw a2, 8(a0) -; RV32-BOTH-NEXT: lw a2, 4(a1) -; RV32-BOTH-NEXT: sw a2, 4(a0) -; RV32-BOTH-NEXT: lw a1, 0(a1) -; RV32-BOTH-NEXT: sw a1, 0(a0) -; RV32-BOTH-NEXT: ret -; -; RV64-BOTH-LABEL: aligned_memcpy16: -; RV64-BOTH: # %bb.0: # %entry -; RV64-BOTH-NEXT: ld a2, 8(a1) -; RV64-BOTH-NEXT: sd a2, 8(a0) -; RV64-BOTH-NEXT: ld a1, 0(a1) -; RV64-BOTH-NEXT: sd a1, 0(a0) -; RV64-BOTH-NEXT: ret -entry: - tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 16, i1 false) - ret void -} - -define void @aligned_memcpy31(ptr nocapture %dest, ptr %src) nounwind { -; RV32-LABEL: aligned_memcpy31: +define void @t6() nounwind { +; RV32-LABEL: t6: ; RV32: # %bb.0: # %entry -; RV32-NEXT: lbu a2, 30(a1) -; RV32-NEXT: sb a2, 30(a0) -; RV32-NEXT: lh a2, 28(a1) -; RV32-NEXT: sh a2, 28(a0) -; RV32-NEXT: lw a2, 24(a1) -; RV32-NEXT: sw a2, 24(a0) -; RV32-NEXT: lw a2, 20(a1) -; RV32-NEXT: sw a2, 20(a0) -; RV32-NEXT: lw a2, 16(a1) -; RV32-NEXT: sw a2, 16(a0) -; RV32-NEXT: lw a2, 12(a1) -; RV32-NEXT: sw a2, 12(a0) -; RV32-NEXT: lw a2, 8(a1) -; RV32-NEXT: sw a2, 8(a0) -; RV32-NEXT: lw a2, 4(a1) -; RV32-NEXT: sw a2, 4(a0) -; RV32-NEXT: lw a1, 0(a1) -; RV32-NEXT: sw a1, 0(a0) +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: lui a0, %hi(spool.splbuf) +; RV32-NEXT: addi a0, a0, %lo(spool.splbuf) +; RV32-NEXT: lui a1, %hi(.L.str6) +; RV32-NEXT: addi a1, a1, %lo(.L.str6) +; RV32-NEXT: li a2, 14 +; RV32-NEXT: call memcpy +; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; -; RV64-LABEL: aligned_memcpy31: +; RV64-LABEL: t6: ; RV64: # %bb.0: # %entry -; RV64-NEXT: lbu a2, 30(a1) -; RV64-NEXT: sb a2, 30(a0) -; RV64-NEXT: lh a2, 28(a1) -; RV64-NEXT: sh a2, 28(a0) -; RV64-NEXT: lw a2, 24(a1) -; RV64-NEXT: sw a2, 24(a0) -; RV64-NEXT: ld a2, 16(a1) -; RV64-NEXT: sd a2, 16(a0) -; RV64-NEXT: ld a2, 8(a1) -; RV64-NEXT: sd a2, 8(a0) -; RV64-NEXT: ld a1, 0(a1) -; RV64-NEXT: sd a1, 0(a0) +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: lui a0, %hi(spool.splbuf) +; RV64-NEXT: addi a0, a0, 
%lo(spool.splbuf) +; RV64-NEXT: lui a1, %hi(.L.str6) +; RV64-NEXT: addi a1, a1, %lo(.L.str6) +; RV64-NEXT: li a2, 14 +; RV64-NEXT: call memcpy +; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret ; -; RV32-FAST-LABEL: aligned_memcpy31: +; RV32-FAST-LABEL: t6: ; RV32-FAST: # %bb.0: # %entry -; RV32-FAST-NEXT: lw a2, 27(a1) -; RV32-FAST-NEXT: sw a2, 27(a0) -; RV32-FAST-NEXT: lw a2, 24(a1) -; RV32-FAST-NEXT: sw a2, 24(a0) -; RV32-FAST-NEXT: lw a2, 20(a1) -; RV32-FAST-NEXT: sw a2, 20(a0) -; RV32-FAST-NEXT: lw a2, 16(a1) -; RV32-FAST-NEXT: sw a2, 16(a0) -; RV32-FAST-NEXT: lw a2, 12(a1) -; RV32-FAST-NEXT: sw a2, 12(a0) -; RV32-FAST-NEXT: lw a2, 8(a1) -; RV32-FAST-NEXT: sw a2, 8(a0) -; RV32-FAST-NEXT: lw a2, 4(a1) -; RV32-FAST-NEXT: sw a2, 4(a0) -; RV32-FAST-NEXT: lw a1, 0(a1) -; RV32-FAST-NEXT: sw a1, 0(a0) +; RV32-FAST-NEXT: lui a0, %hi(spool.splbuf) +; RV32-FAST-NEXT: li a1, 88 +; RV32-FAST-NEXT: sh a1, %lo(spool.splbuf+12)(a0) +; RV32-FAST-NEXT: lui a1, 361862 +; RV32-FAST-NEXT: addi a1, a1, -1960 +; RV32-FAST-NEXT: sw a1, %lo(spool.splbuf+8)(a0) +; RV32-FAST-NEXT: lui a1, 362199 +; RV32-FAST-NEXT: addi a1, a1, 559 +; RV32-FAST-NEXT: sw a1, %lo(spool.splbuf+4)(a0) +; RV32-FAST-NEXT: lui a1, 460503 +; RV32-FAST-NEXT: addi a1, a1, 1071 +; RV32-FAST-NEXT: sw a1, %lo(spool.splbuf)(a0) ; RV32-FAST-NEXT: ret ; -; RV64-FAST-LABEL: aligned_memcpy31: +; RV64-FAST-LABEL: t6: ; RV64-FAST: # %bb.0: # %entry -; RV64-FAST-NEXT: ld a2, 23(a1) -; RV64-FAST-NEXT: sd a2, 23(a0) -; RV64-FAST-NEXT: ld a2, 16(a1) -; RV64-FAST-NEXT: sd a2, 16(a0) -; RV64-FAST-NEXT: ld a2, 8(a1) -; RV64-FAST-NEXT: sd a2, 8(a0) -; RV64-FAST-NEXT: ld a1, 0(a1) -; RV64-FAST-NEXT: sd a1, 0(a0) +; RV64-FAST-NEXT: lui a0, %hi(.L.str6) +; RV64-FAST-NEXT: ld a1, %lo(.L.str6)(a0) +; RV64-FAST-NEXT: addi a0, a0, %lo(.L.str6) +; RV64-FAST-NEXT: ld a0, 6(a0) +; RV64-FAST-NEXT: lui a2, %hi(spool.splbuf) +; RV64-FAST-NEXT: sd a1, %lo(spool.splbuf)(a2) +; RV64-FAST-NEXT: sd a0, %lo(spool.splbuf+6)(a2) ; RV64-FAST-NEXT: ret entry: - tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 31, i1 false) + call void @llvm.memcpy.p0.p0.i64(ptr @spool.splbuf, ptr @.str6, i64 14, i1 false) ret void } -; ------------------------------------------------------------------------ -; A few partially aligned cases +%struct.Foo = type { i32, i32, i32, i32 } - -define void @memcpy16_align4(ptr nocapture %dest, ptr nocapture %src) nounwind { -; RV32-BOTH-LABEL: memcpy16_align4: +define void @t7(ptr nocapture %a, ptr nocapture %b) nounwind { +; RV32-BOTH-LABEL: t7: ; RV32-BOTH: # %bb.0: # %entry ; RV32-BOTH-NEXT: lw a2, 12(a1) ; RV32-BOTH-NEXT: sw a2, 12(a0) @@ -947,7 +418,7 @@ define void @memcpy16_align4(ptr nocapture %dest, ptr nocapture %src) nounwind { ; RV32-BOTH-NEXT: sw a1, 0(a0) ; RV32-BOTH-NEXT: ret ; -; RV64-LABEL: memcpy16_align4: +; RV64-LABEL: t7: ; RV64: # %bb.0: # %entry ; RV64-NEXT: lw a2, 12(a1) ; RV64-NEXT: sw a2, 12(a0) @@ -959,7 +430,7 @@ define void @memcpy16_align4(ptr nocapture %dest, ptr nocapture %src) nounwind { ; RV64-NEXT: sw a1, 0(a0) ; RV64-NEXT: ret ; -; RV64-FAST-LABEL: memcpy16_align4: +; RV64-FAST-LABEL: t7: ; RV64-FAST: # %bb.0: # %entry ; RV64-FAST-NEXT: ld a2, 8(a1) ; RV64-FAST-NEXT: sd a2, 8(a0) @@ -967,58 +438,11 @@ define void @memcpy16_align4(ptr nocapture %dest, ptr nocapture %src) nounwind { ; RV64-FAST-NEXT: sd a1, 0(a0) ; RV64-FAST-NEXT: ret entry: - tail call void @llvm.memcpy.p0.p0.i32(ptr align 4 %dest, ptr align 4 %src, i32 16, i1 false) + tail call 
void @llvm.memcpy.p0.p0.i32(ptr align 4 %a, ptr align 4 %b, i32 16, i1 false) ret void } -define i32 @memcpy11_align8(ptr nocapture %dest, ptr %src) { -; RV32-LABEL: memcpy11_align8: -; RV32: # %bb.0: # %entry -; RV32-NEXT: lbu a2, 10(a1) -; RV32-NEXT: sb a2, 10(a0) -; RV32-NEXT: lh a2, 8(a1) -; RV32-NEXT: sh a2, 8(a0) -; RV32-NEXT: lw a2, 4(a1) -; RV32-NEXT: sw a2, 4(a0) -; RV32-NEXT: lw a1, 0(a1) -; RV32-NEXT: sw a1, 0(a0) -; RV32-NEXT: li a0, 0 -; RV32-NEXT: ret -; -; RV64-LABEL: memcpy11_align8: -; RV64: # %bb.0: # %entry -; RV64-NEXT: lbu a2, 10(a1) -; RV64-NEXT: sb a2, 10(a0) -; RV64-NEXT: lh a2, 8(a1) -; RV64-NEXT: sh a2, 8(a0) -; RV64-NEXT: ld a1, 0(a1) -; RV64-NEXT: sd a1, 0(a0) -; RV64-NEXT: li a0, 0 -; RV64-NEXT: ret -; -; RV32-FAST-LABEL: memcpy11_align8: -; RV32-FAST: # %bb.0: # %entry -; RV32-FAST-NEXT: lw a2, 7(a1) -; RV32-FAST-NEXT: sw a2, 7(a0) -; RV32-FAST-NEXT: lw a2, 4(a1) -; RV32-FAST-NEXT: sw a2, 4(a0) -; RV32-FAST-NEXT: lw a1, 0(a1) -; RV32-FAST-NEXT: sw a1, 0(a0) -; RV32-FAST-NEXT: li a0, 0 -; RV32-FAST-NEXT: ret -; -; RV64-FAST-LABEL: memcpy11_align8: -; RV64-FAST: # %bb.0: # %entry -; RV64-FAST-NEXT: lw a2, 7(a1) -; RV64-FAST-NEXT: sw a2, 7(a0) -; RV64-FAST-NEXT: ld a1, 0(a1) -; RV64-FAST-NEXT: sd a1, 0(a0) -; RV64-FAST-NEXT: li a0, 0 -; RV64-FAST-NEXT: ret -entry: - call void @llvm.memcpy.p0.p0.i32(ptr align 8 %dest, ptr align 8 %src, i32 11, i1 false) - ret i32 0 -} - declare void @llvm.memcpy.p0.p0.i32(ptr nocapture, ptr nocapture, i32, i1) nounwind declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture, i64, i1) nounwind +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; RV64-BOTH: {{.*}} From f70db689d026d8e549719ed00e7298b44f09e229 Mon Sep 17 00:00:00 2001 From: Pengcheng Wang Date: Mon, 13 Jan 2025 11:28:24 +0800 Subject: [PATCH 006/102] Reapply "[RISCV] Rework memcpy test (#120364)" Use descriptive names and add more cases. This recommits 59bba39 which was reverted in 4637c77. 
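For reference, the reworked scheme names each test after the copy length and
alignment it exercises, in place of the old opaque @t0..@t7 names. A
representative case, reproduced from the updated test purely as an
illustration of the naming convention:

  define void @aligned_memcpy8(ptr nocapture %dest, ptr %src) nounwind {
  entry:
    tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 8, i1 false)
    ret void
  }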
--- llvm/test/CodeGen/RISCV/memcpy.ll | 913 ++++++++++++++++++++---------- 1 file changed, 615 insertions(+), 298 deletions(-) diff --git a/llvm/test/CodeGen/RISCV/memcpy.ll b/llvm/test/CodeGen/RISCV/memcpy.ll index 1ab3722080f70..447fc26b0106e 100644 --- a/llvm/test/CodeGen/RISCV/memcpy.ll +++ b/llvm/test/CodeGen/RISCV/memcpy.ll @@ -7,406 +7,676 @@ ; RUN: | FileCheck %s --check-prefixes=RV32-BOTH,RV32-FAST ; RUN: llc < %s -mtriple=riscv64 -mattr=+unaligned-scalar-mem \ ; RUN: | FileCheck %s --check-prefixes=RV64-BOTH,RV64-FAST -%struct.x = type { i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 } -@src = external dso_local global %struct.x -@dst = external dso_local global %struct.x +; ---------------------------------------------------------------------- +; Fully unaligned cases -@.str1 = private unnamed_addr constant [31 x i8] c"DHRYSTONE PROGRAM, SOME STRING\00", align 1 -@.str2 = private unnamed_addr constant [36 x i8] c"DHRYSTONE PROGRAM, SOME STRING BLAH\00", align 1 -@.str3 = private unnamed_addr constant [24 x i8] c"DHRYSTONE PROGRAM, SOME\00", align 1 -@.str4 = private unnamed_addr constant [18 x i8] c"DHRYSTONE PROGR \00", align 1 -@.str5 = private unnamed_addr constant [7 x i8] c"DHRYST\00", align 1 -@.str6 = private unnamed_addr constant [14 x i8] c"/tmp/rmXXXXXX\00", align 1 -@spool.splbuf = internal global [512 x i8] zeroinitializer, align 16 +define void @unaligned_memcpy0(ptr nocapture %dest, ptr %src) nounwind { +; RV32-BOTH-LABEL: unaligned_memcpy0: +; RV32-BOTH: # %bb.0: # %entry +; RV32-BOTH-NEXT: ret +; +; RV64-BOTH-LABEL: unaligned_memcpy0: +; RV64-BOTH: # %bb.0: # %entry +; RV64-BOTH-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 0, i1 false) + ret void +} -define i32 @t0() { -; RV32-LABEL: t0: +define void @unaligned_memcpy1(ptr nocapture %dest, ptr %src) nounwind { +; RV32-BOTH-LABEL: unaligned_memcpy1: +; RV32-BOTH: # %bb.0: # %entry +; RV32-BOTH-NEXT: lbu a1, 0(a1) +; RV32-BOTH-NEXT: sb a1, 0(a0) +; RV32-BOTH-NEXT: ret +; +; RV64-BOTH-LABEL: unaligned_memcpy1: +; RV64-BOTH: # %bb.0: # %entry +; RV64-BOTH-NEXT: lbu a1, 0(a1) +; RV64-BOTH-NEXT: sb a1, 0(a0) +; RV64-BOTH-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 1, i1 false) + ret void +} + +define void @unaligned_memcpy2(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: unaligned_memcpy2: ; RV32: # %bb.0: # %entry -; RV32-NEXT: lui a0, %hi(src) -; RV32-NEXT: lw a1, %lo(src)(a0) -; RV32-NEXT: lui a2, %hi(dst) -; RV32-NEXT: addi a0, a0, %lo(src) -; RV32-NEXT: sw a1, %lo(dst)(a2) -; RV32-NEXT: lw a1, 4(a0) -; RV32-NEXT: lh a3, 8(a0) -; RV32-NEXT: lbu a0, 10(a0) -; RV32-NEXT: addi a2, a2, %lo(dst) -; RV32-NEXT: sw a1, 4(a2) -; RV32-NEXT: sh a3, 8(a2) -; RV32-NEXT: sb a0, 10(a2) -; RV32-NEXT: li a0, 0 +; RV32-NEXT: lbu a2, 1(a1) +; RV32-NEXT: sb a2, 1(a0) +; RV32-NEXT: lbu a1, 0(a1) +; RV32-NEXT: sb a1, 0(a0) ; RV32-NEXT: ret ; -; RV64-LABEL: t0: +; RV64-LABEL: unaligned_memcpy2: ; RV64: # %bb.0: # %entry -; RV64-NEXT: lui a0, %hi(src) -; RV64-NEXT: lui a1, %hi(dst) -; RV64-NEXT: ld a2, %lo(src)(a0) -; RV64-NEXT: addi a0, a0, %lo(src) -; RV64-NEXT: lh a3, 8(a0) -; RV64-NEXT: lbu a0, 10(a0) -; RV64-NEXT: sd a2, %lo(dst)(a1) -; RV64-NEXT: addi a1, a1, %lo(dst) -; RV64-NEXT: sh a3, 8(a1) -; RV64-NEXT: sb a0, 10(a1) -; RV64-NEXT: li a0, 0 +; RV64-NEXT: lbu a2, 1(a1) +; RV64-NEXT: sb a2, 1(a0) +; RV64-NEXT: lbu a1, 0(a1) +; RV64-NEXT: sb a1, 0(a0) ; RV64-NEXT: ret ; -; RV32-FAST-LABEL: t0: +; RV32-FAST-LABEL: unaligned_memcpy2: ; RV32-FAST: # 
%bb.0: # %entry -; RV32-FAST-NEXT: lui a0, %hi(src) -; RV32-FAST-NEXT: lw a1, %lo(src)(a0) -; RV32-FAST-NEXT: addi a0, a0, %lo(src) -; RV32-FAST-NEXT: lw a2, 4(a0) -; RV32-FAST-NEXT: lw a0, 7(a0) -; RV32-FAST-NEXT: lui a3, %hi(dst) -; RV32-FAST-NEXT: sw a1, %lo(dst)(a3) -; RV32-FAST-NEXT: addi a1, a3, %lo(dst) -; RV32-FAST-NEXT: sw a0, 7(a1) -; RV32-FAST-NEXT: sw a2, 4(a1) -; RV32-FAST-NEXT: li a0, 0 +; RV32-FAST-NEXT: lh a1, 0(a1) +; RV32-FAST-NEXT: sh a1, 0(a0) ; RV32-FAST-NEXT: ret ; -; RV64-FAST-LABEL: t0: +; RV64-FAST-LABEL: unaligned_memcpy2: ; RV64-FAST: # %bb.0: # %entry -; RV64-FAST-NEXT: lui a0, %hi(src) -; RV64-FAST-NEXT: ld a1, %lo(src)(a0) -; RV64-FAST-NEXT: addi a0, a0, %lo(src) -; RV64-FAST-NEXT: lw a0, 7(a0) -; RV64-FAST-NEXT: lui a2, %hi(dst) -; RV64-FAST-NEXT: sd a1, %lo(dst)(a2) -; RV64-FAST-NEXT: addi a1, a2, %lo(dst) -; RV64-FAST-NEXT: sw a0, 7(a1) -; RV64-FAST-NEXT: li a0, 0 +; RV64-FAST-NEXT: lh a1, 0(a1) +; RV64-FAST-NEXT: sh a1, 0(a0) ; RV64-FAST-NEXT: ret entry: - call void @llvm.memcpy.p0.p0.i32(ptr align 8 @dst, ptr align 8 @src, i32 11, i1 false) - ret i32 0 + tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 2, i1 false) + ret void } -define void @t1(ptr nocapture %C) nounwind { -; RV32-LABEL: t1: +define void @unaligned_memcpy3(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: unaligned_memcpy3: ; RV32: # %bb.0: # %entry -; RV32-NEXT: lui a1, %hi(.L.str1) -; RV32-NEXT: addi a1, a1, %lo(.L.str1) -; RV32-NEXT: li a2, 31 +; RV32-NEXT: lbu a2, 2(a1) +; RV32-NEXT: sb a2, 2(a0) +; RV32-NEXT: lbu a2, 1(a1) +; RV32-NEXT: sb a2, 1(a0) +; RV32-NEXT: lbu a1, 0(a1) +; RV32-NEXT: sb a1, 0(a0) +; RV32-NEXT: ret +; +; RV64-LABEL: unaligned_memcpy3: +; RV64: # %bb.0: # %entry +; RV64-NEXT: lbu a2, 2(a1) +; RV64-NEXT: sb a2, 2(a0) +; RV64-NEXT: lbu a2, 1(a1) +; RV64-NEXT: sb a2, 1(a0) +; RV64-NEXT: lbu a1, 0(a1) +; RV64-NEXT: sb a1, 0(a0) +; RV64-NEXT: ret +; +; RV32-FAST-LABEL: unaligned_memcpy3: +; RV32-FAST: # %bb.0: # %entry +; RV32-FAST-NEXT: lbu a2, 2(a1) +; RV32-FAST-NEXT: sb a2, 2(a0) +; RV32-FAST-NEXT: lh a1, 0(a1) +; RV32-FAST-NEXT: sh a1, 0(a0) +; RV32-FAST-NEXT: ret +; +; RV64-FAST-LABEL: unaligned_memcpy3: +; RV64-FAST: # %bb.0: # %entry +; RV64-FAST-NEXT: lbu a2, 2(a1) +; RV64-FAST-NEXT: sb a2, 2(a0) +; RV64-FAST-NEXT: lh a1, 0(a1) +; RV64-FAST-NEXT: sh a1, 0(a0) +; RV64-FAST-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 3, i1 false) + ret void +} + +define void @unaligned_memcpy4(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: unaligned_memcpy4: +; RV32: # %bb.0: # %entry +; RV32-NEXT: lbu a2, 3(a1) +; RV32-NEXT: sb a2, 3(a0) +; RV32-NEXT: lbu a2, 2(a1) +; RV32-NEXT: sb a2, 2(a0) +; RV32-NEXT: lbu a2, 1(a1) +; RV32-NEXT: sb a2, 1(a0) +; RV32-NEXT: lbu a1, 0(a1) +; RV32-NEXT: sb a1, 0(a0) +; RV32-NEXT: ret +; +; RV64-LABEL: unaligned_memcpy4: +; RV64: # %bb.0: # %entry +; RV64-NEXT: lbu a2, 3(a1) +; RV64-NEXT: sb a2, 3(a0) +; RV64-NEXT: lbu a2, 2(a1) +; RV64-NEXT: sb a2, 2(a0) +; RV64-NEXT: lbu a2, 1(a1) +; RV64-NEXT: sb a2, 1(a0) +; RV64-NEXT: lbu a1, 0(a1) +; RV64-NEXT: sb a1, 0(a0) +; RV64-NEXT: ret +; +; RV32-FAST-LABEL: unaligned_memcpy4: +; RV32-FAST: # %bb.0: # %entry +; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a1, 0(a0) +; RV32-FAST-NEXT: ret +; +; RV64-FAST-LABEL: unaligned_memcpy4: +; RV64-FAST: # %bb.0: # %entry +; RV64-FAST-NEXT: lw a1, 0(a1) +; RV64-FAST-NEXT: sw a1, 0(a0) +; RV64-FAST-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 4, i1 
false) + ret void +} + +define void @unaligned_memcpy7(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: unaligned_memcpy7: +; RV32: # %bb.0: # %entry +; RV32-NEXT: lbu a2, 6(a1) +; RV32-NEXT: sb a2, 6(a0) +; RV32-NEXT: lbu a2, 5(a1) +; RV32-NEXT: sb a2, 5(a0) +; RV32-NEXT: lbu a2, 4(a1) +; RV32-NEXT: sb a2, 4(a0) +; RV32-NEXT: lbu a2, 3(a1) +; RV32-NEXT: sb a2, 3(a0) +; RV32-NEXT: lbu a2, 2(a1) +; RV32-NEXT: sb a2, 2(a0) +; RV32-NEXT: lbu a2, 1(a1) +; RV32-NEXT: sb a2, 1(a0) +; RV32-NEXT: lbu a1, 0(a1) +; RV32-NEXT: sb a1, 0(a0) +; RV32-NEXT: ret +; +; RV64-LABEL: unaligned_memcpy7: +; RV64: # %bb.0: # %entry +; RV64-NEXT: lbu a2, 6(a1) +; RV64-NEXT: sb a2, 6(a0) +; RV64-NEXT: lbu a2, 5(a1) +; RV64-NEXT: sb a2, 5(a0) +; RV64-NEXT: lbu a2, 4(a1) +; RV64-NEXT: sb a2, 4(a0) +; RV64-NEXT: lbu a2, 3(a1) +; RV64-NEXT: sb a2, 3(a0) +; RV64-NEXT: lbu a2, 2(a1) +; RV64-NEXT: sb a2, 2(a0) +; RV64-NEXT: lbu a2, 1(a1) +; RV64-NEXT: sb a2, 1(a0) +; RV64-NEXT: lbu a1, 0(a1) +; RV64-NEXT: sb a1, 0(a0) +; RV64-NEXT: ret +; +; RV32-FAST-LABEL: unaligned_memcpy7: +; RV32-FAST: # %bb.0: # %entry +; RV32-FAST-NEXT: lw a2, 3(a1) +; RV32-FAST-NEXT: sw a2, 3(a0) +; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a1, 0(a0) +; RV32-FAST-NEXT: ret +; +; RV64-FAST-LABEL: unaligned_memcpy7: +; RV64-FAST: # %bb.0: # %entry +; RV64-FAST-NEXT: lw a2, 3(a1) +; RV64-FAST-NEXT: sw a2, 3(a0) +; RV64-FAST-NEXT: lw a1, 0(a1) +; RV64-FAST-NEXT: sw a1, 0(a0) +; RV64-FAST-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 7, i1 false) + ret void +} + +define void @unaligned_memcpy8(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: unaligned_memcpy8: +; RV32: # %bb.0: # %entry +; RV32-NEXT: lbu a2, 7(a1) +; RV32-NEXT: sb a2, 7(a0) +; RV32-NEXT: lbu a2, 6(a1) +; RV32-NEXT: sb a2, 6(a0) +; RV32-NEXT: lbu a2, 5(a1) +; RV32-NEXT: sb a2, 5(a0) +; RV32-NEXT: lbu a2, 4(a1) +; RV32-NEXT: sb a2, 4(a0) +; RV32-NEXT: lbu a2, 3(a1) +; RV32-NEXT: sb a2, 3(a0) +; RV32-NEXT: lbu a2, 2(a1) +; RV32-NEXT: sb a2, 2(a0) +; RV32-NEXT: lbu a2, 1(a1) +; RV32-NEXT: sb a2, 1(a0) +; RV32-NEXT: lbu a1, 0(a1) +; RV32-NEXT: sb a1, 0(a0) +; RV32-NEXT: ret +; +; RV64-LABEL: unaligned_memcpy8: +; RV64: # %bb.0: # %entry +; RV64-NEXT: lbu a2, 7(a1) +; RV64-NEXT: sb a2, 7(a0) +; RV64-NEXT: lbu a2, 6(a1) +; RV64-NEXT: sb a2, 6(a0) +; RV64-NEXT: lbu a2, 5(a1) +; RV64-NEXT: sb a2, 5(a0) +; RV64-NEXT: lbu a2, 4(a1) +; RV64-NEXT: sb a2, 4(a0) +; RV64-NEXT: lbu a2, 3(a1) +; RV64-NEXT: sb a2, 3(a0) +; RV64-NEXT: lbu a2, 2(a1) +; RV64-NEXT: sb a2, 2(a0) +; RV64-NEXT: lbu a2, 1(a1) +; RV64-NEXT: sb a2, 1(a0) +; RV64-NEXT: lbu a1, 0(a1) +; RV64-NEXT: sb a1, 0(a0) +; RV64-NEXT: ret +; +; RV32-FAST-LABEL: unaligned_memcpy8: +; RV32-FAST: # %bb.0: # %entry +; RV32-FAST-NEXT: lw a2, 4(a1) +; RV32-FAST-NEXT: sw a2, 4(a0) +; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a1, 0(a0) +; RV32-FAST-NEXT: ret +; +; RV64-FAST-LABEL: unaligned_memcpy8: +; RV64-FAST: # %bb.0: # %entry +; RV64-FAST-NEXT: ld a1, 0(a1) +; RV64-FAST-NEXT: sd a1, 0(a0) +; RV64-FAST-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 8, i1 false) + ret void +} + +define void @unaligned_memcpy15(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: unaligned_memcpy15: +; RV32: # %bb.0: # %entry +; RV32-NEXT: li a2, 15 ; RV32-NEXT: tail memcpy ; -; RV64-LABEL: t1: +; RV64-LABEL: unaligned_memcpy15: ; RV64: # %bb.0: # %entry -; RV64-NEXT: lui a1, %hi(.L.str1) -; RV64-NEXT: addi a1, a1, %lo(.L.str1) -; RV64-NEXT: li a2, 31 +; 
RV64-NEXT: li a2, 15 ; RV64-NEXT: tail memcpy ; -; RV32-FAST-LABEL: t1: +; RV32-FAST-LABEL: unaligned_memcpy15: ; RV32-FAST: # %bb.0: # %entry -; RV32-FAST-NEXT: lui a1, 1141 -; RV32-FAST-NEXT: lui a2, 300325 -; RV32-FAST-NEXT: lui a3, 132181 -; RV32-FAST-NEXT: lui a4, 340483 -; RV32-FAST-NEXT: lui a5, 267556 -; RV32-FAST-NEXT: lui a6, 337154 -; RV32-FAST-NEXT: addi a1, a1, -439 -; RV32-FAST-NEXT: sw a1, 27(a0) -; RV32-FAST-NEXT: lui a1, 320757 -; RV32-FAST-NEXT: addi a2, a2, 1107 -; RV32-FAST-NEXT: addi a3, a3, -689 -; RV32-FAST-NEXT: addi a4, a4, -947 -; RV32-FAST-NEXT: sw a4, 16(a0) -; RV32-FAST-NEXT: sw a3, 20(a0) -; RV32-FAST-NEXT: sw a2, 24(a0) -; RV32-FAST-NEXT: lui a2, 365861 -; RV32-FAST-NEXT: addi a3, a5, 1871 -; RV32-FAST-NEXT: addi a4, a6, 69 -; RV32-FAST-NEXT: addi a1, a1, 1107 -; RV32-FAST-NEXT: addi a2, a2, -1980 -; RV32-FAST-NEXT: sw a2, 0(a0) -; RV32-FAST-NEXT: sw a1, 4(a0) -; RV32-FAST-NEXT: sw a4, 8(a0) -; RV32-FAST-NEXT: sw a3, 12(a0) +; RV32-FAST-NEXT: lw a2, 11(a1) +; RV32-FAST-NEXT: sw a2, 11(a0) +; RV32-FAST-NEXT: lw a2, 8(a1) +; RV32-FAST-NEXT: sw a2, 8(a0) +; RV32-FAST-NEXT: lw a2, 4(a1) +; RV32-FAST-NEXT: sw a2, 4(a0) +; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a1, 0(a0) ; RV32-FAST-NEXT: ret ; -; RV64-FAST-LABEL: t1: +; RV64-FAST-LABEL: unaligned_memcpy15: ; RV64-FAST: # %bb.0: # %entry -; RV64-FAST-NEXT: lui a1, %hi(.L.str1) -; RV64-FAST-NEXT: addi a2, a1, %lo(.L.str1) -; RV64-FAST-NEXT: ld a3, 23(a2) -; RV64-FAST-NEXT: ld a1, %lo(.L.str1)(a1) -; RV64-FAST-NEXT: ld a4, 8(a2) -; RV64-FAST-NEXT: ld a2, 16(a2) -; RV64-FAST-NEXT: sd a3, 23(a0) +; RV64-FAST-NEXT: ld a2, 7(a1) +; RV64-FAST-NEXT: sd a2, 7(a0) +; RV64-FAST-NEXT: ld a1, 0(a1) ; RV64-FAST-NEXT: sd a1, 0(a0) -; RV64-FAST-NEXT: sd a4, 8(a0) -; RV64-FAST-NEXT: sd a2, 16(a0) ; RV64-FAST-NEXT: ret entry: - tail call void @llvm.memcpy.p0.p0.i64(ptr %C, ptr @.str1, i64 31, i1 false) + tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 15, i1 false) ret void } -define void @t2(ptr nocapture %C) nounwind { -; RV32-BOTH-LABEL: t2: -; RV32-BOTH: # %bb.0: # %entry -; RV32-BOTH-NEXT: lui a1, %hi(.L.str2) -; RV32-BOTH-NEXT: addi a1, a1, %lo(.L.str2) -; RV32-BOTH-NEXT: li a2, 36 -; RV32-BOTH-NEXT: tail memcpy +define void @unaligned_memcpy16(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: unaligned_memcpy16: +; RV32: # %bb.0: # %entry +; RV32-NEXT: li a2, 16 +; RV32-NEXT: tail memcpy ; -; RV64-LABEL: t2: +; RV64-LABEL: unaligned_memcpy16: ; RV64: # %bb.0: # %entry -; RV64-NEXT: lui a1, %hi(.L.str2) -; RV64-NEXT: addi a1, a1, %lo(.L.str2) -; RV64-NEXT: li a2, 36 +; RV64-NEXT: li a2, 16 ; RV64-NEXT: tail memcpy ; -; RV64-FAST-LABEL: t2: +; RV32-FAST-LABEL: unaligned_memcpy16: +; RV32-FAST: # %bb.0: # %entry +; RV32-FAST-NEXT: lw a2, 12(a1) +; RV32-FAST-NEXT: sw a2, 12(a0) +; RV32-FAST-NEXT: lw a2, 8(a1) +; RV32-FAST-NEXT: sw a2, 8(a0) +; RV32-FAST-NEXT: lw a2, 4(a1) +; RV32-FAST-NEXT: sw a2, 4(a0) +; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a1, 0(a0) +; RV32-FAST-NEXT: ret +; +; RV64-FAST-LABEL: unaligned_memcpy16: ; RV64-FAST: # %bb.0: # %entry -; RV64-FAST-NEXT: lui a1, %hi(.L.str2) -; RV64-FAST-NEXT: lui a2, 1156 -; RV64-FAST-NEXT: ld a3, %lo(.L.str2)(a1) -; RV64-FAST-NEXT: addi a2, a2, 332 -; RV64-FAST-NEXT: addi a1, a1, %lo(.L.str2) -; RV64-FAST-NEXT: sw a2, 32(a0) ; RV64-FAST-NEXT: ld a2, 8(a1) -; RV64-FAST-NEXT: ld a4, 16(a1) -; RV64-FAST-NEXT: ld a1, 24(a1) -; RV64-FAST-NEXT: sd a3, 0(a0) ; RV64-FAST-NEXT: sd a2, 8(a0) -; RV64-FAST-NEXT: sd a4, 16(a0) -; RV64-FAST-NEXT: sd 
a1, 24(a0) +; RV64-FAST-NEXT: ld a1, 0(a1) +; RV64-FAST-NEXT: sd a1, 0(a0) ; RV64-FAST-NEXT: ret entry: - tail call void @llvm.memcpy.p0.p0.i64(ptr %C, ptr @.str2, i64 36, i1 false) + tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 16, i1 false) ret void } -define void @t3(ptr nocapture %C) nounwind { -; RV32-LABEL: t3: +define void @unaligned_memcpy31(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: unaligned_memcpy31: ; RV32: # %bb.0: # %entry -; RV32-NEXT: lui a1, %hi(.L.str3) -; RV32-NEXT: addi a1, a1, %lo(.L.str3) -; RV32-NEXT: li a2, 24 +; RV32-NEXT: li a2, 31 ; RV32-NEXT: tail memcpy ; -; RV64-LABEL: t3: +; RV64-LABEL: unaligned_memcpy31: ; RV64: # %bb.0: # %entry -; RV64-NEXT: lui a1, %hi(.L.str3) -; RV64-NEXT: addi a1, a1, %lo(.L.str3) -; RV64-NEXT: li a2, 24 +; RV64-NEXT: li a2, 31 ; RV64-NEXT: tail memcpy ; -; RV32-FAST-LABEL: t3: +; RV32-FAST-LABEL: unaligned_memcpy31: ; RV32-FAST: # %bb.0: # %entry -; RV32-FAST-NEXT: lui a1, 1109 -; RV32-FAST-NEXT: lui a2, 340483 -; RV32-FAST-NEXT: lui a3, 267556 -; RV32-FAST-NEXT: lui a4, 337154 -; RV32-FAST-NEXT: lui a5, 320757 -; RV32-FAST-NEXT: addi a1, a1, -689 -; RV32-FAST-NEXT: addi a2, a2, -947 +; RV32-FAST-NEXT: lw a2, 27(a1) +; RV32-FAST-NEXT: sw a2, 27(a0) +; RV32-FAST-NEXT: lw a2, 24(a1) +; RV32-FAST-NEXT: sw a2, 24(a0) +; RV32-FAST-NEXT: lw a2, 20(a1) +; RV32-FAST-NEXT: sw a2, 20(a0) +; RV32-FAST-NEXT: lw a2, 16(a1) ; RV32-FAST-NEXT: sw a2, 16(a0) -; RV32-FAST-NEXT: sw a1, 20(a0) -; RV32-FAST-NEXT: lui a1, 365861 -; RV32-FAST-NEXT: addi a2, a3, 1871 -; RV32-FAST-NEXT: addi a3, a4, 69 -; RV32-FAST-NEXT: addi a4, a5, 1107 -; RV32-FAST-NEXT: addi a1, a1, -1980 -; RV32-FAST-NEXT: sw a1, 0(a0) -; RV32-FAST-NEXT: sw a4, 4(a0) -; RV32-FAST-NEXT: sw a3, 8(a0) +; RV32-FAST-NEXT: lw a2, 12(a1) ; RV32-FAST-NEXT: sw a2, 12(a0) +; RV32-FAST-NEXT: lw a2, 8(a1) +; RV32-FAST-NEXT: sw a2, 8(a0) +; RV32-FAST-NEXT: lw a2, 4(a1) +; RV32-FAST-NEXT: sw a2, 4(a0) +; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a1, 0(a0) ; RV32-FAST-NEXT: ret ; -; RV64-FAST-LABEL: t3: +; RV64-FAST-LABEL: unaligned_memcpy31: ; RV64-FAST: # %bb.0: # %entry -; RV64-FAST-NEXT: lui a1, %hi(.L.str3) -; RV64-FAST-NEXT: ld a2, %lo(.L.str3)(a1) -; RV64-FAST-NEXT: addi a1, a1, %lo(.L.str3) -; RV64-FAST-NEXT: ld a3, 8(a1) -; RV64-FAST-NEXT: ld a1, 16(a1) -; RV64-FAST-NEXT: sd a2, 0(a0) -; RV64-FAST-NEXT: sd a3, 8(a0) -; RV64-FAST-NEXT: sd a1, 16(a0) +; RV64-FAST-NEXT: ld a2, 23(a1) +; RV64-FAST-NEXT: sd a2, 23(a0) +; RV64-FAST-NEXT: ld a2, 16(a1) +; RV64-FAST-NEXT: sd a2, 16(a0) +; RV64-FAST-NEXT: ld a2, 8(a1) +; RV64-FAST-NEXT: sd a2, 8(a0) +; RV64-FAST-NEXT: ld a1, 0(a1) +; RV64-FAST-NEXT: sd a1, 0(a0) ; RV64-FAST-NEXT: ret entry: - tail call void @llvm.memcpy.p0.p0.i64(ptr %C, ptr @.str3, i64 24, i1 false) + tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 31, i1 false) + ret void +} + +; ---------------------------------------------------------------------- +; Fully aligned cases + +define void @aligned_memcpy0(ptr nocapture %dest, ptr %src) nounwind { +; RV32-BOTH-LABEL: aligned_memcpy0: +; RV32-BOTH: # %bb.0: # %entry +; RV32-BOTH-NEXT: ret +; +; RV64-BOTH-LABEL: aligned_memcpy0: +; RV64-BOTH: # %bb.0: # %entry +; RV64-BOTH-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 0, i1 false) + ret void +} + +define void @aligned_memcpy1(ptr nocapture %dest, ptr %src) nounwind { +; RV32-BOTH-LABEL: aligned_memcpy1: +; RV32-BOTH: # %bb.0: # %entry +; RV32-BOTH-NEXT: lbu a1, 0(a1) +; 
RV32-BOTH-NEXT: sb a1, 0(a0) +; RV32-BOTH-NEXT: ret +; +; RV64-BOTH-LABEL: aligned_memcpy1: +; RV64-BOTH: # %bb.0: # %entry +; RV64-BOTH-NEXT: lbu a1, 0(a1) +; RV64-BOTH-NEXT: sb a1, 0(a0) +; RV64-BOTH-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 1, i1 false) + ret void +} + +define void @aligned_memcpy2(ptr nocapture %dest, ptr %src) nounwind { +; RV32-BOTH-LABEL: aligned_memcpy2: +; RV32-BOTH: # %bb.0: # %entry +; RV32-BOTH-NEXT: lh a1, 0(a1) +; RV32-BOTH-NEXT: sh a1, 0(a0) +; RV32-BOTH-NEXT: ret +; +; RV64-BOTH-LABEL: aligned_memcpy2: +; RV64-BOTH: # %bb.0: # %entry +; RV64-BOTH-NEXT: lh a1, 0(a1) +; RV64-BOTH-NEXT: sh a1, 0(a0) +; RV64-BOTH-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 2, i1 false) + ret void +} + +define void @aligned_memcpy3(ptr nocapture %dest, ptr %src) nounwind { +; RV32-BOTH-LABEL: aligned_memcpy3: +; RV32-BOTH: # %bb.0: # %entry +; RV32-BOTH-NEXT: lbu a2, 2(a1) +; RV32-BOTH-NEXT: sb a2, 2(a0) +; RV32-BOTH-NEXT: lh a1, 0(a1) +; RV32-BOTH-NEXT: sh a1, 0(a0) +; RV32-BOTH-NEXT: ret +; +; RV64-BOTH-LABEL: aligned_memcpy3: +; RV64-BOTH: # %bb.0: # %entry +; RV64-BOTH-NEXT: lbu a2, 2(a1) +; RV64-BOTH-NEXT: sb a2, 2(a0) +; RV64-BOTH-NEXT: lh a1, 0(a1) +; RV64-BOTH-NEXT: sh a1, 0(a0) +; RV64-BOTH-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 3, i1 false) ret void } -define void @t4(ptr nocapture %C) nounwind { -; RV32-LABEL: t4: +define void @aligned_memcpy4(ptr nocapture %dest, ptr %src) nounwind { +; RV32-BOTH-LABEL: aligned_memcpy4: +; RV32-BOTH: # %bb.0: # %entry +; RV32-BOTH-NEXT: lw a1, 0(a1) +; RV32-BOTH-NEXT: sw a1, 0(a0) +; RV32-BOTH-NEXT: ret +; +; RV64-BOTH-LABEL: aligned_memcpy4: +; RV64-BOTH: # %bb.0: # %entry +; RV64-BOTH-NEXT: lw a1, 0(a1) +; RV64-BOTH-NEXT: sw a1, 0(a0) +; RV64-BOTH-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 4, i1 false) + ret void +} + +define void @aligned_memcpy7(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: aligned_memcpy7: ; RV32: # %bb.0: # %entry -; RV32-NEXT: lui a1, %hi(.L.str4) -; RV32-NEXT: addi a1, a1, %lo(.L.str4) -; RV32-NEXT: li a2, 18 -; RV32-NEXT: tail memcpy +; RV32-NEXT: lbu a2, 6(a1) +; RV32-NEXT: sb a2, 6(a0) +; RV32-NEXT: lh a2, 4(a1) +; RV32-NEXT: sh a2, 4(a0) +; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: sw a1, 0(a0) +; RV32-NEXT: ret ; -; RV64-LABEL: t4: +; RV64-LABEL: aligned_memcpy7: ; RV64: # %bb.0: # %entry -; RV64-NEXT: lui a1, %hi(.L.str4) -; RV64-NEXT: addi a1, a1, %lo(.L.str4) -; RV64-NEXT: li a2, 18 -; RV64-NEXT: tail memcpy +; RV64-NEXT: lbu a2, 6(a1) +; RV64-NEXT: sb a2, 6(a0) +; RV64-NEXT: lh a2, 4(a1) +; RV64-NEXT: sh a2, 4(a0) +; RV64-NEXT: lw a1, 0(a1) +; RV64-NEXT: sw a1, 0(a0) +; RV64-NEXT: ret ; -; RV32-FAST-LABEL: t4: +; RV32-FAST-LABEL: aligned_memcpy7: ; RV32-FAST: # %bb.0: # %entry -; RV32-FAST-NEXT: li a1, 32 -; RV32-FAST-NEXT: lui a2, 132388 -; RV32-FAST-NEXT: lui a3, 337154 -; RV32-FAST-NEXT: lui a4, 320757 -; RV32-FAST-NEXT: sh a1, 16(a0) -; RV32-FAST-NEXT: lui a1, 365861 -; RV32-FAST-NEXT: addi a2, a2, 1871 -; RV32-FAST-NEXT: addi a3, a3, 69 -; RV32-FAST-NEXT: addi a4, a4, 1107 -; RV32-FAST-NEXT: addi a1, a1, -1980 +; RV32-FAST-NEXT: lw a2, 3(a1) +; RV32-FAST-NEXT: sw a2, 3(a0) +; RV32-FAST-NEXT: lw a1, 0(a1) ; RV32-FAST-NEXT: sw a1, 0(a0) -; RV32-FAST-NEXT: sw a4, 4(a0) -; RV32-FAST-NEXT: sw a3, 8(a0) -; RV32-FAST-NEXT: sw a2, 12(a0) ; RV32-FAST-NEXT: ret ; -; 
RV64-FAST-LABEL: t4: +; RV64-FAST-LABEL: aligned_memcpy7: ; RV64-FAST: # %bb.0: # %entry -; RV64-FAST-NEXT: lui a1, %hi(.L.str4) -; RV64-FAST-NEXT: ld a2, %lo(.L.str4)(a1) -; RV64-FAST-NEXT: addi a1, a1, %lo(.L.str4) -; RV64-FAST-NEXT: ld a1, 8(a1) -; RV64-FAST-NEXT: li a3, 32 -; RV64-FAST-NEXT: sd a2, 0(a0) -; RV64-FAST-NEXT: sd a1, 8(a0) -; RV64-FAST-NEXT: sh a3, 16(a0) +; RV64-FAST-NEXT: lw a2, 3(a1) +; RV64-FAST-NEXT: sw a2, 3(a0) +; RV64-FAST-NEXT: lw a1, 0(a1) +; RV64-FAST-NEXT: sw a1, 0(a0) ; RV64-FAST-NEXT: ret entry: - tail call void @llvm.memcpy.p0.p0.i64(ptr %C, ptr @.str4, i64 18, i1 false) + tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 7, i1 false) + ret void +} + +define void @aligned_memcpy8(ptr nocapture %dest, ptr %src) nounwind { +; RV32-BOTH-LABEL: aligned_memcpy8: +; RV32-BOTH: # %bb.0: # %entry +; RV32-BOTH-NEXT: lw a2, 4(a1) +; RV32-BOTH-NEXT: sw a2, 4(a0) +; RV32-BOTH-NEXT: lw a1, 0(a1) +; RV32-BOTH-NEXT: sw a1, 0(a0) +; RV32-BOTH-NEXT: ret +; +; RV64-BOTH-LABEL: aligned_memcpy8: +; RV64-BOTH: # %bb.0: # %entry +; RV64-BOTH-NEXT: ld a1, 0(a1) +; RV64-BOTH-NEXT: sd a1, 0(a0) +; RV64-BOTH-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 8, i1 false) ret void } -define void @t5(ptr nocapture %C) nounwind { -; RV32-LABEL: t5: +define void @aligned_memcpy15(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: aligned_memcpy15: ; RV32: # %bb.0: # %entry -; RV32-NEXT: li a1, 84 -; RV32-NEXT: li a2, 83 -; RV32-NEXT: li a3, 89 -; RV32-NEXT: li a4, 82 -; RV32-NEXT: li a5, 72 -; RV32-NEXT: li a6, 68 -; RV32-NEXT: sb a2, 4(a0) -; RV32-NEXT: sb a1, 5(a0) -; RV32-NEXT: sb zero, 6(a0) -; RV32-NEXT: sb a6, 0(a0) -; RV32-NEXT: sb a5, 1(a0) -; RV32-NEXT: sb a4, 2(a0) -; RV32-NEXT: sb a3, 3(a0) +; RV32-NEXT: lbu a2, 14(a1) +; RV32-NEXT: sb a2, 14(a0) +; RV32-NEXT: lh a2, 12(a1) +; RV32-NEXT: sh a2, 12(a0) +; RV32-NEXT: lw a2, 8(a1) +; RV32-NEXT: sw a2, 8(a0) +; RV32-NEXT: lw a2, 4(a1) +; RV32-NEXT: sw a2, 4(a0) +; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: sw a1, 0(a0) ; RV32-NEXT: ret ; -; RV64-LABEL: t5: +; RV64-LABEL: aligned_memcpy15: ; RV64: # %bb.0: # %entry -; RV64-NEXT: li a1, 84 -; RV64-NEXT: li a2, 83 -; RV64-NEXT: li a3, 89 -; RV64-NEXT: li a4, 82 -; RV64-NEXT: li a5, 72 -; RV64-NEXT: li a6, 68 -; RV64-NEXT: sb a2, 4(a0) -; RV64-NEXT: sb a1, 5(a0) -; RV64-NEXT: sb zero, 6(a0) -; RV64-NEXT: sb a6, 0(a0) -; RV64-NEXT: sb a5, 1(a0) -; RV64-NEXT: sb a4, 2(a0) -; RV64-NEXT: sb a3, 3(a0) +; RV64-NEXT: lbu a2, 14(a1) +; RV64-NEXT: sb a2, 14(a0) +; RV64-NEXT: lh a2, 12(a1) +; RV64-NEXT: sh a2, 12(a0) +; RV64-NEXT: lw a2, 8(a1) +; RV64-NEXT: sw a2, 8(a0) +; RV64-NEXT: ld a1, 0(a1) +; RV64-NEXT: sd a1, 0(a0) ; RV64-NEXT: ret ; -; RV32-FAST-LABEL: t5: +; RV32-FAST-LABEL: aligned_memcpy15: ; RV32-FAST: # %bb.0: # %entry -; RV32-FAST-NEXT: lui a1, 1349 -; RV32-FAST-NEXT: addi a1, a1, 857 -; RV32-FAST-NEXT: sw a1, 3(a0) -; RV32-FAST-NEXT: lui a1, 365861 -; RV32-FAST-NEXT: addi a1, a1, -1980 +; RV32-FAST-NEXT: lw a2, 11(a1) +; RV32-FAST-NEXT: sw a2, 11(a0) +; RV32-FAST-NEXT: lw a2, 8(a1) +; RV32-FAST-NEXT: sw a2, 8(a0) +; RV32-FAST-NEXT: lw a2, 4(a1) +; RV32-FAST-NEXT: sw a2, 4(a0) +; RV32-FAST-NEXT: lw a1, 0(a1) ; RV32-FAST-NEXT: sw a1, 0(a0) ; RV32-FAST-NEXT: ret ; -; RV64-FAST-LABEL: t5: +; RV64-FAST-LABEL: aligned_memcpy15: ; RV64-FAST: # %bb.0: # %entry -; RV64-FAST-NEXT: lui a1, 1349 -; RV64-FAST-NEXT: addi a1, a1, 857 -; RV64-FAST-NEXT: sw a1, 3(a0) -; RV64-FAST-NEXT: lui a1, 365861 -; RV64-FAST-NEXT: 
addi a1, a1, -1980 -; RV64-FAST-NEXT: sw a1, 0(a0) +; RV64-FAST-NEXT: ld a2, 7(a1) +; RV64-FAST-NEXT: sd a2, 7(a0) +; RV64-FAST-NEXT: ld a1, 0(a1) +; RV64-FAST-NEXT: sd a1, 0(a0) ; RV64-FAST-NEXT: ret entry: - tail call void @llvm.memcpy.p0.p0.i64(ptr %C, ptr @.str5, i64 7, i1 false) + tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 15, i1 false) ret void } -define void @t6() nounwind { -; RV32-LABEL: t6: +define void @aligned_memcpy16(ptr nocapture %dest, ptr %src) nounwind { +; RV32-BOTH-LABEL: aligned_memcpy16: +; RV32-BOTH: # %bb.0: # %entry +; RV32-BOTH-NEXT: lw a2, 12(a1) +; RV32-BOTH-NEXT: sw a2, 12(a0) +; RV32-BOTH-NEXT: lw a2, 8(a1) +; RV32-BOTH-NEXT: sw a2, 8(a0) +; RV32-BOTH-NEXT: lw a2, 4(a1) +; RV32-BOTH-NEXT: sw a2, 4(a0) +; RV32-BOTH-NEXT: lw a1, 0(a1) +; RV32-BOTH-NEXT: sw a1, 0(a0) +; RV32-BOTH-NEXT: ret +; +; RV64-BOTH-LABEL: aligned_memcpy16: +; RV64-BOTH: # %bb.0: # %entry +; RV64-BOTH-NEXT: ld a2, 8(a1) +; RV64-BOTH-NEXT: sd a2, 8(a0) +; RV64-BOTH-NEXT: ld a1, 0(a1) +; RV64-BOTH-NEXT: sd a1, 0(a0) +; RV64-BOTH-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 16, i1 false) + ret void +} + +define void @aligned_memcpy31(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: aligned_memcpy31: ; RV32: # %bb.0: # %entry -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32-NEXT: lui a0, %hi(spool.splbuf) -; RV32-NEXT: addi a0, a0, %lo(spool.splbuf) -; RV32-NEXT: lui a1, %hi(.L.str6) -; RV32-NEXT: addi a1, a1, %lo(.L.str6) -; RV32-NEXT: li a2, 14 -; RV32-NEXT: call memcpy -; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 16 -; RV32-NEXT: ret +; RV32-NEXT: li a2, 31 +; RV32-NEXT: tail memcpy ; -; RV64-LABEL: t6: +; RV64-LABEL: aligned_memcpy31: ; RV64: # %bb.0: # %entry -; RV64-NEXT: addi sp, sp, -16 -; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64-NEXT: lui a0, %hi(spool.splbuf) -; RV64-NEXT: addi a0, a0, %lo(spool.splbuf) -; RV64-NEXT: lui a1, %hi(.L.str6) -; RV64-NEXT: addi a1, a1, %lo(.L.str6) -; RV64-NEXT: li a2, 14 -; RV64-NEXT: call memcpy -; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: lbu a2, 30(a1) +; RV64-NEXT: sb a2, 30(a0) +; RV64-NEXT: lh a2, 28(a1) +; RV64-NEXT: sh a2, 28(a0) +; RV64-NEXT: lw a2, 24(a1) +; RV64-NEXT: sw a2, 24(a0) +; RV64-NEXT: ld a2, 16(a1) +; RV64-NEXT: sd a2, 16(a0) +; RV64-NEXT: ld a2, 8(a1) +; RV64-NEXT: sd a2, 8(a0) +; RV64-NEXT: ld a1, 0(a1) +; RV64-NEXT: sd a1, 0(a0) ; RV64-NEXT: ret ; -; RV32-FAST-LABEL: t6: +; RV32-FAST-LABEL: aligned_memcpy31: ; RV32-FAST: # %bb.0: # %entry -; RV32-FAST-NEXT: lui a0, %hi(spool.splbuf) -; RV32-FAST-NEXT: li a1, 88 -; RV32-FAST-NEXT: sh a1, %lo(spool.splbuf+12)(a0) -; RV32-FAST-NEXT: lui a1, 361862 -; RV32-FAST-NEXT: addi a1, a1, -1960 -; RV32-FAST-NEXT: sw a1, %lo(spool.splbuf+8)(a0) -; RV32-FAST-NEXT: lui a1, 362199 -; RV32-FAST-NEXT: addi a1, a1, 559 -; RV32-FAST-NEXT: sw a1, %lo(spool.splbuf+4)(a0) -; RV32-FAST-NEXT: lui a1, 460503 -; RV32-FAST-NEXT: addi a1, a1, 1071 -; RV32-FAST-NEXT: sw a1, %lo(spool.splbuf)(a0) +; RV32-FAST-NEXT: lw a2, 27(a1) +; RV32-FAST-NEXT: sw a2, 27(a0) +; RV32-FAST-NEXT: lw a2, 24(a1) +; RV32-FAST-NEXT: sw a2, 24(a0) +; RV32-FAST-NEXT: lw a2, 20(a1) +; RV32-FAST-NEXT: sw a2, 20(a0) +; RV32-FAST-NEXT: lw a2, 16(a1) +; RV32-FAST-NEXT: sw a2, 16(a0) +; RV32-FAST-NEXT: lw a2, 12(a1) +; RV32-FAST-NEXT: sw a2, 12(a0) +; RV32-FAST-NEXT: lw a2, 8(a1) +; RV32-FAST-NEXT: sw a2, 8(a0) 
+; RV32-FAST-NEXT: lw a2, 4(a1) +; RV32-FAST-NEXT: sw a2, 4(a0) +; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a1, 0(a0) ; RV32-FAST-NEXT: ret ; -; RV64-FAST-LABEL: t6: +; RV64-FAST-LABEL: aligned_memcpy31: ; RV64-FAST: # %bb.0: # %entry -; RV64-FAST-NEXT: lui a0, %hi(.L.str6) -; RV64-FAST-NEXT: ld a1, %lo(.L.str6)(a0) -; RV64-FAST-NEXT: addi a0, a0, %lo(.L.str6) -; RV64-FAST-NEXT: ld a0, 6(a0) -; RV64-FAST-NEXT: lui a2, %hi(spool.splbuf) -; RV64-FAST-NEXT: sd a1, %lo(spool.splbuf)(a2) -; RV64-FAST-NEXT: sd a0, %lo(spool.splbuf+6)(a2) +; RV64-FAST-NEXT: ld a2, 23(a1) +; RV64-FAST-NEXT: sd a2, 23(a0) +; RV64-FAST-NEXT: ld a2, 16(a1) +; RV64-FAST-NEXT: sd a2, 16(a0) +; RV64-FAST-NEXT: ld a2, 8(a1) +; RV64-FAST-NEXT: sd a2, 8(a0) +; RV64-FAST-NEXT: ld a1, 0(a1) +; RV64-FAST-NEXT: sd a1, 0(a0) ; RV64-FAST-NEXT: ret entry: - call void @llvm.memcpy.p0.p0.i64(ptr @spool.splbuf, ptr @.str6, i64 14, i1 false) + tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 31, i1 false) ret void } -%struct.Foo = type { i32, i32, i32, i32 } +; ------------------------------------------------------------------------ +; A few partially aligned cases + -define void @t7(ptr nocapture %a, ptr nocapture %b) nounwind { -; RV32-BOTH-LABEL: t7: +define void @memcpy16_align4(ptr nocapture %dest, ptr nocapture %src) nounwind { +; RV32-BOTH-LABEL: memcpy16_align4: ; RV32-BOTH: # %bb.0: # %entry ; RV32-BOTH-NEXT: lw a2, 12(a1) ; RV32-BOTH-NEXT: sw a2, 12(a0) @@ -418,7 +688,7 @@ define void @t7(ptr nocapture %a, ptr nocapture %b) nounwind { ; RV32-BOTH-NEXT: sw a1, 0(a0) ; RV32-BOTH-NEXT: ret ; -; RV64-LABEL: t7: +; RV64-LABEL: memcpy16_align4: ; RV64: # %bb.0: # %entry ; RV64-NEXT: lw a2, 12(a1) ; RV64-NEXT: sw a2, 12(a0) @@ -430,7 +700,7 @@ define void @t7(ptr nocapture %a, ptr nocapture %b) nounwind { ; RV64-NEXT: sw a1, 0(a0) ; RV64-NEXT: ret ; -; RV64-FAST-LABEL: t7: +; RV64-FAST-LABEL: memcpy16_align4: ; RV64-FAST: # %bb.0: # %entry ; RV64-FAST-NEXT: ld a2, 8(a1) ; RV64-FAST-NEXT: sd a2, 8(a0) @@ -438,11 +708,58 @@ define void @t7(ptr nocapture %a, ptr nocapture %b) nounwind { ; RV64-FAST-NEXT: sd a1, 0(a0) ; RV64-FAST-NEXT: ret entry: - tail call void @llvm.memcpy.p0.p0.i32(ptr align 4 %a, ptr align 4 %b, i32 16, i1 false) + tail call void @llvm.memcpy.p0.p0.i32(ptr align 4 %dest, ptr align 4 %src, i32 16, i1 false) ret void } +define i32 @memcpy11_align8(ptr nocapture %dest, ptr %src) { +; RV32-LABEL: memcpy11_align8: +; RV32: # %bb.0: # %entry +; RV32-NEXT: lbu a2, 10(a1) +; RV32-NEXT: sb a2, 10(a0) +; RV32-NEXT: lh a2, 8(a1) +; RV32-NEXT: sh a2, 8(a0) +; RV32-NEXT: lw a2, 4(a1) +; RV32-NEXT: sw a2, 4(a0) +; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: sw a1, 0(a0) +; RV32-NEXT: li a0, 0 +; RV32-NEXT: ret +; +; RV64-LABEL: memcpy11_align8: +; RV64: # %bb.0: # %entry +; RV64-NEXT: lbu a2, 10(a1) +; RV64-NEXT: sb a2, 10(a0) +; RV64-NEXT: lh a2, 8(a1) +; RV64-NEXT: sh a2, 8(a0) +; RV64-NEXT: ld a1, 0(a1) +; RV64-NEXT: sd a1, 0(a0) +; RV64-NEXT: li a0, 0 +; RV64-NEXT: ret +; +; RV32-FAST-LABEL: memcpy11_align8: +; RV32-FAST: # %bb.0: # %entry +; RV32-FAST-NEXT: lw a2, 7(a1) +; RV32-FAST-NEXT: sw a2, 7(a0) +; RV32-FAST-NEXT: lw a2, 4(a1) +; RV32-FAST-NEXT: sw a2, 4(a0) +; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a1, 0(a0) +; RV32-FAST-NEXT: li a0, 0 +; RV32-FAST-NEXT: ret +; +; RV64-FAST-LABEL: memcpy11_align8: +; RV64-FAST: # %bb.0: # %entry +; RV64-FAST-NEXT: lw a2, 7(a1) +; RV64-FAST-NEXT: sw a2, 7(a0) +; RV64-FAST-NEXT: ld a1, 0(a1) +; RV64-FAST-NEXT: sd a1, 0(a0) +; 
RV64-FAST-NEXT: li a0, 0 +; RV64-FAST-NEXT: ret +entry: + call void @llvm.memcpy.p0.p0.i32(ptr align 8 %dest, ptr align 8 %src, i32 11, i1 false) + ret i32 0 +} + declare void @llvm.memcpy.p0.p0.i32(ptr nocapture, ptr nocapture, i32, i1) nounwind declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture, i64, i1) nounwind -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; RV64-BOTH: {{.*}} From 94070a52093dcd86cd6f21d182f6ce9f52c3258b Mon Sep 17 00:00:00 2001 From: Sameer Sahasrabuddhe Date: Mon, 13 Jan 2025 09:54:57 +0530 Subject: [PATCH 007/102] [SPIRV] convergence anchor intrinsic does not have a parent token (#122230) --- llvm/include/llvm/IR/IntrinsicInst.h | 6 +++--- .../SPIRVConvergenceRegionAnalysis.cpp | 20 ++++++------------- 2 files changed, 9 insertions(+), 17 deletions(-) diff --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h index 3436216d478e3..6ccbb6b185c7d 100644 --- a/llvm/include/llvm/IR/IntrinsicInst.h +++ b/llvm/include/llvm/IR/IntrinsicInst.h @@ -1873,13 +1873,13 @@ class ConvergenceControlInst : public IntrinsicInst { return isa(V) && classof(cast(V)); } - bool isAnchor() { + bool isAnchor() const { return getIntrinsicID() == Intrinsic::experimental_convergence_anchor; } - bool isEntry() { + bool isEntry() const { return getIntrinsicID() == Intrinsic::experimental_convergence_entry; } - bool isLoop() { + bool isLoop() const { return getIntrinsicID() == Intrinsic::experimental_convergence_loop; } }; diff --git a/llvm/lib/Target/SPIRV/Analysis/SPIRVConvergenceRegionAnalysis.cpp b/llvm/lib/Target/SPIRV/Analysis/SPIRVConvergenceRegionAnalysis.cpp index cc6daf7ef3442..c23a6c3e8bbe8 100644 --- a/llvm/lib/Target/SPIRV/Analysis/SPIRVConvergenceRegionAnalysis.cpp +++ b/llvm/lib/Target/SPIRV/Analysis/SPIRVConvergenceRegionAnalysis.cpp @@ -56,20 +56,12 @@ getConvergenceTokenInternal(BasicBlockType *BB) { "Output type must be an intrinsic instruction."); for (auto &I : *BB) { - if (auto *II = dyn_cast(&I)) { - switch (II->getIntrinsicID()) { - case Intrinsic::experimental_convergence_entry: - case Intrinsic::experimental_convergence_loop: - return II; - case Intrinsic::experimental_convergence_anchor: { - auto Bundle = II->getOperandBundle(LLVMContext::OB_convergencectrl); - assert(Bundle->Inputs.size() == 1 && - Bundle->Inputs[0]->getType()->isTokenTy()); - auto TII = dyn_cast(Bundle->Inputs[0].get()); - assert(TII != nullptr); - return TII; - } - } + if (auto *CI = dyn_cast(&I)) { + // Make sure that the anchor or entry intrinsics did not reach here with a + // parent token. This should have failed the verifier. 
+ assert(CI->isLoop() || + !CI->getOperandBundle(LLVMContext::OB_convergencectrl)); + return CI; } if (auto *CI = dyn_cast(&I)) { From 30e902215c44ba8428a8871573bb41cdf2a0fa4a Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Sun, 12 Jan 2025 23:40:25 -0500 Subject: [PATCH 008/102] [AMDGPU] Fix an invalid cast in `AMDGPULateCodeGenPrepare::visitLoadInst` (#122494) Fixes: SWDEV-507695 --- .../AMDGPU/AMDGPULateCodeGenPrepare.cpp | 7 +++- .../CodeGen/AMDGPU/invalid-cast-load-i1.ll | 37 +++++++++++++++++++ 2 files changed, 42 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/invalid-cast-load-i1.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp index 830b50307f837..f4e651ec477d3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp @@ -464,8 +464,11 @@ bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) { NewLd->setMetadata(LLVMContext::MD_range, nullptr); unsigned ShAmt = Adjust * 8; - auto *NewVal = IRB.CreateBitCast( - IRB.CreateTrunc(IRB.CreateLShr(NewLd, ShAmt), IntNTy), LI.getType()); + Value *NewVal = IRB.CreateBitCast( + IRB.CreateTrunc(IRB.CreateLShr(NewLd, ShAmt), + DL.typeSizeEqualsStoreSize(LI.getType()) ? IntNTy + : LI.getType()), + LI.getType()); LI.replaceAllUsesWith(NewVal); DeadInsts.emplace_back(&LI); diff --git a/llvm/test/CodeGen/AMDGPU/invalid-cast-load-i1.ll b/llvm/test/CodeGen/AMDGPU/invalid-cast-load-i1.ll new file mode 100644 index 0000000000000..621187100f323 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/invalid-cast-load-i1.ll @@ -0,0 +1,37 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a %s -o - | FileCheck %s + +define amdgpu_kernel void @load_idx_idy(ptr addrspace(4) %disp, ptr %g) { +; CHECK-LABEL: load_idx_idy: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_load_dword s6, s[4:5], 0x4 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_lshr_b32 s4, s6, 16 +; CHECK-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 +; CHECK-NEXT: s_lshl_b64 s[4:5], s[4:5], 6 +; CHECK-NEXT: s_add_u32 s0, s0, s4 +; CHECK-NEXT: s_addc_u32 s1, s1, s5 +; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:4 +; CHECK-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: s_endpgm +entry: + %disp1 = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() + %gep_y = getelementptr i8, ptr addrspace(4) %disp1, i64 6 + %L = load i1, ptr addrspace(4) %gep_y, align 1 + %idxprom = sext i1 %L to i64 + %gep0 = getelementptr <32 x i16>, ptr addrspace(4) %disp, i64 %idxprom + %gep1 = getelementptr i8, ptr addrspace(4) %gep0, i64 4 + %L1 = load i8, ptr addrspace(4) %gep1 + store i8 %L1, ptr %g + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef nonnull align 4 ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() #0 + +attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } From d9c307d9c48d6cc67368c918fa4f337e0cd1cdd3 Mon Sep 17 00:00:00 2001 From: Akshat Oke Date: Mon, 13 Jan 2025 10:11:40 +0530 Subject: [PATCH 009/102] [AMDGPU][NewPM] Port AMDGPURemoveIncompatibleFunctions to NPM (#122261) --- llvm/lib/Target/AMDGPU/AMDGPU.h | 2 +- 
llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 1 + .../AMDGPURemoveIncompatibleFunctions.cpp | 53 +++++++++++++------ .../AMDGPURemoveIncompatibleFunctions.h | 26 +++++++++ .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 6 ++- .../AMDGPU/remove-incompatible-functions.ll | 8 +++ .../AMDGPU/remove-incompatible-s-time.ll | 10 ++++ .../remove-incompatible-wave32-feature.ll | 8 +++ 8 files changed, 96 insertions(+), 18 deletions(-) create mode 100644 llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.h diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index ad5ee75f0c5d1..78667e628ec1e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -380,7 +380,7 @@ extern char &AMDGPUAnnotateUniformValuesLegacyPassID; void initializeAMDGPUCodeGenPreparePass(PassRegistry&); extern char &AMDGPUCodeGenPrepareID; -void initializeAMDGPURemoveIncompatibleFunctionsPass(PassRegistry &); +void initializeAMDGPURemoveIncompatibleFunctionsLegacyPass(PassRegistry &); extern char &AMDGPURemoveIncompatibleFunctionsID; void initializeAMDGPULateCodeGenPrepareLegacyPass(PassRegistry &); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index 182e825a59a41..da594be992cb4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -26,6 +26,7 @@ MODULE_PASS("amdgpu-perf-hint", AMDGPUPerfHintAnalysisPass( *static_cast(this))) MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass()) +MODULE_PASS("amdgpu-remove-incompatible-functions", AMDGPURemoveIncompatibleFunctionsPass(*this)) MODULE_PASS("amdgpu-unify-metadata", AMDGPUUnifyMetadataPass()) #undef MODULE_PASS diff --git a/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp b/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp index 3a87070a326c2..e2e5c57397d02 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp @@ -12,6 +12,7 @@ // //===----------------------------------------------------------------------===// +#include "AMDGPURemoveIncompatibleFunctions.h" #include "AMDGPU.h" #include "GCNSubtarget.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" @@ -33,25 +34,16 @@ namespace { using Generation = AMDGPUSubtarget::Generation; -class AMDGPURemoveIncompatibleFunctions : public ModulePass { +class AMDGPURemoveIncompatibleFunctions { public: - static char ID; - AMDGPURemoveIncompatibleFunctions(const TargetMachine *TM = nullptr) - : ModulePass(ID), TM(TM) { + : TM(TM) { assert(TM && "No TargetMachine!"); } - - StringRef getPassName() const override { - return "AMDGPU Remove Incompatible Functions"; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override {} - /// Checks a single function, returns true if the function must be deleted. 
bool checkFunction(Function &F); - bool runOnModule(Module &M) override { + bool run(Module &M) { assert(TM->getTargetTriple().isAMDGCN()); SmallVector FnsToDelete; @@ -71,6 +63,28 @@ class AMDGPURemoveIncompatibleFunctions : public ModulePass { const TargetMachine *TM = nullptr; }; +class AMDGPURemoveIncompatibleFunctionsLegacy : public ModulePass { +public: + static char ID; + + AMDGPURemoveIncompatibleFunctionsLegacy(const TargetMachine *TM) + : ModulePass(ID), TM(TM) {} + + bool runOnModule(Module &M) override { + AMDGPURemoveIncompatibleFunctions Pass(TM); + return Pass.run(M); + } + + StringRef getPassName() const override { + return "AMDGPU Remove Incompatible Functions"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override {} + +private: + const TargetMachine *TM = nullptr; +}; + StringRef getFeatureName(unsigned Feature) { for (const SubtargetFeatureKV &KV : AMDGPUFeatureKV) if (Feature == KV.Value) @@ -131,6 +145,15 @@ void reportFunctionRemoved(Function &F, unsigned Feature) { } } // end anonymous namespace +PreservedAnalyses +AMDGPURemoveIncompatibleFunctionsPass::run(Module &M, + ModuleAnalysisManager &MAM) { + AMDGPURemoveIncompatibleFunctions Impl(TM); + if (Impl.run(M)) + return PreservedAnalyses::none(); + return PreservedAnalyses::all(); +} + bool AMDGPURemoveIncompatibleFunctions::checkFunction(Function &F) { if (F.isDeclaration()) return false; @@ -182,12 +205,12 @@ bool AMDGPURemoveIncompatibleFunctions::checkFunction(Function &F) { return false; } -INITIALIZE_PASS(AMDGPURemoveIncompatibleFunctions, DEBUG_TYPE, +INITIALIZE_PASS(AMDGPURemoveIncompatibleFunctionsLegacy, DEBUG_TYPE, "AMDGPU Remove Incompatible Functions", false, false) -char AMDGPURemoveIncompatibleFunctions::ID = 0; +char AMDGPURemoveIncompatibleFunctionsLegacy::ID = 0; ModulePass * llvm::createAMDGPURemoveIncompatibleFunctionsPass(const TargetMachine *TM) { - return new AMDGPURemoveIncompatibleFunctions(TM); + return new AMDGPURemoveIncompatibleFunctionsLegacy(TM); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.h b/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.h new file mode 100644 index 0000000000000..e4c858588ece8 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.h @@ -0,0 +1,26 @@ +//===- AMDGPURemoveIncompatibleFunctions.h ----------------------*- C++- *-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_REMOVEINCOMPATIBLEFUNCTIONS_H +#define LLVM_LIB_TARGET_AMDGPU_REMOVEINCOMPATIBLEFUNCTIONS_H + +#include "llvm/IR/PassManager.h" +#include "llvm/Target/TargetMachine.h" + +namespace llvm { +class AMDGPURemoveIncompatibleFunctionsPass + : public PassInfoMixin { + const TargetMachine *TM; + +public: + AMDGPURemoveIncompatibleFunctionsPass(const TargetMachine &TM) : TM(&TM) {} + PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM); +}; +} // namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_REMOVEINCOMPATIBLEFUNCTIONS_H diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 0c9d7d00a8a4a..6058f9709c38c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -23,6 +23,7 @@ #include "AMDGPUISelDAGToDAG.h" #include "AMDGPUMacroFusion.h" #include "AMDGPUPerfHintAnalysis.h" +#include "AMDGPURemoveIncompatibleFunctions.h" #include "AMDGPUSplitModule.h" #include "AMDGPUTargetObjectFile.h" #include "AMDGPUTargetTransformInfo.h" @@ -507,7 +508,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeAMDGPUPromoteAllocaToVectorPass(*PR); initializeAMDGPUCodeGenPreparePass(*PR); initializeAMDGPULateCodeGenPrepareLegacyPass(*PR); - initializeAMDGPURemoveIncompatibleFunctionsPass(*PR); + initializeAMDGPURemoveIncompatibleFunctionsLegacyPass(*PR); initializeAMDGPULowerModuleLDSLegacyPass(*PR); initializeAMDGPULowerBufferFatPointersPass(*PR); initializeAMDGPUReserveWWMRegsPass(*PR); @@ -1925,7 +1926,8 @@ AMDGPUCodeGenPassBuilder::AMDGPUCodeGenPassBuilder( } void AMDGPUCodeGenPassBuilder::addIRPasses(AddIRPass &addPass) const { - // TODO: Missing AMDGPURemoveIncompatibleFunctions + if (RemoveIncompatibleFunctions && TM.getTargetTriple().isAMDGCN()) + addPass(AMDGPURemoveIncompatibleFunctionsPass(TM)); addPass(AMDGPUPrintfRuntimeBindingPass()); if (LowerCtorDtor) diff --git a/llvm/test/CodeGen/AMDGPU/remove-incompatible-functions.ll b/llvm/test/CodeGen/AMDGPU/remove-incompatible-functions.ll index e0b694ee58f0e..0359bb7183974 100644 --- a/llvm/test/CodeGen/AMDGPU/remove-incompatible-functions.ll +++ b/llvm/test/CodeGen/AMDGPU/remove-incompatible-functions.ll @@ -4,11 +4,19 @@ ; RUN: FileCheck --check-prefix=WARN-GFX7 %s < %t ; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s +; RUN: llc -enable-new-pm -mtriple=amdgcn -mcpu=bonaire -stop-after=amdgpu-remove-incompatible-functions\ +; RUN: -pass-remarks=amdgpu-remove-incompatible-functions < %s 2>%t | FileCheck -check-prefixes=GFX7,IR %s +; RUN: FileCheck --check-prefix=WARN-GFX7 %s < %t + ; RUN: llc -mtriple=amdgcn -mcpu=fiji -stop-after=amdgpu-remove-incompatible-functions\ ; RUN: -pass-remarks=amdgpu-remove-incompatible-functions < %s 2>%t | FileCheck -check-prefixes=GFX8,IR %s ; RUN: FileCheck --check-prefix=WARN-GFX8 %s < %t ; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s +; RUN: llc -enable-new-pm -mtriple=amdgcn -mcpu=fiji -stop-after=amdgpu-remove-incompatible-functions\ +; RUN: -pass-remarks=amdgpu-remove-incompatible-functions < %s 2>%t | FileCheck -check-prefixes=GFX8,IR %s +; RUN: FileCheck --check-prefix=WARN-GFX8 %s < %t + ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -stop-after=amdgpu-remove-incompatible-functions\ ; RUN: -pass-remarks=amdgpu-remove-incompatible-functions < %s 2>%t | 
FileCheck -check-prefixes=GFX9,GFX906,IR %s ; RUN: FileCheck --check-prefix=WARN-GFX906 %s < %t diff --git a/llvm/test/CodeGen/AMDGPU/remove-incompatible-s-time.ll b/llvm/test/CodeGen/AMDGPU/remove-incompatible-s-time.ll index 32fed3ba22c59..676ba1480e6d2 100644 --- a/llvm/test/CodeGen/AMDGPU/remove-incompatible-s-time.ll +++ b/llvm/test/CodeGen/AMDGPU/remove-incompatible-s-time.ll @@ -4,11 +4,21 @@ ; RUN: FileCheck -allow-empty --check-prefixes=WARN-REALTIME,WARN-MEMTIME %s < %t ; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s +; RUN: llc -enable-new-pm -mtriple=amdgcn -mcpu=gfx1030 -stop-after=amdgpu-remove-incompatible-functions\ +; RUN: -pass-remarks=amdgpu-remove-incompatible-functions %s -o - 2>%t | FileCheck -check-prefixes=COMPATIBLE,REALTIME,MEMTIME %s +; RUN: FileCheck -allow-empty --check-prefixes=WARN-REALTIME,WARN-MEMTIME %s < %t +; RUN: llc -enable-new-pm -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s + ; RUN: llc -mtriple=amdgcn -mcpu=gfx1102 -stop-after=amdgpu-remove-incompatible-functions\ ; RUN: -pass-remarks=amdgpu-remove-incompatible-functions %s -o - 2>%t | FileCheck -check-prefixes=INCOMPATIBLE,NOREALTIME,NOMEMTIME %s ; RUN: FileCheck --check-prefixes=WARN-NOREALTIME,WARN-NOMEMTIME %s < %t ; RUN: llc -mtriple=amdgcn -mcpu=gfx1102 -verify-machineinstrs < %s +; RUN: llc -enable-new-pm -mtriple=amdgcn -mcpu=gfx1102 -stop-after=amdgpu-remove-incompatible-functions\ +; RUN: -pass-remarks=amdgpu-remove-incompatible-functions %s -o - 2>%t | FileCheck -check-prefixes=INCOMPATIBLE,NOREALTIME,NOMEMTIME %s +; RUN: FileCheck --check-prefixes=WARN-NOREALTIME,WARN-NOMEMTIME %s < %t +; RUN: llc -enable-new-pm -mtriple=amdgcn -mcpu=gfx1102 -verify-machineinstrs < %s + ; Note: This test checks the IR, but also has a run line to codegen the file just to check we ; do not crash when trying to select those functions. 
diff --git a/llvm/test/CodeGen/AMDGPU/remove-incompatible-wave32-feature.ll b/llvm/test/CodeGen/AMDGPU/remove-incompatible-wave32-feature.ll index 406c953a06d97..75a388eb1229b 100644 --- a/llvm/test/CodeGen/AMDGPU/remove-incompatible-wave32-feature.ll +++ b/llvm/test/CodeGen/AMDGPU/remove-incompatible-wave32-feature.ll @@ -12,10 +12,18 @@ ; RUN: -pass-remarks=amdgpu-remove-incompatible-functions < %s 2>%t | FileCheck -check-prefixes=GFX10 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s +; RUN: llc -enable-new-pm -mtriple=amdgcn -mcpu=gfx1011 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=amdgpu-remove-incompatible-functions\ +; RUN: -pass-remarks=amdgpu-remove-incompatible-functions < %s 2>%t | FileCheck -check-prefixes=GFX10 %s +; RUN: llc -enable-new-pm -mtriple=amdgcn -mcpu=gfx1011 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s + ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=amdgpu-remove-incompatible-functions\ ; RUN: -pass-remarks=amdgpu-remove-incompatible-functions < %s 2>%t | FileCheck -check-prefixes=GFX11 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s +; RUN: llc -enable-new-pm -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=amdgpu-remove-incompatible-functions\ +; RUN: -pass-remarks=amdgpu-remove-incompatible-functions < %s 2>%t | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -enable-new-pm -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s + ; WARN-GFX906: removing function 'needs_wavefrontsize32': +wavefrontsize32 is not supported on the current target ; WARN-GFX906-NOT: not supported From 7e9123e35c2249ffc05b87aeafef7a2fbdb5a8b7 Mon Sep 17 00:00:00 2001 From: wldfngrs Date: Mon, 13 Jan 2025 05:46:53 +0100 Subject: [PATCH 010/102] [libc][math][c23] Add tanf16 function (#121018) - Implementation of tan for 16-bit floating point inputs. 
- Exhaustive tests across the 16-bit input range --- libc/config/linux/x86_64/entrypoints.txt | 1 + libc/docs/headers/math/index.rst | 2 +- libc/include/math.yaml | 7 ++ libc/src/math/CMakeLists.txt | 1 + libc/src/math/generic/CMakeLists.txt | 19 ++++ libc/src/math/generic/sincosf16_utils.h | 25 +++-- libc/src/math/generic/tanf16.cpp | 115 +++++++++++++++++++++++ libc/src/math/generic/tanpif16.cpp | 2 +- libc/src/math/tanf16.h | 21 +++++ libc/test/src/math/CMakeLists.txt | 11 +++ libc/test/src/math/cosf16_test.cpp | 2 +- libc/test/src/math/smoke/CMakeLists.txt | 11 +++ libc/test/src/math/smoke/tanf16_test.cpp | 34 +++++++ libc/test/src/math/tanf16_test.cpp | 40 ++++++++ 14 files changed, 275 insertions(+), 16 deletions(-) create mode 100644 libc/src/math/generic/tanf16.cpp create mode 100644 libc/src/math/tanf16.h create mode 100644 libc/test/src/math/smoke/tanf16_test.cpp create mode 100644 libc/test/src/math/tanf16_test.cpp diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index e7b049c0a6638..723853b2230ae 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -721,6 +721,7 @@ if(LIBC_TYPES_HAS_FLOAT16) libc.src.math.sinhf16 libc.src.math.sinpif16 libc.src.math.sqrtf16 + libc.src.math.tanf16 libc.src.math.tanhf16 libc.src.math.tanpif16 libc.src.math.totalorderf16 diff --git a/libc/docs/headers/math/index.rst b/libc/docs/headers/math/index.rst index 2808165ad539b..8548e4a5773bc 100644 --- a/libc/docs/headers/math/index.rst +++ b/libc/docs/headers/math/index.rst @@ -346,7 +346,7 @@ Higher Math Functions +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | sqrt | |check| | |check| | |check| | |check| | |check| | 7.12.7.10 | F.10.4.10 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ -| tan | |check| | |check| | | | | 7.12.4.7 | F.10.1.7 | +| tan | |check| | |check| | | |check| | | 7.12.4.7 | F.10.1.7 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | tanh | |check| | | | |check| | | 7.12.5.6 | F.10.2.6 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ diff --git a/libc/include/math.yaml b/libc/include/math.yaml index 831d045745677..3a660a59d3605 100644 --- a/libc/include/math.yaml +++ b/libc/include/math.yaml @@ -2418,6 +2418,13 @@ functions: return_type: float arguments: - type: float + - name: tanf16 + standards: + - stdc + return_type: _Float16 + arguments: + - type: _Float16 + guard: LIBC_TYPES_HAS_FLOAT16 - name: tanhf standards: - stdc diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt index e4e2c49642f2d..fe5ebd793b40a 100644 --- a/libc/src/math/CMakeLists.txt +++ b/libc/src/math/CMakeLists.txt @@ -501,6 +501,7 @@ add_math_entrypoint_object(sqrtf128) add_math_entrypoint_object(tan) add_math_entrypoint_object(tanf) +add_math_entrypoint_object(tanf16) add_math_entrypoint_object(tanh) add_math_entrypoint_object(tanhf) diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt index 382f5b362e2eb..0e57051807b33 100644 --- 
a/libc/src/math/generic/CMakeLists.txt +++ b/libc/src/math/generic/CMakeLists.txt @@ -661,6 +661,25 @@ add_entrypoint_object( ${libc_opt_high_flag} ) +add_entrypoint_object( + tanf16 + SRCS + tanf16.cpp + HDRS + ../tanf16.h + DEPENDS + .sincosf16_utils + libc.hdr.errno_macros + libc.hdr.fenv_macros + libc.src.__support.FPUtil.cast + libc.src.__support.FPUtil.fenv_impl + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.except_value_utils + libc.src.__support.FPUtil.multiply_add + libc.src.__support.macros.optimization + libc.src.__support.macros.properties.types +) + add_entrypoint_object( tanpif16 SRCS diff --git a/libc/src/math/generic/sincosf16_utils.h b/libc/src/math/generic/sincosf16_utils.h index 87b1dde560c5e..133896b5de7a3 100644 --- a/libc/src/math/generic/sincosf16_utils.h +++ b/libc/src/math/generic/sincosf16_utils.h @@ -47,24 +47,23 @@ LIBC_INLINE int32_t range_reduction_sincospif16(float x, float &y) { // Recall, range reduction: // k = round(x * 32/pi) -// y = x * 32/pi - k // -// The constant 0x1.45f306dc9c883p3 is 32/pi rounded to double-precision. -// 32/pi is generated by Sollya with the following commands: -// > display = hexadecimal; -// > round(32/pi, D, RN); -// -// The precision choice of 'double' is to minimize rounding errors -// in this initial scaling step, preserving enough bits so errors accumulated -// while computing the subtraction: y = x * 32/pi - round(x * 32/pi) +// The precision choice of 'double' in the following function is to minimize +// rounding errors in this initial scaling step, +// preserving enough bits so errors accumulated while computing the subtraction: +// y = x * 32/pi - round(x * 32/pi) // are beyond the least-significant bit of single-precision used during // further intermediate computation. LIBC_INLINE int32_t range_reduction_sincosf16(float x, float &y) { - double prod = x * 0x1.45f306dc9c883p3; - double kf = fputil::nearest_integer(prod); - y = static_cast(prod - kf); + // Generated by Sollya with: + // > D(32/pi); + constexpr double THIRTYTWO_OVER_PI = 0x1.45f306dc9c883p3; - return static_cast(kf); + double prod = x * THIRTYTWO_OVER_PI; + double kd = fputil::nearest_integer(prod); + y = static_cast(prod - kd); + + return static_cast(kd); } static LIBC_INLINE void sincosf16_poly_eval(int32_t k, float y, float &sin_k, diff --git a/libc/src/math/generic/tanf16.cpp b/libc/src/math/generic/tanf16.cpp new file mode 100644 index 0000000000000..48aa51e456a8a --- /dev/null +++ b/libc/src/math/generic/tanf16.cpp @@ -0,0 +1,115 @@ +//===-- Half-precision tan(x) function ------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception. 
+// +//===----------------------------------------------------------------------===// + +#include "src/math/tanf16.h" +#include "hdr/errno_macros.h" +#include "hdr/fenv_macros.h" +#include "sincosf16_utils.h" +#include "src/__support/FPUtil/FEnvImpl.h" +#include "src/__support/FPUtil/FPBits.h" +#include "src/__support/FPUtil/cast.h" +#include "src/__support/FPUtil/except_value_utils.h" +#include "src/__support/FPUtil/multiply_add.h" +#include "src/__support/macros/optimization.h" + +namespace LIBC_NAMESPACE_DECL { + +constexpr size_t N_EXCEPTS = 9; + +constexpr fputil::ExceptValues TANF16_EXCEPTS{{ + // (input, RZ output, RU offset, RD offset, RN offset) + {0x2894, 0x2894, 1, 0, 1}, + {0x3091, 0x3099, 1, 0, 0}, + {0x3098, 0x30a0, 1, 0, 0}, + {0x55ed, 0x3911, 1, 0, 0}, + {0x607b, 0xc638, 0, 1, 1}, + {0x674e, 0x3b7d, 1, 0, 0}, + {0x6807, 0x4014, 1, 0, 1}, + {0x6f4d, 0xbe19, 0, 1, 1}, + {0x7330, 0xcb62, 0, 1, 0}, +}}; + +LLVM_LIBC_FUNCTION(float16, tanf16, (float16 x)) { + using FPBits = fputil::FPBits; + FPBits xbits(x); + + uint16_t x_u = xbits.uintval(); + uint16_t x_abs = x_u & 0x7fff; + bool x_sign = x_u >> 15; + float xf = x; + + // Handle exceptional values + if (auto r = TANF16_EXCEPTS.lookup_odd(x_abs, x_sign); + LIBC_UNLIKELY(r.has_value())) + return r.value(); + + // |x| <= 0x1.d1p-5 + if (LIBC_UNLIKELY(x_abs <= 0x2b44)) { + // |x| <= 0x1.398p-11 + if (LIBC_UNLIKELY(x_abs <= 0x10e6)) { + // tan(+/-0) = +/-0 + if (LIBC_UNLIKELY(x_abs == 0)) + return x; + + int rounding = fputil::quick_get_round(); + + // Exhaustive tests show that, when: + // x > 0, and rounding upward or + // x < 0, and rounding downward then, + // tan(x) = x * 2^-11 + x + if ((xbits.is_pos() && rounding == FE_UPWARD) || + (xbits.is_neg() && rounding == FE_DOWNWARD)) + return fputil::cast(fputil::multiply_add(xf, 0x1.0p-11f, xf)); + return x; + } + + float xsq = xf * xf; + + // Degree-6 minimax odd polynomial of tan(x) generated by Sollya with: + // > P = fpminimax(tan(x)/x, [|0, 2, 4, 6|], [|1, SG...|], [0, pi/32]); + float result = fputil::polyeval(xsq, 0x1p0f, 0x1.555556p-2f, 0x1.110ee4p-3f, + 0x1.be80f6p-5f); + + return fputil::cast(xf * result); + } + + // tan(+/-inf) = NaN, and tan(NaN) = NaN + if (LIBC_UNLIKELY(x_abs >= 0x7c00)) { + // x = +/-inf + if (x_abs == 0x7c00) { + fputil::set_errno_if_required(EDOM); + fputil::raise_except_if_required(FE_INVALID); + } + + return x + FPBits::quiet_nan().get_val(); + } + + // Range reduction: + // For |x| > pi/32, we perform range reduction as follows: + // Find k and y such that: + // x = (k + y) * pi/32; + // k is an integer, |y| < 0.5 + // + // This is done by performing: + // k = round(x * 32/pi) + // y = x * 32/pi - k + // + // Once k and y are computed, we then deduce the answer by the formula: + // tan(x) = sin(x) / cos(x) + // = (sin_y * cos_k + cos_y * sin_k) / (cos_y * cos_k - sin_y * sin_k) + float sin_k, cos_k, sin_y, cosm1_y; + sincosf16_eval(xf, sin_k, cos_k, sin_y, cosm1_y); + + // Note that, cosm1_y = cos_y - 1: + using fputil::multiply_add; + return fputil::cast( + multiply_add(sin_y, cos_k, multiply_add(cosm1_y, sin_k, sin_k)) / + multiply_add(sin_y, -sin_k, multiply_add(cosm1_y, cos_k, cos_k))); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/tanpif16.cpp b/libc/src/math/generic/tanpif16.cpp index 67635536ee319..cf4f9917d4537 100644 --- a/libc/src/math/generic/tanpif16.cpp +++ b/libc/src/math/generic/tanpif16.cpp @@ -79,7 +79,7 @@ LLVM_LIBC_FUNCTION(float16, tanpif16, (float16 x)) { // k = round(x * 32) // y = x * 32 - k 
// - // Once k and y are computed, we then deduce the answer by tthe formula: + // Once k and y are computed, we then deduce the answer by the formula: // tan(x) = sin(x) / cos(x) // = (sin_y * cos_k + cos_y * sin_k) / (cos_y * cos_k - sin_y * sin_k) float xf = x; diff --git a/libc/src/math/tanf16.h b/libc/src/math/tanf16.h new file mode 100644 index 0000000000000..bf1b61e9837f7 --- /dev/null +++ b/libc/src/math/tanf16.h @@ -0,0 +1,21 @@ +//===-- Implementation header for tanf16 ------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_TANF16_H +#define LLVM_LIBC_SRC_MATH_TANF16_H + +#include "src/__support/macros/config.h" +#include "src/__support/macros/properties/types.h" + +namespace LIBC_NAMESPACE_DECL { + +float16 tanf16(float16 x); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_MATH_TANF16_H diff --git a/libc/test/src/math/CMakeLists.txt b/libc/test/src/math/CMakeLists.txt index 16e7d4957ba11..ae8518ee4b4cc 100644 --- a/libc/test/src/math/CMakeLists.txt +++ b/libc/test/src/math/CMakeLists.txt @@ -190,6 +190,17 @@ add_fp_unittest( libc.src.__support.FPUtil.fp_bits ) +add_fp_unittest( + tanf16_test + NEED_MPFR + SUITE + libc-math-unittests + SRCS + tanf16_test.cpp + DEPENDS + libc.src.math.tanf16 +) + add_fp_unittest( tanpif16_test NEED_MPFR diff --git a/libc/test/src/math/cosf16_test.cpp b/libc/test/src/math/cosf16_test.cpp index 9e4687f0325c4..b744e7817e4ba 100644 --- a/libc/test/src/math/cosf16_test.cpp +++ b/libc/test/src/math/cosf16_test.cpp @@ -17,7 +17,7 @@ namespace mpfr = LIBC_NAMESPACE::testing::mpfr; // Range: [0, Inf] static constexpr uint16_t POS_START = 0x0000U; -static constexpr uint16_t POS_STOP = 0x7c00u; +static constexpr uint16_t POS_STOP = 0x7c00U; // Range: [-Inf, 0] static constexpr uint16_t NEG_START = 0x8000U; diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt index 31f85a3ecfd27..e23e7f41222d4 100644 --- a/libc/test/src/math/smoke/CMakeLists.txt +++ b/libc/test/src/math/smoke/CMakeLists.txt @@ -121,6 +121,17 @@ add_fp_unittest( libc.src.__support.FPUtil.fp_bits ) +add_fp_unittest( + tanf16_test + SUITE + libc-math-smoke-tests + SRCS + tanf16_test.cpp + DEPENDS + libc.src.errno.errno + libc.src.math.tanf16 +) + add_fp_unittest( tanpif16_test SUITE diff --git a/libc/test/src/math/smoke/tanf16_test.cpp b/libc/test/src/math/smoke/tanf16_test.cpp new file mode 100644 index 0000000000000..39d1182ba891e --- /dev/null +++ b/libc/test/src/math/smoke/tanf16_test.cpp @@ -0,0 +1,34 @@ +//===-- Unittests for tanf16 ----------------------------------------------===// +// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception. 
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/errno/libc_errno.h"
+#include "src/math/tanf16.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+
+using LlvmLibcTanf16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
+
+TEST_F(LlvmLibcTanf16Test, SpecialNumbers) {
+  LIBC_NAMESPACE::libc_errno = 0;
+
+  EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::tanf16(aNaN));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ(zero, LIBC_NAMESPACE::tanf16(zero));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ(neg_zero, LIBC_NAMESPACE::tanf16(neg_zero));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::tanf16(inf));
+  EXPECT_MATH_ERRNO(EDOM);
+
+  EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::tanf16(neg_inf));
+  EXPECT_MATH_ERRNO(EDOM);
+}
diff --git a/libc/test/src/math/tanf16_test.cpp b/libc/test/src/math/tanf16_test.cpp
new file mode 100644
index 0000000000000..f2e874182efc1
--- /dev/null
+++ b/libc/test/src/math/tanf16_test.cpp
@@ -0,0 +1,40 @@
+//===-- Exhaustive test for tanf16 ----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/tanf16.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+#include "utils/MPFRWrapper/MPFRUtils.h"
+
+using LlvmLibcTanf16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
+
+namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
+
+// Range: [0, Inf]
+static constexpr uint16_t POS_START = 0x0000U;
+static constexpr uint16_t POS_STOP = 0x7c00U;
+
+// Range: [-Inf, 0]
+static constexpr uint16_t NEG_START = 0x8000U;
+static constexpr uint16_t NEG_STOP = 0xfc00U;
+
+TEST_F(LlvmLibcTanf16Test, PositiveRange) {
+  for (uint16_t v = POS_START; v <= POS_STOP; ++v) {
+    float16 x = FPBits(v).get_val();
+    EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Tan, x,
+                                   LIBC_NAMESPACE::tanf16(x), 0.5);
+  }
+}
+
+TEST_F(LlvmLibcTanf16Test, NegativeRange) {
+  for (uint16_t v = NEG_START; v <= NEG_STOP; ++v) {
+    float16 x = FPBits(v).get_val();
+    EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Tan, x,
+                                   LIBC_NAMESPACE::tanf16(x), 0.5);
+  }
+}
From e5c7cfa2feadd65b7a11b5433c21b2272b5f7b16 Mon Sep 17 00:00:00 2001
From: Akshat Oke
Date: Mon, 13 Jan 2025 10:38:24 +0530
Subject: [PATCH 011/102] [CodeGen][NewPM] Use proper NPM AtomicExpandPass in
 AMDGPU (#122086)

`PassRegistry.def` already has this entry, but the dummy definition was
being pulled instead. I couldn't reproduce the build failures that the
FIXME referenced; maybe the dummy pass getting in the way was part of
the cause.
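For context, here is a self-contained sketch of the `.def` X-macro pattern
involved (illustrative only; the real LLVM macros also take a pass-creation
argument). Every entry kind has a no-op fallback, so a stale
`DUMMY_FUNCTION_PASS` line compiles silently, and a client that does expand
the DUMMY_* kinds ends up pulling the placeholder instead of the real pass:

```
#include <cstdio>

// A *.def-style entry list: the real entry plus the stale dummy entry
// that this patch removes.
#define PASS_ENTRIES                                                           \
  FUNCTION_PASS("atomic-expand")                                               \
  DUMMY_FUNCTION_PASS("atomic-expand")

// An includer that only cares about real passes defines FUNCTION_PASS and
// relies on the no-op fallback for the DUMMY_* kind.
#ifndef DUMMY_FUNCTION_PASS
#define DUMMY_FUNCTION_PASS(NAME) /* expands to nothing */
#endif
#define FUNCTION_PASS(NAME) std::printf("registered %s\n", NAME);

int main() { PASS_ENTRIES }
```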
--- llvm/include/llvm/Passes/MachinePassRegistry.def | 1 - llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/llvm/include/llvm/Passes/MachinePassRegistry.def b/llvm/include/llvm/Passes/MachinePassRegistry.def index 29763995e8b51..8a43197d2d45e 100644 --- a/llvm/include/llvm/Passes/MachinePassRegistry.def +++ b/llvm/include/llvm/Passes/MachinePassRegistry.def @@ -194,7 +194,6 @@ MACHINE_FUNCTION_PASS_WITH_PARAMS( #ifndef DUMMY_FUNCTION_PASS #define DUMMY_FUNCTION_PASS(NAME, PASS_NAME) #endif -DUMMY_FUNCTION_PASS("atomic-expand", AtomicExpandPass) #undef DUMMY_FUNCTION_PASS #ifndef DUMMY_MACHINE_MODULE_PASS diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 6058f9709c38c..f8b60630bb7f6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -49,6 +49,7 @@ #include "llvm/Analysis/CGSCCPassManager.h" #include "llvm/Analysis/CallGraphSCCPass.h" #include "llvm/Analysis/UniformityAnalysis.h" +#include "llvm/CodeGen/AtomicExpand.h" #include "llvm/CodeGen/DeadMachineInstructionElim.h" #include "llvm/CodeGen/GlobalISel/CSEInfo.h" #include "llvm/CodeGen/GlobalISel/IRTranslator.h" @@ -1957,8 +1958,7 @@ void AMDGPUCodeGenPassBuilder::addIRPasses(AddIRPass &addPass) const { (AMDGPUAtomicOptimizerStrategy != ScanOptions::None)) addPass(AMDGPUAtomicOptimizerPass(TM, AMDGPUAtomicOptimizerStrategy)); - // FIXME: Adding atomic-expand manages to break -passes=atomic-expand - // addPass(AtomicExpandPass(TM)); + addPass(AtomicExpandPass(&TM)); if (TM.getOptLevel() > CodeGenOptLevel::None) { addPass(AMDGPUPromoteAllocaPass(TM)); From f70d78204426233befcbedb07e8bb962f543ab20 Mon Sep 17 00:00:00 2001 From: Shourya Goel Date: Mon, 13 Jan 2025 11:23:36 +0530 Subject: [PATCH 012/102] [libc][complex] fix compiler support matrix for cfloat128 (#122593) Before this patch, [godbolt](https://godbolt.org/z/6PPsvv9qd) failed to compile `cfloat128` with `-ffreestanding` but with the patch, the compilation succeeds, [godbolt](https://godbolt.org/z/4M8zzejss). Fixes: #122500 cc: @nickdesaulniers --- libc/include/llvm-libc-types/cfloat128.h | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/libc/include/llvm-libc-types/cfloat128.h b/libc/include/llvm-libc-types/cfloat128.h index f76a0c1c2f5af..83fad87910137 100644 --- a/libc/include/llvm-libc-types/cfloat128.h +++ b/libc/include/llvm-libc-types/cfloat128.h @@ -18,22 +18,24 @@ // // TODO: Update the complex variant of C23 `_Float128` type detection again when // clang supports it. -#if defined(__STDC_IEC_60559_COMPLEX__) && !defined(__clang__) -#if !defined(__cplusplus) -#define LIBC_TYPES_HAS_CFLOAT128 -typedef _Complex _Float128 cfloat128; -#elif defined(__GNUC__) && __GNUC__ >= 13 -#define LIBC_TYPES_HAS_CFLOAT128 -typedef _Complex _Float128 cfloat128; -#endif -#elif __clang_major__ >= 11 && \ +#ifdef __clang__ +#if (__clang_major__ >= 11) && \ (defined(__FLOAT128__) || defined(__SIZEOF_FLOAT128__)) // Use _Complex __float128 type. 
clang uses __SIZEOF_FLOAT128__ or __FLOAT128__
 // macro to notify the availability of __float128 type:
 // https://reviews.llvm.org/D15120
 #define LIBC_TYPES_HAS_CFLOAT128
 typedef _Complex __float128 cfloat128;
-#elif (LDBL_MANT_DIG == 113)
+#endif
+#elif defined(__GNUC__)
+#if (defined(__STDC_IEC_60559_COMPLEX__) || defined(__SIZEOF_FLOAT128__)) && \
+    (__GNUC__ >= 13 || (!defined(__cplusplus)))
+#define LIBC_TYPES_HAS_CFLOAT128
+typedef _Complex _Float128 cfloat128;
+#endif
+#endif
+
+#if !defined(LIBC_TYPES_HAS_CFLOAT128) && (LDBL_MANT_DIG == 113)
 #define LIBC_TYPES_HAS_CFLOAT128
 #define LIBC_TYPES_CFLOAT128_IS_COMPLEX_LONG_DOUBLE
 typedef _Complex long double cfloat128;
From 6f6306cf7115394625c749fbfeb2c6015915da16 Mon Sep 17 00:00:00 2001
From: CHANDRA GHALE
Date: Mon, 13 Jan 2025 11:42:13 +0530
Subject: [PATCH 013/102] [OpenMP] codegen support for masked combined
 construct masked taskloop (#121914)

Added codegen support for the combined masked construct
`masked taskloop`.
Added implementation for `EmitOMPMaskedTaskLoopDirective`.

---------

Co-authored-by: Chandra Ghale
---
 clang/lib/CodeGen/CGStmt.cpp                |  2 +-
 clang/lib/CodeGen/CGStmtOpenMP.cpp          | 12 +++++
 clang/lib/CodeGen/CodeGenFunction.h         |  1 +
 clang/test/OpenMP/masked_taskloop_codegen.c | 50 +++++++++++++++++++++
 4 files changed, 64 insertions(+), 1 deletion(-)
 create mode 100644 clang/test/OpenMP/masked_taskloop_codegen.c

diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp
index ee10e586d9250..f9258a396b7d0 100644
--- a/clang/lib/CodeGen/CGStmt.cpp
+++ b/clang/lib/CodeGen/CGStmt.cpp
@@ -332,7 +332,7 @@ void CodeGenFunction::EmitStmt(const Stmt *S, ArrayRef<const Attr *> Attrs) {
     EmitOMPMasterTaskLoopDirective(cast<OMPMasterTaskLoopDirective>(*S));
     break;
   case Stmt::OMPMaskedTaskLoopDirectiveClass:
-    llvm_unreachable("masked taskloop directive not supported yet.");
+    EmitOMPMaskedTaskLoopDirective(cast<OMPMaskedTaskLoopDirective>(*S));
     break;
   case Stmt::OMPMasterTaskLoopSimdDirectiveClass:
     EmitOMPMasterTaskLoopSimdDirective(
diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp
index 94daf059edba0..2b4ca65e169a6 100644
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -7982,6 +7982,18 @@ void CodeGenFunction::EmitOMPMasterTaskLoopDirective(
   CGM.getOpenMPRuntime().emitMasterRegion(*this, CodeGen, S.getBeginLoc());
 }
 
+void CodeGenFunction::EmitOMPMaskedTaskLoopDirective(
+    const OMPMaskedTaskLoopDirective &S) {
+  auto &&CodeGen = [this, &S](CodeGenFunction &CGF, PrePostActionTy &Action) {
+    Action.Enter(CGF);
+    EmitOMPTaskLoopBasedDirective(S);
+  };
+  auto LPCRegion =
+      CGOpenMPRuntime::LastprivateConditionalRAII::disable(*this, S);
+  OMPLexicalScope Scope(*this, S, std::nullopt, /*EmitPreInitStmt=*/false);
+  CGM.getOpenMPRuntime().emitMaskedRegion(*this, CodeGen, S.getBeginLoc());
+}
+
 void CodeGenFunction::EmitOMPMasterTaskLoopSimdDirective(
     const OMPMasterTaskLoopSimdDirective &S) {
   auto &&CodeGen = [this, &S](CodeGenFunction &CGF, PrePostActionTy &Action) {
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index 86328db345508..311f2ae94d046 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -3870,6 +3870,7 @@ class CodeGenFunction : public CodeGenTypeCache {
   void EmitOMPTaskLoopDirective(const OMPTaskLoopDirective &S);
   void EmitOMPTaskLoopSimdDirective(const OMPTaskLoopSimdDirective &S);
   void EmitOMPMasterTaskLoopDirective(const OMPMasterTaskLoopDirective &S);
+  void EmitOMPMaskedTaskLoopDirective(const OMPMaskedTaskLoopDirective &S);
   void
EmitOMPMasterTaskLoopSimdDirective(const OMPMasterTaskLoopSimdDirective &S); void diff --git a/clang/test/OpenMP/masked_taskloop_codegen.c b/clang/test/OpenMP/masked_taskloop_codegen.c new file mode 100644 index 0000000000000..26f54c1797bbe --- /dev/null +++ b/clang/test/OpenMP/masked_taskloop_codegen.c @@ -0,0 +1,50 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --prefix-filecheck-ir-name _ --version 5 +// RUN: %clang_cc1 -triple x86_64-unknown-unknown -fopenmp -fopenmp-version=52 -x c -emit-llvm %s -o - | FileCheck %s +// expected-no-diagnostics +#define N 100 +void masked_taskloop(){ + #pragma omp masked taskloop + for( int i = 0; i < N; i++) + ; + +} + +int main() +{ + masked_taskloop(); +} +// CHECK-LABEL: define dso_local void @masked_taskloop( +// CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[AGG_CAPTURED:%.*]] = alloca [[STRUCT_ANON:%.*]], align 1 +// CHECK-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1:[0-9]+]]) +// CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_masked(ptr @[[GLOB1]], i32 [[TMP0]], i32 0) +// CHECK-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0 +// CHECK-NEXT: br i1 [[TMP2]], label %[[OMP_IF_THEN:.*]], label %[[OMP_IF_END:.*]] +// CHECK: [[OMP_IF_THEN]]: +// CHECK-NEXT: call void @__kmpc_taskgroup(ptr @[[GLOB1]], i32 [[TMP0]]) +// CHECK-NEXT: [[TMP3:%.*]] = call ptr @__kmpc_omp_task_alloc(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i64 80, i64 0, ptr @.omp_task_entry.) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], ptr [[TMP3]], i32 0, i32 0 +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T:%.*]], ptr [[TMP4]], i32 0, i32 5 +// CHECK-NEXT: store i64 0, ptr [[TMP5]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T]], ptr [[TMP4]], i32 0, i32 6 +// CHECK-NEXT: store i64 99, ptr [[TMP6]], align 8 +// CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T]], ptr [[TMP4]], i32 0, i32 7 +// CHECK-NEXT: store i64 1, ptr [[TMP7]], align 8 +// CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T]], ptr [[TMP4]], i32 0, i32 9 +// CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP8]], i8 0, i64 8, i1 false) +// CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8 +// CHECK-NEXT: call void @__kmpc_taskloop(ptr @[[GLOB1]], i32 [[TMP0]], ptr [[TMP3]], i32 1, ptr [[TMP5]], ptr [[TMP6]], i64 [[TMP9]], i32 1, i32 0, i64 0, ptr null) +// CHECK-NEXT: call void @__kmpc_end_taskgroup(ptr @[[GLOB1]], i32 [[TMP0]]) +// CHECK-NEXT: call void @__kmpc_end_masked(ptr @[[GLOB1]], i32 [[TMP0]]) +// CHECK-NEXT: br label %[[OMP_IF_END]] +// CHECK: [[OMP_IF_END]]: +// CHECK-NEXT: ret void +// +// CHECK-LABEL: define dso_local i32 @main( +// CHECK-SAME: ) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: call void @masked_taskloop() +// CHECK-NEXT: ret i32 0 + From 19df4e0be8c353cb43b3ee7b61810ea5626ea61b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Fournier?= Date: Mon, 13 Jan 2025 08:21:07 +0100 Subject: [PATCH 014/102] [mlir][linalg] Fix neutral elt for softmax (#118952) The decomposition of `linalg.softmax` uses `maxnumf`, but the identity element that is used in the generated code is the one for `maximumf`. They are not the same, as the identity for `maxnumf` is `NaN`, while the one of `maximumf` is `-Infty`. 
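A standalone check of the two identities (an illustrative sketch; C's
`fmaxf` follows the same IEEE-754 maxNum semantics as `maxnumf`):

```
#include <cassert>
#include <cmath>

int main() {
  float qnan = std::nanf("");
  // maxnum semantics: a quiet NaN is treated as missing data, so NaN is
  // the neutral element of a maxnumf reduction.
  assert(std::fmaxf(qnan, 2.0f) == 2.0f);
  assert(std::fmaxf(-3.0f, qnan) == -3.0f);
  // maximum semantics would propagate the NaN instead; its neutral
  // element is -infinity, the constant the old decomposition emitted.
  return 0;
}
```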
This is wrong and prevents the maxnumf from being folded.

Related to #114595, which fixed the folder for maxnumf.

---
 mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp             | 2 +-
 mlir/test/Dialect/Linalg/transform-op-decompose.mlir | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
index 8973e87c063b3..c13b663dbf05b 100644
--- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
+++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
@@ -2890,7 +2890,7 @@ FailureOr<SmallVector<Value>> SoftmaxOp::decomposeOperation(OpBuilder &b) {
   dims.erase(dims.begin() + reductionDim);
   // Step 1: Compute max along dim.
   Value outputReduce = b.create<tensor::EmptyOp>(loc, dims, elementType);
-  Value neutralForMaxF = arith::getIdentityValue(arith::AtomicRMWKind::maximumf,
+  Value neutralForMaxF = arith::getIdentityValue(arith::AtomicRMWKind::maxnumf,
                                                  elementType, b, loc,
                                                  /*useOnlyFiniteValue=*/true);
   Value neutralForMaxFInit =
diff --git a/mlir/test/Dialect/Linalg/transform-op-decompose.mlir b/mlir/test/Dialect/Linalg/transform-op-decompose.mlir
index 2e211d2fa7dbe..72acf43361f50 100644
--- a/mlir/test/Dialect/Linalg/transform-op-decompose.mlir
+++ b/mlir/test/Dialect/Linalg/transform-op-decompose.mlir
@@ -210,7 +210,7 @@ func.func @softmax(%arg0: tensor<2x16x32xf32>, %dst: tensor<2x16x32xf32>) -> ten
 // CHECK-LABEL: func.func @softmax(
 // CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: tensor<2x16x32xf32>, %[[DST:[a-zA-Z0-9_]+]]: tensor<2x16x32xf32>) -> tensor<2x16x32xf32> {
 // CHECK-DAG: %[[D1:.+]] = tensor.empty() : tensor<2x16xf32>
-// CHECK-DAG: %[[CST:.+]] = arith.constant -3.40282347E+38 : f32
+// CHECK-DAG: %[[CST:.+]] = arith.constant 0xFFC00000 : f32
 // CHECK: %[[D2:.+]] = linalg.fill ins(%[[CST]] : f32) outs(%[[D1]] : tensor<2x16xf32>) -> tensor<2x16xf32>
 // CHECK: %[[D3:.+]] = linalg.generic {indexing_maps = [#[[$MAP]], #[[$MAP1]]], iterator_types = ["parallel",
 // CHECK-SAME: "parallel", "reduction"]} ins(%[[ARG0]] : tensor<2x16x32xf32>) outs(%[[D2]] : tensor<2x16xf32>) {
From 928007386801756c4d60f119fab529edec56444b Mon Sep 17 00:00:00 2001
From: Kazu Hirata
Date: Sun, 12 Jan 2025 23:50:58 -0800
Subject: [PATCH 015/102] Partially revert "[TableGen] Avoid repeated hash
 lookups (NFC) (#122586)"

This partially reverts commit 07ff786e39e2190449998d3af1000454dee501be.

The hunk being reverted in this patch seems to break:

  tools/llvm-gsymutil/ARM_AArch64/macho-merged-funcs-dwarf.yaml

under LLVM_ENABLE_EXPENSIVE_CHECKS.
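For reference, the two idioms side by side, reduced to standard containers
(an illustrative sketch; the real code uses LLVM's DenseMap and
STIPredicateFunction). They compute the same map here, which is what makes
the breakage under expensive checks surprising:

```
#include <cassert>
#include <map>
#include <vector>

int main() {
  std::map<int, unsigned> Decl2Index;
  std::vector<int> STIPredicates;
  int Decl = 42;

  // Reverted form: insert the would-be index eagerly, act only if new.
  auto [It, Inserted] = Decl2Index.try_emplace(Decl, STIPredicates.size());
  if (Inserted)
    STIPredicates.push_back(Decl);

  // Restored form: probe first, then insert on a miss.
  auto J = Decl2Index.find(Decl);
  if (J == Decl2Index.end()) {
    Decl2Index[Decl] = STIPredicates.size();
    STIPredicates.push_back(Decl);
  }

  assert(Decl2Index.size() == 1 && STIPredicates.size() == 1);
  return 0;
}
```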
--- llvm/utils/TableGen/Common/CodeGenSchedule.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/utils/TableGen/Common/CodeGenSchedule.cpp b/llvm/utils/TableGen/Common/CodeGenSchedule.cpp index 7f4230affca09..1fe322c88bb0f 100644 --- a/llvm/utils/TableGen/Common/CodeGenSchedule.cpp +++ b/llvm/utils/TableGen/Common/CodeGenSchedule.cpp @@ -415,9 +415,9 @@ void CodeGenSchedModels::collectSTIPredicates() { for (const Record *R : Records.getAllDerivedDefinitions("STIPredicate")) { const Record *Decl = R->getValueAsDef("Declaration"); - const auto [It, Inserted] = - Decl2Index.try_emplace(Decl, STIPredicates.size()); - if (Inserted) { + const auto It = Decl2Index.find(Decl); + if (It == Decl2Index.end()) { + Decl2Index[Decl] = STIPredicates.size(); STIPredicateFunction Predicate(Decl); Predicate.addDefinition(R); STIPredicates.emplace_back(std::move(Predicate)); From 386c2f575ec3684d7c04a910e9fac197e31c96ed Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Mon, 13 Jan 2025 16:11:31 +0800 Subject: [PATCH 016/102] [SLPVectorizer] Refactor HorizontalReduction::createOp (NFC) (#121549) This patch simplifies select-based integer min/max reductions by utilizing `llvm::getMinMaxReductionPredicate`, and generates intrinsic-based min/max reductions by utilizing `llvm::getMinMaxReductionIntrinsicOp`. --- .../Transforms/Vectorize/SLPVectorizer.cpp | 35 ++++++------------- 1 file changed, 10 insertions(+), 25 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 8a6fbd808de35..e3487b5015342 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -19431,38 +19431,23 @@ class HorizontalReduction { return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS, Name); } - case RecurKind::FMax: - return Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, LHS, RHS); - case RecurKind::FMin: - return Builder.CreateBinaryIntrinsic(Intrinsic::minnum, LHS, RHS); - case RecurKind::FMaximum: - return Builder.CreateBinaryIntrinsic(Intrinsic::maximum, LHS, RHS); - case RecurKind::FMinimum: - return Builder.CreateBinaryIntrinsic(Intrinsic::minimum, LHS, RHS); case RecurKind::SMax: - if (UseSelect) { - Value *Cmp = Builder.CreateICmpSGT(LHS, RHS, Name); - return Builder.CreateSelect(Cmp, LHS, RHS, Name); - } - return Builder.CreateBinaryIntrinsic(Intrinsic::smax, LHS, RHS); case RecurKind::SMin: - if (UseSelect) { - Value *Cmp = Builder.CreateICmpSLT(LHS, RHS, Name); - return Builder.CreateSelect(Cmp, LHS, RHS, Name); - } - return Builder.CreateBinaryIntrinsic(Intrinsic::smin, LHS, RHS); case RecurKind::UMax: - if (UseSelect) { - Value *Cmp = Builder.CreateICmpUGT(LHS, RHS, Name); - return Builder.CreateSelect(Cmp, LHS, RHS, Name); - } - return Builder.CreateBinaryIntrinsic(Intrinsic::umax, LHS, RHS); case RecurKind::UMin: if (UseSelect) { - Value *Cmp = Builder.CreateICmpULT(LHS, RHS, Name); + CmpInst::Predicate Pred = llvm::getMinMaxReductionPredicate(Kind); + Value *Cmp = Builder.CreateICmp(Pred, LHS, RHS, Name); return Builder.CreateSelect(Cmp, LHS, RHS, Name); } - return Builder.CreateBinaryIntrinsic(Intrinsic::umin, LHS, RHS); + [[fallthrough]]; + case RecurKind::FMax: + case RecurKind::FMin: + case RecurKind::FMaximum: + case RecurKind::FMinimum: { + Intrinsic::ID Id = llvm::getMinMaxReductionIntrinsicOp(Kind); + return Builder.CreateBinaryIntrinsic(Id, LHS, RHS); + } default: llvm_unreachable("Unknown reduction operation."); } From 
6ac1259b95da200007b09434bdf5c16a10a05cf5 Mon Sep 17 00:00:00 2001 From: Akshat Oke Date: Mon, 13 Jan 2025 14:14:13 +0530 Subject: [PATCH 017/102] Reapply "Spiller: Detach legacy pass and supply analyses instead (#119181)" (#122665) Makes Inline Spiller amenable to the new PM. This reapplies commit a531800344dc54e9c197a13b22e013f919f3f5e1 reverted because of two unused private members reported on sanitizer bots. --- llvm/include/llvm/CodeGen/Spiller.h | 16 ++++++++++-- llvm/lib/CodeGen/InlineSpiller.cpp | 40 +++++++++++------------------ llvm/lib/CodeGen/RegAllocBasic.cpp | 16 ++++++++---- llvm/lib/CodeGen/RegAllocGreedy.cpp | 4 ++- llvm/lib/CodeGen/RegAllocPBQP.cpp | 5 +++- 5 files changed, 47 insertions(+), 34 deletions(-) diff --git a/llvm/include/llvm/CodeGen/Spiller.h b/llvm/include/llvm/CodeGen/Spiller.h index 51ad36bc6b1f8..3132cefeb6c68 100644 --- a/llvm/include/llvm/CodeGen/Spiller.h +++ b/llvm/include/llvm/CodeGen/Spiller.h @@ -19,6 +19,10 @@ class MachineFunction; class MachineFunctionPass; class VirtRegMap; class VirtRegAuxInfo; +class LiveIntervals; +class LiveStacks; +class MachineDominatorTree; +class MachineBlockFrequencyInfo; /// Spiller interface. /// @@ -41,12 +45,20 @@ class Spiller { virtual ArrayRef getReplacedRegs() = 0; virtual void postOptimization() {} + + struct RequiredAnalyses { + LiveIntervals &LIS; + LiveStacks &LSS; + MachineDominatorTree &MDT; + const MachineBlockFrequencyInfo &MBFI; + }; }; /// Create and return a spiller that will insert spill code directly instead /// of deferring though VirtRegMap. -Spiller *createInlineSpiller(MachineFunctionPass &Pass, MachineFunction &MF, - VirtRegMap &VRM, VirtRegAuxInfo &VRAI); +Spiller *createInlineSpiller(const Spiller::RequiredAnalyses &Analyses, + MachineFunction &MF, VirtRegMap &VRM, + VirtRegAuxInfo &VRAI); } // end namespace llvm diff --git a/llvm/lib/CodeGen/InlineSpiller.cpp b/llvm/lib/CodeGen/InlineSpiller.cpp index 64f290f5930a1..f6681540e2286 100644 --- a/llvm/lib/CodeGen/InlineSpiller.cpp +++ b/llvm/lib/CodeGen/InlineSpiller.cpp @@ -75,7 +75,6 @@ RestrictStatepointRemat("restrict-statepoint-remat", cl::desc("Restrict remat for statepoint operands")); namespace { - class HoistSpillHelper : private LiveRangeEdit::Delegate { MachineFunction &MF; LiveIntervals &LIS; @@ -128,15 +127,11 @@ class HoistSpillHelper : private LiveRangeEdit::Delegate { DenseMap &SpillsToIns); public: - HoistSpillHelper(MachineFunctionPass &pass, MachineFunction &mf, - VirtRegMap &vrm) - : MF(mf), LIS(pass.getAnalysis().getLIS()), - LSS(pass.getAnalysis().getLS()), - MDT(pass.getAnalysis().getDomTree()), + HoistSpillHelper(const Spiller::RequiredAnalyses &Analyses, + MachineFunction &mf, VirtRegMap &vrm) + : MF(mf), LIS(Analyses.LIS), LSS(Analyses.LSS), MDT(Analyses.MDT), VRM(vrm), MRI(mf.getRegInfo()), TII(*mf.getSubtarget().getInstrInfo()), - TRI(*mf.getSubtarget().getRegisterInfo()), - MBFI( - pass.getAnalysis().getMBFI()), + TRI(*mf.getSubtarget().getRegisterInfo()), MBFI(Analyses.MBFI), IPA(LIS, mf.getNumBlockIDs()) {} void addToMergeableSpills(MachineInstr &Spill, int StackSlot, @@ -150,12 +145,10 @@ class InlineSpiller : public Spiller { MachineFunction &MF; LiveIntervals &LIS; LiveStacks &LSS; - MachineDominatorTree &MDT; VirtRegMap &VRM; MachineRegisterInfo &MRI; const TargetInstrInfo &TII; const TargetRegisterInfo &TRI; - const MachineBlockFrequencyInfo &MBFI; // Variables that are valid during spill(), but used by multiple methods. 
LiveRangeEdit *Edit = nullptr; @@ -190,16 +183,12 @@ class InlineSpiller : public Spiller { ~InlineSpiller() override = default; public: - InlineSpiller(MachineFunctionPass &Pass, MachineFunction &MF, VirtRegMap &VRM, - VirtRegAuxInfo &VRAI) - : MF(MF), LIS(Pass.getAnalysis().getLIS()), - LSS(Pass.getAnalysis().getLS()), - MDT(Pass.getAnalysis().getDomTree()), - VRM(VRM), MRI(MF.getRegInfo()), TII(*MF.getSubtarget().getInstrInfo()), - TRI(*MF.getSubtarget().getRegisterInfo()), - MBFI( - Pass.getAnalysis().getMBFI()), - HSpiller(Pass, MF, VRM), VRAI(VRAI) {} + InlineSpiller(const Spiller::RequiredAnalyses &Analyses, MachineFunction &MF, + VirtRegMap &VRM, VirtRegAuxInfo &VRAI) + : MF(MF), LIS(Analyses.LIS), LSS(Analyses.LSS), VRM(VRM), + MRI(MF.getRegInfo()), TII(*MF.getSubtarget().getInstrInfo()), + TRI(*MF.getSubtarget().getRegisterInfo()), HSpiller(Analyses, MF, VRM), + VRAI(VRAI) {} void spill(LiveRangeEdit &) override; ArrayRef getSpilledRegs() override { return RegsToSpill; } @@ -237,10 +226,11 @@ Spiller::~Spiller() = default; void Spiller::anchor() {} -Spiller *llvm::createInlineSpiller(MachineFunctionPass &Pass, - MachineFunction &MF, VirtRegMap &VRM, - VirtRegAuxInfo &VRAI) { - return new InlineSpiller(Pass, MF, VRM, VRAI); +Spiller * +llvm::createInlineSpiller(const InlineSpiller::RequiredAnalyses &Analyses, + MachineFunction &MF, VirtRegMap &VRM, + VirtRegAuxInfo &VRAI) { + return new InlineSpiller(Analyses, MF, VRM, VRAI); } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/CodeGen/RegAllocBasic.cpp b/llvm/lib/CodeGen/RegAllocBasic.cpp index c05aa1e40e477..f3f34f890be11 100644 --- a/llvm/lib/CodeGen/RegAllocBasic.cpp +++ b/llvm/lib/CodeGen/RegAllocBasic.cpp @@ -22,6 +22,7 @@ #include "llvm/CodeGen/LiveRegMatrix.h" #include "llvm/CodeGen/LiveStacks.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" +#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/Passes.h" @@ -187,6 +188,7 @@ void RABasic::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); AU.addRequired(); AU.addPreserved(); + AU.addRequired(); AU.addRequiredID(MachineDominatorsID); AU.addPreservedID(MachineDominatorsID); AU.addRequired(); @@ -310,16 +312,20 @@ bool RABasic::runOnMachineFunction(MachineFunction &mf) { << "********** Function: " << mf.getName() << '\n'); MF = &mf; + auto &MBFI = getAnalysis().getMBFI(); + auto &LiveStks = getAnalysis().getLS(); + auto &MDT = getAnalysis().getDomTree(); + RegAllocBase::init(getAnalysis().getVRM(), getAnalysis().getLIS(), getAnalysis().getLRM()); - VirtRegAuxInfo VRAI( - *MF, *LIS, *VRM, getAnalysis().getLI(), - getAnalysis().getMBFI(), - &getAnalysis().getPSI()); + VirtRegAuxInfo VRAI(*MF, *LIS, *VRM, + getAnalysis().getLI(), MBFI, + &getAnalysis().getPSI()); VRAI.calculateSpillWeightsAndHints(); - SpillerInstance.reset(createInlineSpiller(*this, *MF, *VRM, VRAI)); + SpillerInstance.reset( + createInlineSpiller({*LIS, LiveStks, MDT, MBFI}, *MF, *VRM, VRAI)); allocatePhysRegs(); postOptimization(); diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp index b94992c20b119..66e9cf546b837 100644 --- a/llvm/lib/CodeGen/RegAllocGreedy.cpp +++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp @@ -2750,6 +2750,7 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) { Bundles = &getAnalysis().getEdgeBundles(); SpillPlacer = &getAnalysis().getResult(); DebugVars = &getAnalysis().getLDV(); + 
auto &LSS = getAnalysis().getLS(); initializeCSRCost(); @@ -2770,7 +2771,8 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) { getAnalysis().getAdvisor(*MF, *this); VRAI = std::make_unique(*MF, *LIS, *VRM, *Loops, *MBFI); - SpillerInstance.reset(createInlineSpiller(*this, *MF, *VRM, *VRAI)); + SpillerInstance.reset( + createInlineSpiller({*LIS, LSS, *DomTree, *MBFI}, *MF, *VRM, *VRAI)); VRAI->calculateSpillWeightsAndHints(); diff --git a/llvm/lib/CodeGen/RegAllocPBQP.cpp b/llvm/lib/CodeGen/RegAllocPBQP.cpp index 696c312e4ba00..e230a1be95c9f 100644 --- a/llvm/lib/CodeGen/RegAllocPBQP.cpp +++ b/llvm/lib/CodeGen/RegAllocPBQP.cpp @@ -794,6 +794,9 @@ bool RegAllocPBQP::runOnMachineFunction(MachineFunction &MF) { MachineBlockFrequencyInfo &MBFI = getAnalysis().getMBFI(); + auto &LiveStks = getAnalysis().getLS(); + auto &MDT = getAnalysis().getDomTree(); + VirtRegMap &VRM = getAnalysis().getVRM(); PBQPVirtRegAuxInfo VRAI( @@ -807,7 +810,7 @@ bool RegAllocPBQP::runOnMachineFunction(MachineFunction &MF) { VirtRegAuxInfo DefaultVRAI( MF, LIS, VRM, getAnalysis().getLI(), MBFI); std::unique_ptr VRegSpiller( - createInlineSpiller(*this, MF, VRM, DefaultVRAI)); + createInlineSpiller({LIS, LiveStks, MDT, MBFI}, MF, VRM, DefaultVRAI)); MF.getRegInfo().freezeReservedRegs(); From 84458e7d50f01ddc32fbf9c8216c77c778456ddf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bal=C3=A1zs=20K=C3=A9ri?= Date: Mon, 13 Jan 2025 09:46:45 +0100 Subject: [PATCH 018/102] [clang][ASTImporter] Not using primary context in lookup table (#118466) `ASTImporterLookupTable` did use the `getPrimaryContext` function to get the declaration context of the inserted items. This is problematic because the primary context can change during import of AST items, most likely if a definition of a previously not defined class is imported. (For any record the primary context is the definition if there is one.) The use of primary context is really not important, only for namespaces because these can be re-opened and lookup in one namespace block is not enough. This special search is now moved into ASTImporter instead of relying on the lookup table. --- clang/lib/AST/ASTImporter.cpp | 24 +++- clang/lib/AST/ASTImporterLookupTable.cpp | 20 +-- clang/unittests/AST/ASTImporterTest.cpp | 152 ++++++++++++++++++++++- 3 files changed, 181 insertions(+), 15 deletions(-) diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp index 26d33b0d94795..dec4c7221bc77 100644 --- a/clang/lib/AST/ASTImporter.cpp +++ b/clang/lib/AST/ASTImporter.cpp @@ -3165,6 +3165,7 @@ ExpectedDecl ASTNodeImporter::VisitRecordDecl(RecordDecl *D) { if (Error Err = ImportImplicitMethods(DCXX, FoundCXX)) return std::move(Err); } + // FIXME: We can return FoundDef here. } PrevDecl = FoundRecord->getMostRecentDecl(); break; @@ -9064,9 +9065,26 @@ ASTImporter::findDeclsInToCtx(DeclContext *DC, DeclarationName Name) { // We can diagnose this only if we search in the redecl context. DeclContext *ReDC = DC->getRedeclContext(); if (SharedState->getLookupTable()) { - ASTImporterLookupTable::LookupResult LookupResult = - SharedState->getLookupTable()->lookup(ReDC, Name); - return FoundDeclsTy(LookupResult.begin(), LookupResult.end()); + if (ReDC->isNamespace()) { + // Namespaces can be reopened. + // Lookup table does not handle this, we must search here in all linked + // namespaces. 
+ FoundDeclsTy Result; + SmallVector NSChain = + getCanonicalForwardRedeclChain( + dyn_cast(ReDC)); + for (auto *D : NSChain) { + ASTImporterLookupTable::LookupResult LookupResult = + SharedState->getLookupTable()->lookup(dyn_cast(D), + Name); + Result.append(LookupResult.begin(), LookupResult.end()); + } + return Result; + } else { + ASTImporterLookupTable::LookupResult LookupResult = + SharedState->getLookupTable()->lookup(ReDC, Name); + return FoundDeclsTy(LookupResult.begin(), LookupResult.end()); + } } else { DeclContext::lookup_result NoloadLookupResult = ReDC->noload_lookup(Name); FoundDeclsTy Result(NoloadLookupResult.begin(), NoloadLookupResult.end()); diff --git a/clang/lib/AST/ASTImporterLookupTable.cpp b/clang/lib/AST/ASTImporterLookupTable.cpp index 07d39dcee2583..4ed3198d7ea62 100644 --- a/clang/lib/AST/ASTImporterLookupTable.cpp +++ b/clang/lib/AST/ASTImporterLookupTable.cpp @@ -115,8 +115,9 @@ void ASTImporterLookupTable::remove(DeclContext *DC, NamedDecl *ND) { #ifndef NDEBUG if (!EraseResult) { std::string Message = - llvm::formatv("Trying to remove not contained Decl '{0}' of type {1}", - Name.getAsString(), DC->getDeclKindName()) + llvm::formatv( + "Trying to remove not contained Decl '{0}' of type {1} from a {2}", + Name.getAsString(), ND->getDeclKindName(), DC->getDeclKindName()) .str(); llvm_unreachable(Message.c_str()); } @@ -125,18 +126,18 @@ void ASTImporterLookupTable::remove(DeclContext *DC, NamedDecl *ND) { void ASTImporterLookupTable::add(NamedDecl *ND) { assert(ND); - DeclContext *DC = ND->getDeclContext()->getPrimaryContext(); + DeclContext *DC = ND->getDeclContext(); add(DC, ND); - DeclContext *ReDC = DC->getRedeclContext()->getPrimaryContext(); + DeclContext *ReDC = DC->getRedeclContext(); if (DC != ReDC) add(ReDC, ND); } void ASTImporterLookupTable::remove(NamedDecl *ND) { assert(ND); - DeclContext *DC = ND->getDeclContext()->getPrimaryContext(); + DeclContext *DC = ND->getDeclContext(); remove(DC, ND); - DeclContext *ReDC = DC->getRedeclContext()->getPrimaryContext(); + DeclContext *ReDC = DC->getRedeclContext(); if (DC != ReDC) remove(ReDC, ND); } @@ -161,7 +162,7 @@ void ASTImporterLookupTable::updateForced(NamedDecl *ND, DeclContext *OldDC) { ASTImporterLookupTable::LookupResult ASTImporterLookupTable::lookup(DeclContext *DC, DeclarationName Name) const { - auto DCI = LookupTable.find(DC->getPrimaryContext()); + auto DCI = LookupTable.find(DC); if (DCI == LookupTable.end()) return {}; @@ -178,7 +179,7 @@ bool ASTImporterLookupTable::contains(DeclContext *DC, NamedDecl *ND) const { } void ASTImporterLookupTable::dump(DeclContext *DC) const { - auto DCI = LookupTable.find(DC->getPrimaryContext()); + auto DCI = LookupTable.find(DC); if (DCI == LookupTable.end()) llvm::errs() << "empty\n"; const auto &FoundNameMap = DCI->second; @@ -196,8 +197,7 @@ void ASTImporterLookupTable::dump(DeclContext *DC) const { void ASTImporterLookupTable::dump() const { for (const auto &Entry : LookupTable) { DeclContext *DC = Entry.first; - StringRef Primary = DC->getPrimaryContext() ? 
" primary" : ""; - llvm::errs() << "== DC:" << cast(DC) << Primary << "\n"; + llvm::errs() << "== DC:" << cast(DC) << "\n"; dump(DC); } } diff --git a/clang/unittests/AST/ASTImporterTest.cpp b/clang/unittests/AST/ASTImporterTest.cpp index f2bfde9bed372..a0aaad6082d8c 100644 --- a/clang/unittests/AST/ASTImporterTest.cpp +++ b/clang/unittests/AST/ASTImporterTest.cpp @@ -6052,7 +6052,7 @@ TEST_P(ASTImporterLookupTableTest, EnumConstantDecl) { EXPECT_EQ(*Res.begin(), A); } -TEST_P(ASTImporterLookupTableTest, LookupSearchesInTheWholeRedeclChain) { +TEST_P(ASTImporterLookupTableTest, LookupSearchesInActualNamespaceOnly) { TranslationUnitDecl *ToTU = getToTuDecl( R"( namespace N { @@ -6062,7 +6062,9 @@ TEST_P(ASTImporterLookupTableTest, LookupSearchesInTheWholeRedeclChain) { } )", Lang_CXX03); - auto *N1 = + auto *N1 = FirstDeclMatcher().match( + ToTU, namespaceDecl(hasName("N"))); + auto *N2 = LastDeclMatcher().match(ToTU, namespaceDecl(hasName("N"))); auto *A = FirstDeclMatcher().match(ToTU, varDecl(hasName("A"))); DeclarationName Name = A->getDeclName(); @@ -6071,6 +6073,7 @@ TEST_P(ASTImporterLookupTableTest, LookupSearchesInTheWholeRedeclChain) { auto Res = LT.lookup(N1, Name); ASSERT_EQ(Res.size(), 1u); EXPECT_EQ(*Res.begin(), A); + EXPECT_TRUE(LT.lookup(N2, Name).empty()); } TEST_P(ASTImporterOptionSpecificTestBase, @@ -10170,6 +10173,151 @@ TEST_P(ImportTemplateParmDeclDefaultValue, FromD, FromDInherited); } +TEST_P(ASTImporterOptionSpecificTestBase, ImportIntoReopenedNamespaceNoMatch1) { + const char *ToCode = + R"( + namespace a { + } + namespace a { + struct X { int A; }; + } + )"; + Decl *ToTU = getToTuDecl(ToCode, Lang_CXX11); + const char *Code = + R"( + namespace a { + struct X { char A; }; + } + )"; + Decl *FromTU = getTuDecl(Code, Lang_CXX11); + auto *FromX = FirstDeclMatcher().match( + FromTU, cxxRecordDecl(hasName("X"))); + auto *ImportedX = Import(FromX, Lang_CXX11); + EXPECT_FALSE(ImportedX); +} + +TEST_P(ASTImporterOptionSpecificTestBase, ImportIntoReopenedNamespaceNoMatch2) { + const char *ToCode = + R"( + namespace a { + struct X { int A; }; + } + namespace a { + } + )"; + Decl *ToTU = getToTuDecl(ToCode, Lang_CXX11); + const char *Code = + R"( + namespace a { + struct X { char A; }; + } + )"; + Decl *FromTU = getTuDecl(Code, Lang_CXX11); + auto *FromX = FirstDeclMatcher().match( + FromTU, cxxRecordDecl(hasName("X"))); + auto *ImportedX = Import(FromX, Lang_CXX11); + EXPECT_FALSE(ImportedX); +} + +TEST_P(ASTImporterOptionSpecificTestBase, ImportIntoReopenedNamespaceMatch1) { + const char *ToCode = + R"( + namespace a { + } + namespace a { + struct X { int A; }; + } + )"; + Decl *ToTU = getToTuDecl(ToCode, Lang_CXX11); + const char *Code = + R"( + namespace a { + struct X { int A; }; + } + )"; + Decl *FromTU = getTuDecl(Code, Lang_CXX11); + auto *FromX = FirstDeclMatcher().match( + FromTU, cxxRecordDecl(hasName("X"))); + auto *ToX = FirstDeclMatcher().match( + ToTU, cxxRecordDecl(hasName("X"))); + auto *ImportedX = Import(FromX, Lang_CXX11); + EXPECT_EQ(ImportedX, ToX); +} + +TEST_P(ASTImporterOptionSpecificTestBase, ImportIntoReopenedNamespaceMatch2) { + const char *ToCode = + R"( + namespace a { + struct X { int A; }; + } + namespace a { + } + )"; + Decl *ToTU = getToTuDecl(ToCode, Lang_CXX11); + const char *Code = + R"( + namespace a { + struct X { int A; }; + } + )"; + Decl *FromTU = getTuDecl(Code, Lang_CXX11); + auto *FromX = FirstDeclMatcher().match( + FromTU, cxxRecordDecl(hasName("X"))); + auto *ToX = FirstDeclMatcher().match( + ToTU, cxxRecordDecl(hasName("X"))); 
+ auto *ImportedX = Import(FromX, Lang_CXX11); + EXPECT_EQ(ImportedX, ToX); +} + +TEST_P(ASTImporterLookupTableTest, PrimaryDCChangeAtImport) { + const char *ToCode = + R"( + template + struct X; + )"; + Decl *ToTU = getToTuDecl(ToCode, Lang_CXX11); + auto *ToX = FirstDeclMatcher().match( + ToTU, classTemplateDecl(hasName("X"))); + NamedDecl *ToParm = ToX->getTemplateParameters()->getParam(0); + DeclContext *OldPrimaryDC = ToX->getTemplatedDecl()->getPrimaryContext(); + ASSERT_EQ(ToParm->getDeclContext(), ToX->getTemplatedDecl()); + ASSERT_EQ(SharedStatePtr->getLookupTable() + ->lookup(ToX->getTemplatedDecl(), ToParm->getDeclName()) + .size(), + 1u); + ASSERT_TRUE(SharedStatePtr->getLookupTable()->contains( + ToX->getTemplatedDecl(), ToParm)); + + const char *Code = + R"( + template + struct X; + template + struct X {}; + )"; + Decl *FromTU = getTuDecl(Code, Lang_CXX11); + auto *FromX = LastDeclMatcher().match( + FromTU, classTemplateDecl(hasName("X"))); + + auto *ImportedX = Import(FromX, Lang_CXX11); + + EXPECT_TRUE(ImportedX); + EXPECT_EQ(ImportedX->getTemplateParameters()->getParam(0)->getDeclContext(), + ImportedX->getTemplatedDecl()); + + // ToX did not change at the import. + // Verify that primary context has changed after import of class definition. + DeclContext *NewPrimaryDC = ToX->getTemplatedDecl()->getPrimaryContext(); + EXPECT_NE(OldPrimaryDC, NewPrimaryDC); + // The lookup table should not be different than it was before. + EXPECT_EQ(SharedStatePtr->getLookupTable() + ->lookup(ToX->getTemplatedDecl(), ToParm->getDeclName()) + .size(), + 1u); + EXPECT_TRUE(SharedStatePtr->getLookupTable()->contains( + ToX->getTemplatedDecl(), ToParm)); +} + TEST_P(ASTImporterOptionSpecificTestBase, ExistingUndeclaredImportDeclaredFriend) { Decl *ToTU = getToTuDecl( From c87952775fffff125ecaf1b695cea994e108a9e8 Mon Sep 17 00:00:00 2001 From: David Spickett Date: Mon, 13 Jan 2025 09:05:18 +0000 Subject: [PATCH 019/102] [ci] Handle the case where all reported tests pass but the build is still a failure (#120264) In this build: https://buildkite.com/llvm-project/github-pull-requests/builds/126961 The builds actually failed, probably because prerequisite of a test suite failed to build. However they still ran other tests and all those passed. This meant that the test reports were green even though the build was red. On some level this is technically correct, but it is very misleading in practice. So I've also passed the build script's return code, as it was when we entered the on exit handler, to the generator, so that when this happens again, the report will draw the viewer's attention to the overall failure. There will be a link in the report to the build's log file, so the next step to investigate is clear. It would be nice to say "tests failed and there was some other build error", but we cannot tell what the non-zero return code was caused by. Could be either. The script handles the following situations now: | Have Result Files? | Tests reported failed? | Return code | Report | |--------------------|------------------------|-------------|-----------------------------------------------------------------------------| | Yes | No | 0 | Success style report. | | Yes | Yes | 0 | Shouldn't happen, but if it did, failure style report showing the failures. | | Yes | No | 1 | Failure style report, showing no failures but noting that the build failed. | | Yes | Yes | 1 | Failure style report, showing the test failures. | | No | ? | 0 | No test report, success shown in the normal build display. 
| | No | ? | 1 | No test report, failure shown in the normal build display. | --- .ci/generate_test_report.py | 102 +++++++++++++++++++++++++++++++----- .ci/monolithic-linux.sh | 4 +- .ci/monolithic-windows.sh | 4 +- 3 files changed, 94 insertions(+), 16 deletions(-) diff --git a/.ci/generate_test_report.py b/.ci/generate_test_report.py index ff601a0cde106..6f2137e7803bb 100644 --- a/.ci/generate_test_report.py +++ b/.ci/generate_test_report.py @@ -19,12 +19,13 @@ def junit_from_xml(xml): class TestReports(unittest.TestCase): def test_title_only(self): - self.assertEqual(_generate_report("Foo", []), ("", "success")) + self.assertEqual(_generate_report("Foo", 0, []), ("", "success")) def test_no_tests_in_testsuite(self): self.assertEqual( _generate_report( "Foo", + 1, [ junit_from_xml( dedent( @@ -45,6 +46,7 @@ def test_no_failures(self): self.assertEqual( _generate_report( "Foo", + 0, [ junit_from_xml( dedent( @@ -70,10 +72,51 @@ def test_no_failures(self): ), ) + def test_no_failures_build_failed(self): + self.assertEqual( + _generate_report( + "Foo", + 1, + [ + junit_from_xml( + dedent( + """\ + + + + + + """ + ) + ) + ], + buildkite_info={ + "BUILDKITE_ORGANIZATION_SLUG": "organization_slug", + "BUILDKITE_PIPELINE_SLUG": "pipeline_slug", + "BUILDKITE_BUILD_NUMBER": "build_number", + "BUILDKITE_JOB_ID": "job_id", + }, + ), + ( + dedent( + """\ + # Foo + + * 1 test passed + + All tests passed but another part of the build **failed**. + + [Download](https://buildkite.com/organizations/organization_slug/pipelines/pipeline_slug/builds/build_number/jobs/job_id/download.txt) the build's log file to see the details.""" + ), + "error", + ), + ) + def test_report_single_file_single_testsuite(self): self.assertEqual( _generate_report( "Foo", + 1, [ junit_from_xml( dedent( @@ -166,6 +209,7 @@ def test_report_single_file_multiple_testsuites(self): self.assertEqual( _generate_report( "ABC and DEF", + 1, [ junit_from_xml( dedent( @@ -198,6 +242,7 @@ def test_report_multiple_files_multiple_testsuites(self): self.assertEqual( _generate_report( "ABC and DEF", + 1, [ junit_from_xml( dedent( @@ -238,6 +283,7 @@ def test_report_dont_list_failures(self): self.assertEqual( _generate_report( "Foo", + 1, [ junit_from_xml( dedent( @@ -272,6 +318,7 @@ def test_report_dont_list_failures_link_to_log(self): self.assertEqual( _generate_report( "Foo", + 1, [ junit_from_xml( dedent( @@ -312,6 +359,7 @@ def test_report_size_limit(self): self.assertEqual( _generate_report( "Foo", + 1, [ junit_from_xml( dedent( @@ -351,12 +399,18 @@ def test_report_size_limit(self): # and output will not be. def _generate_report( title, + return_code, junit_objects, size_limit=1024 * 1024, list_failures=True, buildkite_info=None, ): if not junit_objects: + # Note that we do not post an empty report, therefore we can ignore a + # non-zero return code in situations like this. + # + # If we were going to post a report, then yes, it would be misleading + # to say we succeeded when the final return code was non-zero. return ("", "success") failures = {} @@ -385,7 +439,11 @@ def _generate_report( if not tests_run: return ("", None) - style = "error" if tests_failed else "success" + style = "success" + # Either tests failed, or all tests passed but something failed to build. 
+ if tests_failed or return_code != 0: + style = "error" + report = [f"# {title}", ""] tests_passed = tests_run - tests_skipped - tests_failed @@ -400,17 +458,17 @@ def plural(num_tests): if tests_failed: report.append(f"* {tests_failed} {plural(tests_failed)} failed") - if not list_failures: - if buildkite_info is not None: - log_url = ( - "https://buildkite.com/organizations/{BUILDKITE_ORGANIZATION_SLUG}/" - "pipelines/{BUILDKITE_PIPELINE_SLUG}/builds/{BUILDKITE_BUILD_NUMBER}/" - "jobs/{BUILDKITE_JOB_ID}/download.txt".format(**buildkite_info) - ) - download_text = f"[Download]({log_url})" - else: - download_text = "Download" + if buildkite_info is not None: + log_url = ( + "https://buildkite.com/organizations/{BUILDKITE_ORGANIZATION_SLUG}/" + "pipelines/{BUILDKITE_PIPELINE_SLUG}/builds/{BUILDKITE_BUILD_NUMBER}/" + "jobs/{BUILDKITE_JOB_ID}/download.txt".format(**buildkite_info) + ) + download_text = f"[Download]({log_url})" + else: + download_text = "Download" + if not list_failures: report.extend( [ "", @@ -435,11 +493,23 @@ def plural(num_tests): "", ] ) + elif return_code != 0: + # No tests failed but the build was in a failed state. Bring this to the user's + # attention. + report.extend( + [ + "", + "All tests passed but another part of the build **failed**.", + "", + f"{download_text} the build's log file to see the details.", + ] + ) report = "\n".join(report) if len(report.encode("utf-8")) > size_limit: return _generate_report( title, + return_code, junit_objects, size_limit, list_failures=False, @@ -449,9 +519,10 @@ def plural(num_tests): return report, style -def generate_report(title, junit_files, buildkite_info): +def generate_report(title, return_code, junit_files, buildkite_info): return _generate_report( title, + return_code, [JUnitXml.fromfile(p) for p in junit_files], buildkite_info=buildkite_info, ) @@ -463,6 +534,7 @@ def generate_report(title, junit_files, buildkite_info): "title", help="Title of the test report, without Markdown formatting." ) parser.add_argument("context", help="Annotation context to write to.") + parser.add_argument("return_code", help="The build's return code.", type=int) parser.add_argument("junit_files", help="Paths to JUnit report files.", nargs="*") args = parser.parse_args() @@ -477,7 +549,9 @@ def generate_report(title, junit_files, buildkite_info): if len(buildkite_info) != len(env_var_names): buildkite_info = None - report, style = generate_report(args.title, args.junit_files, buildkite_info) + report, style = generate_report( + args.title, args.return_code, args.junit_files, buildkite_info + ) if report: p = subprocess.Popen( diff --git a/.ci/monolithic-linux.sh b/.ci/monolithic-linux.sh index 4bfebd5f75279..55741bc831046 100755 --- a/.ci/monolithic-linux.sh +++ b/.ci/monolithic-linux.sh @@ -29,6 +29,8 @@ if [[ -n "${CLEAR_CACHE:-}" ]]; then fi function at-exit { + retcode=$? + mkdir -p artifacts ccache --print-stats > artifacts/ccache_stats.txt @@ -37,7 +39,7 @@ function at-exit { if command -v buildkite-agent 2>&1 >/dev/null then python3 "${MONOREPO_ROOT}"/.ci/generate_test_report.py ":linux: Linux x64 Test Results" \ - "linux-x64-test-results" "${BUILD_DIR}"/test-results.*.xml + "linux-x64-test-results" $retcode "${BUILD_DIR}"/test-results.*.xml fi } trap at-exit EXIT diff --git a/.ci/monolithic-windows.sh b/.ci/monolithic-windows.sh index 25cdd2f419f47..68303a3ea153a 100755 --- a/.ci/monolithic-windows.sh +++ b/.ci/monolithic-windows.sh @@ -28,6 +28,8 @@ fi sccache --zero-stats function at-exit { + retcode=$? 
+ mkdir -p artifacts sccache --show-stats >> artifacts/sccache_stats.txt @@ -36,7 +38,7 @@ function at-exit { if command -v buildkite-agent 2>&1 >/dev/null then python "${MONOREPO_ROOT}"/.ci/generate_test_report.py ":windows: Windows x64 Test Results" \ - "windows-x64-test-results" "${BUILD_DIR}"/test-results.*.xml + "windows-x64-test-results" $retcode "${BUILD_DIR}"/test-results.*.xml fi } trap at-exit EXIT From 5e3ad2519657fdea1d92901d60f552acaf176d66 Mon Sep 17 00:00:00 2001 From: xiaoleis-nv <99947620+xiaoleis-nv@users.noreply.github.com> Date: Mon, 13 Jan 2025 17:33:05 +0800 Subject: [PATCH 020/102] [MLIR][NVVM] Fix the datatype error for nvvm.mma.sync when the operand is bf16 (#122664) The PR fixes the datatype error for `nvvm.mma.sync` when the operand is `bf16`. This operation originally requires the A/B type to be `f16x2` for the `bf16` MMA. However, it violates the NVVM intrinsic [[here](https://github.com/xiaoleis-nv/llvm-project/blob/372044ee09d39942925824f8f335aef40bfe92f0/llvm/include/llvm/IR/IntrinsicsNVVM.td#L119)], where the A/B operand type should be `i32`. This is a bug, and there are no tests in MLIR that cover this datatype. ``` // mma bf16 -> s32 @ m16n8k16/m16n8k8 !eq(gft,"m16n8k16:a:bf16") : !listsplat(llvm_i32_ty, 4), !eq(gft,"m16n8k16:b:bf16") : !listsplat(llvm_i32_ty, 2), !eq(gft,"m16n8k8:a:bf16") : !listsplat(llvm_i32_ty, 2), !eq(gft,"m16n8k8:b:bf16") : [llvm_i32_ty], ``` This PR addresses this bug and adds tests to guarantee correctness. Co-authored-by: Xiaolei Shi --- mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 4 ++-- mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp | 7 ++++++- mlir/test/Dialect/LLVMIR/nvvm.mlir | 23 +++++++++++++++++++++ mlir/test/Target/LLVMIR/nvvmir.mlir | 12 +++++++++++ 4 files changed, 43 insertions(+), 3 deletions(-) diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td index 0b9097e9bbca2..04042903e343e 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td @@ -1699,8 +1699,8 @@ def NVVM_MmaOp : NVVM_Op<"mma.sync", [AttrSizedOperandSegments]> { | f16 | .m8n8k4 | row/col | row/col | 2x f16x2 | 2x f16x2 | 4x f16x2 or 8xf32 | | | .m16n8k8 | row | col | 2x f16x2 | 1x f16x2 | 2x f16x2 or 4 f32 | | | .m16n8k16 | row | col | 4x f16x2 | 2x f16x2 | 2x f16x2 or 4 f32 | - | bf16 | .m16n8k8 | row | col | 2x f16x2 | 1x f16x2 | 2x f16x2 or 4 f32 | - | | .m16n8k16 | row | col | 4x f16x2 | 2x f16x2 | 2x f16x2 or 4 f32 | + | bf16 | .m16n8k8 | row | col | 2x i32 | 1x i32 | 4x f32 | + | | .m16n8k16 | row | col | 4x i32 | 2x i32 | 4x f32 | | tf32 | .m16n8k4 | row | col | 2x i32 | 1x i32 | 4x f32 | | | .m16n8k8 | row | col | 4x i32 | 2x i32 | 2x f16x2 or 4 f32 | | u8/s8 | .m8n8k16 | row | col | 1x i32 | 1x i32 | 2x i32 | diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp index 838159d676545..d8fde3e765ac4 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp @@ -445,8 +445,13 @@ LogicalResult MmaOp::verify() { expectedResult.push_back(LLVM::LLVMStructType::getLiteral( context, {f32Ty, f32Ty, f32Ty, f32Ty})); break; - case MMATypes::f16: case MMATypes::bf16: + kFactor = 8; + multiplicandFragType = i32Ty; + expectedResult.push_back(LLVM::LLVMStructType::getLiteral( + context, {f32Ty, f32Ty, f32Ty, f32Ty})); + break; + case MMATypes::f16: kFactor = 8; multiplicandFragType = f16x2Ty; expectedResult.push_back(f16x2x2StructTy); diff --git 
a/mlir/test/Dialect/LLVMIR/nvvm.mlir b/mlir/test/Dialect/LLVMIR/nvvm.mlir
index a7bdceba01c1e..4c3b6648a41c0 100644
--- a/mlir/test/Dialect/LLVMIR/nvvm.mlir
+++ b/mlir/test/Dialect/LLVMIR/nvvm.mlir
@@ -163,6 +163,29 @@ func.func @nvvm_mma_m8n8k4_f16_f16(%a0 : vector<2xf16>, %a1 : vector<2xf16>,
   llvm.return %0 : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
 }
 
+// CHECK-LABEL: @nvvm_mma_m16n8k8_bf16_bf16
+func.func @nvvm_mma_m16n8k8_bf16_bf16(%a0 : i32, %a1 : i32, %b0 : i32,
+                                      %c0 : f32, %c1 : f32, %c2 : f32, %c3 : f32) {
+  // CHECK: nvvm.mma.sync A[{{.*}}] B[{{.*}}] C[{{.*}}] {layoutA = #nvvm.mma_layout<row>, layoutB = #nvvm.mma_layout<col>, multiplicandAPtxType = #nvvm.mma_type<bf16>, multiplicandBPtxType = #nvvm.mma_type<bf16>, shape = #nvvm.shape<m = 16, n = 8, k = 8>} : (i32, i32, f32) -> !llvm.struct<(f32, f32, f32, f32)>
+  %0 = nvvm.mma.sync A[%a0, %a1] B[%b0] C[%c0, %c1, %c2, %c3]
+      {layoutA = #nvvm.mma_layout<row>, layoutB = #nvvm.mma_layout<col>,
+       multiplicandAPtxType = #nvvm.mma_type<bf16>, multiplicandBPtxType = #nvvm.mma_type<bf16>,
+       shape = #nvvm.shape<m = 16, n = 8, k = 8>} : (i32, i32, f32) -> !llvm.struct<(f32, f32, f32, f32)>
+  llvm.return %0 : !llvm.struct<(f32, f32, f32, f32)>
+}
+
+// CHECK-LABEL: @nvvm_mma_m16n8k16_bf16_bf16
+func.func @nvvm_mma_m16n8k16_bf16_bf16(%a0 : i32, %a1 : i32, %a2 : i32, %a3 : i32,
+                                       %b0 : i32, %b1 : i32,
+                                       %c0 : f32, %c1 : f32, %c2 : f32, %c3 : f32) {
+  // CHECK: nvvm.mma.sync A[{{.*}}] B[{{.*}}] C[{{.*}}] {layoutA = #nvvm.mma_layout<row>, layoutB = #nvvm.mma_layout<col>, multiplicandAPtxType = #nvvm.mma_type<bf16>, multiplicandBPtxType = #nvvm.mma_type<bf16>, shape = #nvvm.shape<m = 16, n = 8, k = 16>} : (i32, i32, f32) -> !llvm.struct<(f32, f32, f32, f32)>
+  %0 = nvvm.mma.sync A[%a0, %a1, %a2, %a3] B[%b0, %b1] C[%c0, %c1, %c2, %c3]
+      {layoutA = #nvvm.mma_layout<row>, layoutB = #nvvm.mma_layout<col>,
+       multiplicandAPtxType = #nvvm.mma_type<bf16>, multiplicandBPtxType = #nvvm.mma_type<bf16>,
+       shape = #nvvm.shape<m = 16, n = 8, k = 16>} : (i32, i32, f32) -> !llvm.struct<(f32, f32, f32, f32)>
+  llvm.return %0 : !llvm.struct<(f32, f32, f32, f32)>
+}
+
 // CHECK-LABEL: @nvvm_mma_m8n8k16_s8_s8
 func.func @nvvm_mma_m8n8k16_s8_s8(%a0 : i32, %b0 : i32,
                                   %c0 : i32, %c1 : i32) {
diff --git a/mlir/test/Target/LLVMIR/nvvmir.mlir b/mlir/test/Target/LLVMIR/nvvmir.mlir
index 2d7710e7cbf27..09e98765413f0 100644
--- a/mlir/test/Target/LLVMIR/nvvmir.mlir
+++ b/mlir/test/Target/LLVMIR/nvvmir.mlir
@@ -291,6 +291,18 @@ llvm.func @nvvm_mma_m16n8k16_f16_f16(%a0 : vector<2xf16>, %a1 : vector<2xf16>,
   llvm.return %0 : !llvm.struct<(vector<2xf16>, vector<2xf16>)>
 }
 
+// CHECK-LABEL: @nvvm_mma_m16n8k16_bf16_bf16
+llvm.func @nvvm_mma_m16n8k16_bf16_bf16(%a0 : i32, %a1 : i32, %a2 : i32, %a3 : i32,
+                                       %b0 : i32, %b1 : i32,
+                                       %c0 : f32, %c1 : f32, %c2 : f32, %c3 : f32) -> !llvm.struct<(f32, f32, f32, f32)> {
+  // CHECK: call { float, float, float, float } @llvm.nvvm.mma.m16n8k16.row.col.bf16
+  %0 = nvvm.mma.sync A[%a0, %a1, %a2, %a3] B[%b0, %b1] C[%c0, %c1, %c2, %c3]
+      {layoutA = #nvvm.mma_layout<row>, layoutB = #nvvm.mma_layout<col>,
+       multiplicandAPtxType = #nvvm.mma_type<bf16>, multiplicandBPtxType = #nvvm.mma_type<bf16>,
+       shape = #nvvm.shape<m = 16, n = 8, k = 16>} : (i32, i32, f32) -> !llvm.struct<(f32, f32, f32, f32)>
+  llvm.return %0 : !llvm.struct<(f32, f32, f32, f32)>
+}
+
 // f32 return type, f16 accumulate type
 // CHECK-LABEL: @nvvm_mma_m16n8k16_f32_f16
 llvm.func @nvvm_mma_m16n8k16_f32_f16(%a0 : vector<2xf16>, %a1 : vector<2xf16>,
From d6708f4bb443e9e928bf891ebe302c86d46dc94a Mon Sep 17 00:00:00 2001
From: Oliver Stannard
Date: Mon, 13 Jan 2025 09:55:08 +0000
Subject: [PATCH 021/102] [MachineCP] Correctly handle register masks and
 sub-registers (#122472)
When processing an instruction with a register mask, the machine copy
propagation pass was dropping the information about some copy
instructions which define a register which is preserved by the mask,
because that register overlaps a register which is partially clobbered
by it. This resulted in a miscompilation for AArch64, because this
caused a live copy to be considered dead.

The fix is to clobber register masks by finding the set of reg units
which is preserved by the mask, and clobbering all units not in that
set.

---
 llvm/lib/CodeGen/MachineCopyPropagation.cpp   | 136 ++++++++++--------
 .../CodeGen/AArch64/machine-cp-sub-reg.mir    |  32 ++++-
 2 files changed, 111 insertions(+), 57 deletions(-)

diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
index 49ce4b660c3ae..d2579e2d1b44c 100644
--- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp
+++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
@@ -164,67 +164,91 @@ class CopyTracker {
     Copies.erase(Unit);
   }
 
-  /// Clobber a single register, removing it from the tracker's copy maps.
-  void clobberRegister(MCRegister Reg, const TargetRegisterInfo &TRI,
-                       const TargetInstrInfo &TII, bool UseCopyInstr) {
-    for (MCRegUnit Unit : TRI.regunits(Reg)) {
-      auto I = Copies.find(Unit);
-      if (I != Copies.end()) {
-        // When we clobber the source of a copy, we need to clobber everything
-        // it defined.
-        markRegsUnavailable(I->second.DefRegs, TRI);
-        // When we clobber the destination of a copy, we need to clobber the
-        // whole register it defined.
-        if (MachineInstr *MI = I->second.MI) {
-          std::optional<DestSourcePair> CopyOperands =
-              isCopyInstr(*MI, TII, UseCopyInstr);
-
-          MCRegister Def = CopyOperands->Destination->getReg().asMCReg();
-          MCRegister Src = CopyOperands->Source->getReg().asMCReg();
-
-          markRegsUnavailable(Def, TRI);
-
-          // Since we clobber the destination of a copy, the semantic of Src's
-          // "DefRegs" to contain Def is no longer effectual. We will also need
-          // to remove the record from the copy maps that indicates Src defined
-          // Def. Failing to do so might cause the target to miss some
-          // opportunities to further eliminate redundant copy instructions.
-          // Consider the following sequence during the
-          // ForwardCopyPropagateBlock procedure:
-          // L1: r0 = COPY r9 <- TrackMI
-          // L2: r0 = COPY r8 <- TrackMI (Remove r9 defined r0 from tracker)
-          // L3: use r0 <- Remove L2 from MaybeDeadCopies
-          // L4: early-clobber r9 <- Clobber r9 (L2 is still valid in tracker)
-          // L5: r0 = COPY r8 <- Remove NopCopy
-          for (MCRegUnit SrcUnit : TRI.regunits(Src)) {
-            auto SrcCopy = Copies.find(SrcUnit);
-            if (SrcCopy != Copies.end() && SrcCopy->second.LastSeenUseInCopy) {
-              // If SrcCopy defines multiple values, we only need
-              // to erase the record for Def in DefRegs.
-              for (auto itr = SrcCopy->second.DefRegs.begin();
-                   itr != SrcCopy->second.DefRegs.end(); itr++) {
-                if (*itr == Def) {
-                  SrcCopy->second.DefRegs.erase(itr);
-                  // If DefReg becomes empty after removal, we can remove the
-                  // SrcCopy from the tracker's copy maps. We only remove those
-                  // entries solely record the Def is defined by Src. If an
-                  // entry also contains the definition record of other Def'
-                  // registers, it cannot be cleared.
-                  if (SrcCopy->second.DefRegs.empty() && !SrcCopy->second.MI) {
-                    Copies.erase(SrcCopy);
-                  }
-                  break;
+  /// Clobber a single register unit, removing it from the tracker's copy maps.
+ void clobberRegUnit(MCRegUnit Unit, const TargetRegisterInfo &TRI, + const TargetInstrInfo &TII, bool UseCopyInstr) { + auto I = Copies.find(Unit); + if (I != Copies.end()) { + // When we clobber the source of a copy, we need to clobber everything + // it defined. + markRegsUnavailable(I->second.DefRegs, TRI); + // When we clobber the destination of a copy, we need to clobber the + // whole register it defined. + if (MachineInstr *MI = I->second.MI) { + std::optional CopyOperands = + isCopyInstr(*MI, TII, UseCopyInstr); + + MCRegister Def = CopyOperands->Destination->getReg().asMCReg(); + MCRegister Src = CopyOperands->Source->getReg().asMCReg(); + + markRegsUnavailable(Def, TRI); + + // Since we clobber the destination of a copy, the semantic of Src's + // "DefRegs" to contain Def is no longer effectual. We will also need + // to remove the record from the copy maps that indicates Src defined + // Def. Failing to do so might cause the target to miss some + // opportunities to further eliminate redundant copy instructions. + // Consider the following sequence during the + // ForwardCopyPropagateBlock procedure: + // L1: r0 = COPY r9 <- TrackMI + // L2: r0 = COPY r8 <- TrackMI (Remove r9 defined r0 from tracker) + // L3: use r0 <- Remove L2 from MaybeDeadCopies + // L4: early-clobber r9 <- Clobber r9 (L2 is still valid in tracker) + // L5: r0 = COPY r8 <- Remove NopCopy + for (MCRegUnit SrcUnit : TRI.regunits(Src)) { + auto SrcCopy = Copies.find(SrcUnit); + if (SrcCopy != Copies.end() && SrcCopy->second.LastSeenUseInCopy) { + // If SrcCopy defines multiple values, we only need + // to erase the record for Def in DefRegs. + for (auto itr = SrcCopy->second.DefRegs.begin(); + itr != SrcCopy->second.DefRegs.end(); itr++) { + if (*itr == Def) { + SrcCopy->second.DefRegs.erase(itr); + // If DefReg becomes empty after removal, we can remove the + // SrcCopy from the tracker's copy maps. We only remove those + // entries solely record the Def is defined by Src. If an + // entry also contains the definition record of other Def' + // registers, it cannot be cleared. + if (SrcCopy->second.DefRegs.empty() && !SrcCopy->second.MI) { + Copies.erase(SrcCopy); } + break; } } } } - // Now we can erase the copy. - Copies.erase(I); } + // Now we can erase the copy. + Copies.erase(I); } } + /// Clobber a single register, removing it from the tracker's copy maps. + void clobberRegister(MCRegister Reg, const TargetRegisterInfo &TRI, + const TargetInstrInfo &TII, bool UseCopyInstr) { + for (MCRegUnit Unit : TRI.regunits(Reg)) { + clobberRegUnit(Unit, TRI, TII, UseCopyInstr); + } + } + + /// Clobber all registers which are not preserved by RegMask, removing them + /// from the tracker's copy maps. + void clobberRegistersExceptMask(const MachineOperand *RegMask, + const TargetRegisterInfo &TRI, + const TargetInstrInfo &TII, + bool UseCopyInstr) { + BitVector SafeRegUnits(TRI.getNumRegUnits()); + + for (unsigned SafeReg = 0, E = TRI.getNumRegs(); SafeReg < E; ++SafeReg) + if (!RegMask->clobbersPhysReg(SafeReg)) + for (auto SafeUnit : TRI.regunits(SafeReg)) + SafeRegUnits.set(SafeUnit); + + for (unsigned Unit = 0, E = TRI.getNumRegUnits(); Unit < E; ++Unit) + if (!SafeRegUnits.test(Unit)) + clobberRegUnit(Unit, TRI, TII, UseCopyInstr); + } + /// Track copy's src users, and return false if that can't be done. /// We can only track if we have a COPY instruction which source is /// the same as the Reg. 
@@ -960,6 +984,10 @@ void MachineCopyPropagation::ForwardCopyPropagateBlock(MachineBasicBlock &MBB) { // a large set of registers. Treat clobbered registers the same way as // defined registers. if (RegMask) { + // Invalidate all entries in the copy map which are not preserved by this + // register mask. + Tracker.clobberRegistersExceptMask(RegMask, *TRI, *TII, UseCopyInstr); + // Erase any MaybeDeadCopies whose destination register is clobbered. for (SmallSetVector::iterator DI = MaybeDeadCopies.begin(); @@ -978,10 +1006,6 @@ void MachineCopyPropagation::ForwardCopyPropagateBlock(MachineBasicBlock &MBB) { LLVM_DEBUG(dbgs() << "MCP: Removing copy due to regmask clobbering: "; MaybeDead->dump()); - // Make sure we invalidate any entries in the copy maps before erasing - // the instruction. - Tracker.clobberRegister(Reg, *TRI, *TII, UseCopyInstr); - // erase() will return the next valid iterator pointing to the next // element after the erased one. DI = MaybeDeadCopies.erase(DI); diff --git a/llvm/test/CodeGen/AArch64/machine-cp-sub-reg.mir b/llvm/test/CodeGen/AArch64/machine-cp-sub-reg.mir index 5b379c2bd5629..e7865569c75bd 100644 --- a/llvm/test/CodeGen/AArch64/machine-cp-sub-reg.mir +++ b/llvm/test/CodeGen/AArch64/machine-cp-sub-reg.mir @@ -1,5 +1,16 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 -# RUN: llc -o - %s --run-pass=machine-cp -mcp-use-is-copy-instr -mtriple=arm64-apple-macos --verify-machineinstrs | FileCheck %s +# RUN: llc -o - %s --run-pass=machine-cp -mcp-use-is-copy-instr -mtriple=arm64-apple-macos | FileCheck %s + +--- | + declare void @foo() + + define void @test() { + unreachable + } + define void @test2() { + unreachable + } +... --- name: test @@ -30,3 +41,22 @@ body: | RET undef $lr, implicit $x0 ... +--- +name: test2 +tracksRegLiveness: true +body: | + bb.0: + liveins: $q14, $d29, $x0, $x1 + ; CHECK-LABEL: name: test2 + ; CHECK: liveins: $q14, $d29, $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $d8 = COPY killed renamable $d29 + ; CHECK-NEXT: BL @foo, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + ; CHECK-NEXT: renamable $b0 = SMAXVv8i8v killed renamable $d8, implicit-def $q0 + ; CHECK-NEXT: RET_ReallyLR implicit $b0 + renamable $q8 = COPY renamable $q14 + renamable $d8 = COPY killed renamable $d29 + BL @foo, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + renamable $b0 = SMAXVv8i8v killed renamable $d8, implicit-def $q0 + RET_ReallyLR implicit $b0 +... 
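For readers unfamiliar with register masks, a minimal standalone C++ sketch
of the query they support follows; it mirrors the semantics of
MachineOperand::clobbersPhysReg (a *set* bit means the register is
preserved) and is an illustration, not part of the patch above. It also
shows why the fix walks all registers, collects the units of the preserved
ones, and clobbers every unit outside that set: iterating only over the
clobbered registers can miss a preserved register whose units overlap a
partially clobbered one.

  #include <cstdint>
  #include <cstdio>

  // A set bit in the mask means the register is preserved across the
  // instruction; the helper answers "is PhysReg clobbered?".
  static bool clobbersPhysReg(const uint32_t *Mask, unsigned PhysReg) {
    return !(Mask[PhysReg / 32] & (1u << (PhysReg % 32)));
  }

  int main() {
    uint32_t Mask[1] = {0b0101u}; // regs 0 and 2 preserved; 1 and 3 clobbered
    std::printf("%d %d\n", clobbersPhysReg(Mask, 0), clobbersPhysReg(Mask, 1));
  }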
From 9c6dad04e6bf264b5a17373c96426dc6db9f3483 Mon Sep 17 00:00:00 2001 From: xtex Date: Mon, 13 Jan 2025 10:12:23 +0000 Subject: [PATCH 022/102] Revert "[clang] Canonicalize absolute paths in dependency file" (#121638) Reverts llvm/llvm-project#117458 https://github.com/llvm/llvm-project/pull/117458#issuecomment-2568804774 https://github.com/ninja-build/ninja/issues/2528 --- clang/include/clang/Frontend/Utils.h | 1 - clang/lib/Frontend/DependencyFile.cpp | 20 +++---------------- clang/test/Frontend/dependency-gen-symlink.c | 2 +- .../dependency-gen-windows-duplicates.c | 2 +- clang/test/VFS/external-names.c | 2 +- 5 files changed, 6 insertions(+), 21 deletions(-) diff --git a/clang/include/clang/Frontend/Utils.h b/clang/include/clang/Frontend/Utils.h index 8ed17179c9824..604e42067a3f1 100644 --- a/clang/include/clang/Frontend/Utils.h +++ b/clang/include/clang/Frontend/Utils.h @@ -120,7 +120,6 @@ class DependencyFileGenerator : public DependencyCollector { private: void outputDependencyFile(DiagnosticsEngine &Diags); - llvm::IntrusiveRefCntPtr FS; std::string OutputFile; std::vector Targets; bool IncludeSystemHeaders; diff --git a/clang/lib/Frontend/DependencyFile.cpp b/clang/lib/Frontend/DependencyFile.cpp index 8a36d835d82b3..15fa7de35df97 100644 --- a/clang/lib/Frontend/DependencyFile.cpp +++ b/clang/lib/Frontend/DependencyFile.cpp @@ -23,10 +23,8 @@ #include "llvm/ADT/StringSet.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" -#include "llvm/Support/VirtualFileSystem.h" #include "llvm/Support/raw_ostream.h" #include -#include using namespace clang; @@ -238,7 +236,6 @@ void DependencyFileGenerator::attachToPreprocessor(Preprocessor &PP) { PP.SetSuppressIncludeNotFoundError(true); DependencyCollector::attachToPreprocessor(PP); - FS = PP.getFileManager().getVirtualFileSystemPtr(); } bool DependencyFileGenerator::sawDependency(StringRef Filename, bool FromModule, @@ -315,22 +312,11 @@ void DependencyFileGenerator::finishedMainFile(DiagnosticsEngine &Diags) { /// https://msdn.microsoft.com/en-us/library/dd9y37ha.aspx for NMake info, /// https://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx /// for Windows file-naming info. -static void printFilename(raw_ostream &OS, llvm::vfs::FileSystem *FS, - StringRef Filename, +static void PrintFilename(raw_ostream &OS, StringRef Filename, DependencyOutputFormat OutputFormat) { // Convert filename to platform native path llvm::SmallString<256> NativePath; llvm::sys::path::native(Filename.str(), NativePath); - // Resolve absolute path. Make and Ninja canonicalize paths - // without checking for symbolic links in the path, for performance concerns. - // If there is something like `/bin/../lib64` -> `/usr/lib64` - // (where `/bin` links to `/usr/bin`), Make will see them as `/lib64`. - if (FS != nullptr && llvm::sys::path::is_absolute(NativePath)) { - llvm::SmallString<256> NativePathTmp = NativePath; - std::error_code EC = FS->getRealPath(NativePathTmp, NativePath); - if (EC) - NativePath = NativePathTmp; - } if (OutputFormat == DependencyOutputFormat::NMake) { // Add quotes if needed. 
These are the characters listed as "special" to @@ -414,7 +400,7 @@ void DependencyFileGenerator::outputDependencyFile(llvm::raw_ostream &OS) { Columns = 2; } OS << ' '; - printFilename(OS, FS.get(), File, OutputFormat); + PrintFilename(OS, File, OutputFormat); Columns += N + 1; } OS << '\n'; @@ -425,7 +411,7 @@ void DependencyFileGenerator::outputDependencyFile(llvm::raw_ostream &OS) { for (auto I = Files.begin(), E = Files.end(); I != E; ++I) { if (Index++ == InputFileIndex) continue; - printFilename(OS, FS.get(), *I, OutputFormat); + PrintFilename(OS, *I, OutputFormat); OS << ":\n"; } } diff --git a/clang/test/Frontend/dependency-gen-symlink.c b/clang/test/Frontend/dependency-gen-symlink.c index 15664a46b90c8..2fa339ad2abf2 100644 --- a/clang/test/Frontend/dependency-gen-symlink.c +++ b/clang/test/Frontend/dependency-gen-symlink.c @@ -15,7 +15,7 @@ // CHECK: dependency-gen-symlink.c.o // CHECK: dependency-gen-symlink.c // CHECK: a/header.h -// CHECK-NOT: b/header.h +// CHECK: b/header.h // CHECK-NOT: with-header-guard.h #include "a/header.h" #include "b/header.h" diff --git a/clang/test/Frontend/dependency-gen-windows-duplicates.c b/clang/test/Frontend/dependency-gen-windows-duplicates.c index 0ecc23226fb9c..abd351377dc33 100644 --- a/clang/test/Frontend/dependency-gen-windows-duplicates.c +++ b/clang/test/Frontend/dependency-gen-windows-duplicates.c @@ -9,7 +9,7 @@ // RUN: %clang -MD -MF - %t.dir/test.c -fsyntax-only -I %t.dir/subdir | FileCheck %s // CHECK: test.o: // CHECK-NEXT: \test.c -// CHECK-NEXT: \subdir\x.h +// CHECK-NEXT: \SubDir\X.h // File x.h must appear only once (case insensitive check). // CHECK-NOT: {{\\|/}}{{x|X}}.{{h|H}} diff --git a/clang/test/VFS/external-names.c b/clang/test/VFS/external-names.c index dd0b5eb501840..5b7c443b36e56 100644 --- a/clang/test/VFS/external-names.c +++ b/clang/test/VFS/external-names.c @@ -47,4 +47,4 @@ // RUN: %clang_cc1 -D REINCLUDE -I %t -ivfsoverlay %t.yaml -Eonly %s -MTfoo -dependency-file %t.dep // RUN: cat %t.dep | FileCheck --check-prefix=CHECK-DEP %s -// CHECK-DEP: Inputs{{..?}}external-names.h +// CHECK-DEP-NOT: Inputs From d41b701bed47e37d28ca42cac6436a13776483cb Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Mon, 13 Jan 2025 11:24:02 +0100 Subject: [PATCH 023/102] [Clang] Add release note for pointer overflow optimization change (#122462) Add a release note for optimization change related to pointer overflow checks. I've put this in the breaking changes section to give it the best chance of being seen. --- clang/docs/ReleaseNotes.rst | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index a14fb189c8e13..8f4adbcd70518 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -58,6 +58,29 @@ code bases. containing strict-aliasing violations. The new default behavior can be disabled using ``-fno-pointer-tbaa``. +- Clang will now more aggressively use undefined behavior on pointer addition + overflow for optimization purposes. For example, a check like + ``ptr + unsigned_offset < ptr`` will now optimize to ``false``, because + ``ptr + unsigned_offset`` will cause undefined behavior if it overflows (or + advances past the end of the object). + + Previously, ``ptr + unsigned_offset < ptr`` was optimized (by both Clang and + GCC) to ``(ssize_t)unsigned_offset < 0``. This also results in an incorrect + overflow check, but in a way that is less apparent when only testing with + pointers in the low half of the address space. 
+ + To avoid pointer addition overflow, it is necessary to perform the addition + on integers, for example using + ``(uintptr_t)ptr + unsigned_offset < (uintptr_t)ptr``. Sometimes, it is also + possible to rewrite checks by only comparing the offset. For example, + ``ptr + offset < end_ptr && ptr + offset >= ptr`` can be written as + ``offset < (uintptr_t)(end_ptr - ptr)``. + + Undefined behavior due to pointer addition overflow can be reliably detected + using ``-fsanitize=pointer-overflow``. It is also possible to use + ``-fno-strict-overflow`` to opt-in to a language dialect where signed integer + and pointer overflow are well-defined. + C/C++ Language Potentially Breaking Changes ------------------------------------------- From 4942343ac8862adf576de5e9c9e60efeaa035808 Mon Sep 17 00:00:00 2001 From: Kareem Ergawy Date: Mon, 13 Jan 2025 11:27:23 +0100 Subject: [PATCH 024/102] [flang][OpenMP] Fix `omp-declarative-allocate-align.f90` expectations (#122675) The test was effectively a no-op since we used `//` instead of `!` for `RUN` and `CHECK` lines. Also, we have to specify the proper OpenMP version. --- .../test/Lower/OpenMP/Todo/omp-declarative-allocate-align.f90 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/flang/test/Lower/OpenMP/Todo/omp-declarative-allocate-align.f90 b/flang/test/Lower/OpenMP/Todo/omp-declarative-allocate-align.f90 index d0ed0cbb4c831..8daf20e1ae400 100644 --- a/flang/test/Lower/OpenMP/Todo/omp-declarative-allocate-align.f90 +++ b/flang/test/Lower/OpenMP/Todo/omp-declarative-allocate-align.f90 @@ -1,10 +1,10 @@ ! This test checks lowering of OpenMP allocate Directive with align clause. -// RUN: not flang -fc1 -emit-fir -fopenmp %s 2>&1 | FileCheck %s +! RUN: not %flang_fc1 -emit-fir -fopenmp -fopenmp-version=51 %s 2>&1 | FileCheck %s program main integer :: x - // CHECK: not yet implemented: OpenMPDeclarativeAllocate + ! CHECK: not yet implemented: OpenMPDeclarativeAllocate !$omp allocate(x) align(32) end From 3a52d3ea7c4d0b8813d456147e0cec200fe6f0fb Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Mon, 13 Jan 2025 10:30:55 +0000 Subject: [PATCH 025/102] [TableGen] Use assert instead of PrintFatalError in TGLexer. NFC. (#122303) Do not use the PrintFatalError diagnostic machinery for conditions that can never happen with any input. 
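As a rule of thumb, the division of labor looks like the sketch below
(hypothetical names, not code from this patch): conditions that a .td input
file can trigger go through the diagnostic machinery, while conditions that
no input can trigger are internal invariants and become asserts, which
compile away in release builds.

  #include <cassert>
  #include <cstdio>

  // Hypothetical stand-in for the real diagnostic entry point.
  static void printFatalError(const char *Msg) {
    std::fprintf(stderr, "error: %s\n", Msg);
  }

  void lexDirective(bool TokenValidForInput, bool InternalInvariantHolds) {
    if (!TokenValidForInput)
      printFatalError("unexpected preprocessor directive"); // user error
    assert(InternalInvariantHolds && "lexer invariant violated"); // our bug
  }

  int main() { lexDirective(true, true); }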
--- llvm/lib/TableGen/TGLexer.cpp | 63 +++++++++++++---------------------- llvm/lib/TableGen/TGLexer.h | 5 ++- 2 files changed, 25 insertions(+), 43 deletions(-) diff --git a/llvm/lib/TableGen/TGLexer.cpp b/llvm/lib/TableGen/TGLexer.cpp index e23aec6efba59..c423023077cd8 100644 --- a/llvm/lib/TableGen/TGLexer.cpp +++ b/llvm/lib/TableGen/TGLexer.cpp @@ -235,8 +235,7 @@ tgtok::TokKind TGLexer::LexToken(bool FileOrLineStart) { return tgtok::dot; case '\r': - PrintFatalError("getNextChar() must never return '\r'"); - return tgtok::Error; + llvm_unreachable("getNextChar() must never return '\r'"); case ' ': case '\t': @@ -664,11 +663,10 @@ bool TGLexer::prepExitInclude(bool IncludeStackMustBeEmpty) { PrepIncludeStack.pop_back(); if (IncludeStackMustBeEmpty) { - if (!PrepIncludeStack.empty()) - PrintFatalError("preprocessor include stack is not empty"); + assert(PrepIncludeStack.empty() && + "preprocessor include stack is not empty"); } else { - if (PrepIncludeStack.empty()) - PrintFatalError("preprocessor include stack is empty"); + assert(!PrepIncludeStack.empty() && "preprocessor include stack is empty"); } return true; @@ -718,27 +716,25 @@ tgtok::TokKind TGLexer::prepIsDirective() const { return tgtok::Error; } -bool TGLexer::prepEatPreprocessorDirective(tgtok::TokKind Kind) { +void TGLexer::prepEatPreprocessorDirective(tgtok::TokKind Kind) { TokStart = CurPtr; - for (const auto [PKind, PWord] : PreprocessorDirs) + for (const auto [PKind, PWord] : PreprocessorDirs) { if (PKind == Kind) { // Advance CurPtr to the end of the preprocessing word. CurPtr += PWord.size(); - return true; + return; } + } - PrintFatalError("unsupported preprocessing token in " - "prepEatPreprocessorDirective()"); - return false; + llvm_unreachable( + "unsupported preprocessing token in prepEatPreprocessorDirective()"); } tgtok::TokKind TGLexer::lexPreprocessor(tgtok::TokKind Kind, bool ReturnNextLiveToken) { // We must be looking at a preprocessing directive. Eat it! 
- if (!prepEatPreprocessorDirective(Kind)) - PrintFatalError("lexPreprocessor() called for unknown " - "preprocessor directive"); + prepEatPreprocessorDirective(Kind); if (Kind == tgtok::Ifdef || Kind == tgtok::Ifndef) { StringRef MacroName = prepLexMacroName(); @@ -820,11 +816,9 @@ tgtok::TokKind TGLexer::lexPreprocessor(tgtok::TokKind Kind, auto &IfdefOrElseEntry = PrepIncludeStack.back().back(); - if (IfdefOrElseEntry.Kind != tgtok::Ifdef && - IfdefOrElseEntry.Kind != tgtok::Else) { - PrintFatalError("invalid preprocessor control on the stack"); - return tgtok::Error; - } + assert((IfdefOrElseEntry.Kind == tgtok::Ifdef || + IfdefOrElseEntry.Kind == tgtok::Else) && + "invalid preprocessor control on the stack"); if (!prepSkipDirectiveEnd()) return ReturnError(CurPtr, "only comments are supported after #endif"); @@ -852,21 +846,17 @@ tgtok::TokKind TGLexer::lexPreprocessor(tgtok::TokKind Kind, return ReturnError(CurPtr, "only comments are supported after #define NAME"); - if (!ReturnNextLiveToken) { - PrintFatalError("#define must be ignored during the lines skipping"); - return tgtok::Error; - } + assert(ReturnNextLiveToken && + "#define must be ignored during the lines skipping"); return LexToken(); } - PrintFatalError("preprocessing directive is not supported"); - return tgtok::Error; + llvm_unreachable("preprocessing directive is not supported"); } bool TGLexer::prepSkipRegion(bool MustNeverBeFalse) { - if (!MustNeverBeFalse) - PrintFatalError("invalid recursion."); + assert(MustNeverBeFalse && "invalid recursion."); do { // Skip all symbols to the line end. @@ -902,20 +892,17 @@ bool TGLexer::prepSkipRegion(bool MustNeverBeFalse) { if (ProcessedKind == tgtok::Error) return false; - if (Kind != ProcessedKind) - PrintFatalError("prepIsDirective() and lexPreprocessor() " - "returned different token kinds"); + assert(Kind == ProcessedKind && "prepIsDirective() and lexPreprocessor() " + "returned different token kinds"); // If this preprocessing directive enables tokens processing, // then return to the lexPreprocessor() and get to the next token. // We can move from line-skipping mode to processing tokens only // due to #else or #endif. if (prepIsProcessingEnabled()) { - if (Kind != tgtok::Else && Kind != tgtok::Endif) { - PrintFatalError("tokens processing was enabled by an unexpected " - "preprocessing directive"); - return false; - } + assert((Kind == tgtok::Else || Kind == tgtok::Endif) && + "tokens processing was enabled by an unexpected preprocessing " + "directive"); return true; } @@ -1053,10 +1040,6 @@ bool TGLexer::prepIsProcessingEnabled() { } void TGLexer::prepReportPreprocessorStackError() { - if (PrepIncludeStack.back().empty()) - PrintFatalError("prepReportPreprocessorStackError() called with " - "empty control stack"); - auto &PrepControl = PrepIncludeStack.back().back(); PrintError(CurBuf.end(), "reached EOF without matching #endif"); PrintError(PrepControl.SrcPos, "the latest preprocessor control is here"); diff --git a/llvm/lib/TableGen/TGLexer.h b/llvm/lib/TableGen/TGLexer.h index f8b32dc5377f5..bac583c4e33a1 100644 --- a/llvm/lib/TableGen/TGLexer.h +++ b/llvm/lib/TableGen/TGLexer.h @@ -347,14 +347,13 @@ class TGLexer { tgtok::TokKind prepIsDirective() const; // Given a preprocessing token kind, adjusts CurPtr to the end - // of the preprocessing directive word. Returns true, unless - // an unsupported token kind is passed in. + // of the preprocessing directive word. 
// // We use look-ahead prepIsDirective() and prepEatPreprocessorDirective() // to avoid adjusting CurPtr before we are sure that '#' is followed // by a preprocessing directive. If it is not, then we fall back to // tgtok::paste interpretation of '#'. - bool prepEatPreprocessorDirective(tgtok::TokKind Kind); + void prepEatPreprocessorDirective(tgtok::TokKind Kind); // The main "exit" point from the token parsing to preprocessor. // From 2a4c6a86b28275287881aa04a194005f1e6684b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bal=C3=A1zs=20K=C3=A9ri?= Date: Mon, 13 Jan 2025 11:35:38 +0100 Subject: [PATCH 026/102] [clang][ASTImporter] Fix unused variable warning (NFC) (#122686) --- clang/unittests/AST/ASTImporterTest.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/unittests/AST/ASTImporterTest.cpp b/clang/unittests/AST/ASTImporterTest.cpp index a0aaad6082d8c..791248e7a394f 100644 --- a/clang/unittests/AST/ASTImporterTest.cpp +++ b/clang/unittests/AST/ASTImporterTest.cpp @@ -10182,7 +10182,7 @@ TEST_P(ASTImporterOptionSpecificTestBase, ImportIntoReopenedNamespaceNoMatch1) { struct X { int A; }; } )"; - Decl *ToTU = getToTuDecl(ToCode, Lang_CXX11); + getToTuDecl(ToCode, Lang_CXX11); const char *Code = R"( namespace a { @@ -10205,7 +10205,7 @@ TEST_P(ASTImporterOptionSpecificTestBase, ImportIntoReopenedNamespaceNoMatch2) { namespace a { } )"; - Decl *ToTU = getToTuDecl(ToCode, Lang_CXX11); + getToTuDecl(ToCode, Lang_CXX11); const char *Code = R"( namespace a { From c97a19ca63cc396f476ae5e304b23d834aec7b8a Mon Sep 17 00:00:00 2001 From: Durgadoss R Date: Mon, 13 Jan 2025 16:17:42 +0530 Subject: [PATCH 027/102] [NVPTX] Add float to tf32 conversion intrinsics (#121507) This patch adds the missing variants of float to tf32 conversion intrinsics, with their corresponding lit tests. 
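To make the semantics concrete, here is a host-side C++ model of just the
rz variant (an illustration under stated assumptions, not code from this
patch): TF32 keeps f32's sign bit and 8 exponent bits but only 10 mantissa
bits, so for finite inputs cvt.rz.tf32.f32 amounts to zeroing the 13 low
mantissa bits of the f32 encoding. The rn/rna forms round instead of
truncating, and the .relu forms additionally clamp negative results to
zero.

  #include <cstdint>
  #include <cstdio>
  #include <cstring>

  // Round-toward-zero model: drop the 13 mantissa bits below TF32 precision.
  static uint32_t f2tf32_rz_model(float F) {
    uint32_t Bits;
    std::memcpy(&Bits, &F, sizeof(Bits));
    return Bits & ~uint32_t(0x1FFF);
  }

  int main() {
    // 1.0f + 2^-12 only sets bits below TF32 precision; rz drops them.
    std::printf("%#010x\n", f2tf32_rz_model(1.0f + 0x1p-12f)); // 0x3f800000
  }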
PTX Spec link: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cvt

Signed-off-by: Durgadoss R
---
 llvm/include/llvm/IR/IntrinsicsNVVM.td   | 10 ++++
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td  | 17 ++++++
 llvm/lib/Target/NVPTX/NVPTXIntrinsics.td |  5 --
 llvm/test/CodeGen/NVPTX/convert-sm89.ll  |  7 +++
 llvm/test/CodeGen/NVPTX/convert-sm90.ll  | 68 ++++++++++++++++++++++++
 5 files changed, 102 insertions(+), 5 deletions(-)
 create mode 100644 llvm/test/CodeGen/NVPTX/convert-sm90.ll

diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index ae04a130bc825..00a76018d8415 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -1438,6 +1438,16 @@ let TargetPrefix = "nvvm" in {
   def int_nvvm_f2tf32_rna : ClangBuiltin<"__nvvm_f2tf32_rna">,
       Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrNoCallback]>;
+  def int_nvvm_f2tf32_rna_satfinite : ClangBuiltin<"__nvvm_f2tf32_rna_satfinite">,
+      Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrNoCallback]>;
+  def int_nvvm_f2tf32_rn : ClangBuiltin<"__nvvm_f2tf32_rn">,
+      Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrNoCallback]>;
+  def int_nvvm_f2tf32_rn_relu : ClangBuiltin<"__nvvm_f2tf32_rn_relu">,
+      Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrNoCallback]>;
+  def int_nvvm_f2tf32_rz : ClangBuiltin<"__nvvm_f2tf32_rz">,
+      Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrNoCallback]>;
+  def int_nvvm_f2tf32_rz_relu : ClangBuiltin<"__nvvm_f2tf32_rz_relu">,
+      Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrNoCallback]>;
 
   def int_nvvm_ff_to_e4m3x2_rn : ClangBuiltin<"__nvvm_ff_to_e4m3x2_rn">,
       Intrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrNoCallback]>;
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index c3e72d6ce3a3f..6a95d9ebef6c7 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -725,6 +725,23 @@ let hasSideEffects = false in {
 
   def CVT_f16x2_e4m3x2 : CVT_f16x2_fp8<"e4m3">;
   def CVT_f16x2_e5m2x2 : CVT_f16x2_fp8<"e5m2">;
+
+  // Float to TF32 conversions
+  multiclass CVT_TO_TF32<string Modifier,
+                         list<Predicate> Preds = [hasPTX<78>, hasSM<90>]> {
+    defvar Intr = !cast<Intrinsic>("int_nvvm_f2tf32_" # !subst(".", "_", Modifier));
+
+    def NAME : NVPTXInst<(outs Int32Regs:$dst), (ins Float32Regs:$src),
+                         "cvt." # Modifier # ".tf32.f32 \t$dst, $src;",
+                         [(set i32:$dst, (Intr f32:$src))]>,
+               Requires<Preds>;
+  }
+
+  defm CVT_to_tf32_rn : CVT_TO_TF32<"rn">;
+  defm CVT_to_tf32_rz : CVT_TO_TF32<"rz">;
+  defm CVT_to_tf32_rn_relu : CVT_TO_TF32<"rn.relu">;
+  defm CVT_to_tf32_rz_relu : CVT_TO_TF32<"rz.relu">;
+  defm CVT_to_tf32_rna : CVT_TO_TF32<"rna", [hasPTX<70>, hasSM<80>]>;
+  defm CVT_to_tf32_rna_satf : CVT_TO_TF32<"rna.satfinite", [hasPTX<81>, hasSM<89>]>;
 }
 
 def fpround_oneuse : PatFrag<(ops node:$a), (fpround node:$a), [{
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 22339ebc5484f..4f144cc641080 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -1722,11 +1722,6 @@ def : Pat<(int_nvvm_f2bf16_rz f32:$a),
 def : Pat<(int_nvvm_f2bf16_rz_relu f32:$a),
           (CVT_bf16_f32 $a, CvtRZ_RELU)>;
 
-def CVT_tf32_f32 :
-   NVPTXInst<(outs Int32Regs:$dest), (ins Float32Regs:$a),
-             "cvt.rna.tf32.f32 \t$dest, $a;",
-             [(set i32:$dest, (int_nvvm_f2tf32_rna f32:$a))]>;
-
 def INT_NVVM_LOHI_I2D : F_MATH_2<"mov.b64 \t$dst, {{$src0, $src1}};",
   Float64Regs, Int32Regs, Int32Regs, int_nvvm_lohi_i2d>;
 
diff --git a/llvm/test/CodeGen/NVPTX/convert-sm89.ll b/llvm/test/CodeGen/NVPTX/convert-sm89.ll
index 5d0576aebbe08..30fd76f5a31c2 100644
--- a/llvm/test/CodeGen/NVPTX/convert-sm89.ll
+++ b/llvm/test/CodeGen/NVPTX/convert-sm89.ll
@@ -84,3 +84,10 @@ define <2 x half> @cvt_rn_relu_f16x2_e5m2x2(i16 %in) {
   %val = call <2 x half> @llvm.nvvm.e5m2x2.to.f16x2.rn.relu(i16 %in);
   ret <2 x half> %val
 }
+
+; CHECK-LABEL: cvt_rna_satfinite_tf32_f32
+define i32 @cvt_rna_satfinite_tf32_f32(float %f1) {
+; CHECK: cvt.rna.satfinite.tf32.f32
+  %val = call i32 @llvm.nvvm.f2tf32.rna.satfinite(float %f1)
+  ret i32 %val
+}
diff --git a/llvm/test/CodeGen/NVPTX/convert-sm90.ll b/llvm/test/CodeGen/NVPTX/convert-sm90.ll
new file mode 100644
index 0000000000000..5f610e0e91f88
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/convert-sm90.ll
@@ -0,0 +1,68 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78| FileCheck --check-prefixes=CHECK %s
+; RUN: %if ptxas-12.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78| %ptxas-verify -arch=sm_90 %}
+
+declare i32 @llvm.nvvm.f2tf32.rn(float %f1)
+declare i32 @llvm.nvvm.f2tf32.rn.relu(float %f1)
+declare i32 @llvm.nvvm.f2tf32.rz(float %f1)
+declare i32 @llvm.nvvm.f2tf32.rz.relu(float %f1)
+
+define i32 @cvt_rn_tf32_f32(float %f1) {
+; CHECK-LABEL: cvt_rn_tf32_f32(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-NEXT:    .reg .f32 %f<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f32 %f1, [cvt_rn_tf32_f32_param_0];
+; CHECK-NEXT:    cvt.rn.tf32.f32 %r1, %f1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ret;
+  %val = call i32 @llvm.nvvm.f2tf32.rn(float %f1)
+  ret i32 %val
+}
+
+define i32 @cvt_rn_relu_tf32_f32(float %f1) {
+; CHECK-LABEL: cvt_rn_relu_tf32_f32(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-NEXT:    .reg .f32 %f<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f32 %f1, [cvt_rn_relu_tf32_f32_param_0];
+; CHECK-NEXT:    cvt.rn.relu.tf32.f32 %r1, %f1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ret;
+  %val = call i32 @llvm.nvvm.f2tf32.rn.relu(float %f1)
+  ret i32 %val
+}
+
+define i32 @cvt_rz_tf32_f32(float %f1) {
+; CHECK-LABEL: cvt_rz_tf32_f32(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-NEXT:    .reg .f32 %f<2>;
+;
CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.f32 %f1, [cvt_rz_tf32_f32_param_0]; +; CHECK-NEXT: cvt.rz.tf32.f32 %r1, %f1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; + %val = call i32 @llvm.nvvm.f2tf32.rz(float %f1) + ret i32 %val +} + +define i32 @cvt_rz_relu_tf32_f32(float %f1) { +; CHECK-LABEL: cvt_rz_relu_tf32_f32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .f32 %f<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.f32 %f1, [cvt_rz_relu_tf32_f32_param_0]; +; CHECK-NEXT: cvt.rz.relu.tf32.f32 %r1, %f1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; + %val = call i32 @llvm.nvvm.f2tf32.rz.relu(float %f1) + ret i32 %val +} From adc92c80a72bf4af85fa3c4adf7ee887039a16a5 Mon Sep 17 00:00:00 2001 From: vfdev Date: Mon, 13 Jan 2025 12:00:31 +0100 Subject: [PATCH 028/102] Enabled freethreading support in MLIR python bindings (#122684) Reland reverted https://github.com/llvm/llvm-project/pull/107103 with the fixes for Python 3.8 cc @jpienaar Co-authored-by: Peter Hawkins --- mlir/cmake/modules/AddMLIRPython.cmake | 21 +- mlir/docs/Bindings/Python.md | 40 ++ .../python/StandaloneExtensionPybind11.cpp | 4 +- mlir/lib/Bindings/Python/Globals.h | 12 +- mlir/lib/Bindings/Python/IRCore.cpp | 31 +- mlir/lib/Bindings/Python/IRModule.cpp | 18 +- mlir/lib/Bindings/Python/IRModule.h | 1 + mlir/lib/Bindings/Python/MainModule.cpp | 9 +- mlir/python/requirements.txt | 3 +- mlir/test/python/multithreaded_tests.py | 518 ++++++++++++++++++ 10 files changed, 640 insertions(+), 17 deletions(-) create mode 100644 mlir/test/python/multithreaded_tests.py diff --git a/mlir/cmake/modules/AddMLIRPython.cmake b/mlir/cmake/modules/AddMLIRPython.cmake index 717a503468a85..0679db9cf93e1 100644 --- a/mlir/cmake/modules/AddMLIRPython.cmake +++ b/mlir/cmake/modules/AddMLIRPython.cmake @@ -668,12 +668,31 @@ function(add_mlir_python_extension libname extname) elseif(ARG_PYTHON_BINDINGS_LIBRARY STREQUAL "nanobind") nanobind_add_module(${libname} NB_DOMAIN mlir + FREE_THREADED ${ARG_SOURCES} ) if (LLVM_COMPILER_IS_GCC_COMPATIBLE OR CLANG_CL) # Avoids warnings from upstream nanobind. - target_compile_options(nanobind-static + set(nanobind_target "nanobind-static") + if (NOT TARGET ${nanobind_target}) + # Get correct nanobind target name: nanobind-static-ft or something else + # It is set by nanobind_add_module function according to the passed options + get_property(all_targets DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY BUILDSYSTEM_TARGETS) + + # Iterate over the list of targets + foreach(target ${all_targets}) + # Check if the target name matches the given string + if("${target}" MATCHES "nanobind-") + set(nanobind_target "${target}") + endif() + endforeach() + + if (NOT TARGET ${nanobind_target}) + message(FATAL_ERROR "Could not find nanobind target to set compile options to") + endif() + endif() + target_compile_options(${nanobind_target} PRIVATE -Wno-cast-qual -Wno-zero-length-array diff --git a/mlir/docs/Bindings/Python.md b/mlir/docs/Bindings/Python.md index 32df3310d811d..b8bd0f507a510 100644 --- a/mlir/docs/Bindings/Python.md +++ b/mlir/docs/Bindings/Python.md @@ -1187,3 +1187,43 @@ or nanobind and utilities to connect to the rest of Python API. The bindings can be located in a separate module or in the same module as attributes and types, and loaded along with the dialect. 
+ +## Free-threading (No-GIL) support + +Free-threading or no-GIL support refers to CPython interpreter (>=3.13) with Global Interpreter Lock made optional. For details on the topic, please check [PEP-703](https://peps.python.org/pep-0703/) and this [Python free-threading guide](https://py-free-threading.github.io/). + +MLIR Python bindings are free-threading compatible with exceptions (discussed below) in the following sense: it is safe to work in multiple threads with **independent** contexts. Below we show an example code of safe usage: + +```python +# python3.13t example.py +import concurrent.futures + +import mlir.dialects.arith as arith +from mlir.ir import Context, Location, Module, IntegerType, InsertionPoint + + +def func(py_value): + with Context() as ctx: + module = Module.create(loc=Location.file("foo.txt", 0, 0)) + + dtype = IntegerType.get_signless(64) + with InsertionPoint(module.body), Location.name("a"): + arith.constant(dtype, py_value) + + return module + + +num_workers = 8 +with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor: + futures = [] + for i in range(num_workers): + futures.append(executor.submit(func, i)) + assert len(list(f.result() for f in futures)) == num_workers +``` + +The exceptions to the free-threading compatibility: +- IR printing is unsafe, e.g. when using `PassManager` with `PassManager.enable_ir_printing()` which calls thread-unsafe `llvm::raw_ostream`. +- Usage of `Location.emit_error` is unsafe (due to thread-unsafe `llvm::raw_ostream`). +- Usage of `Module.dump` is unsafe (due to thread-unsafe `llvm::raw_ostream`). +- Usage of `mlir.dialects.transform.interpreter` is unsafe. +- Usage of `mlir.dialects.gpu` and `gpu-module-to-binary` is unsafe. \ No newline at end of file diff --git a/mlir/examples/standalone/python/StandaloneExtensionPybind11.cpp b/mlir/examples/standalone/python/StandaloneExtensionPybind11.cpp index 397db4c20e743..dd3c4c2945cca 100644 --- a/mlir/examples/standalone/python/StandaloneExtensionPybind11.cpp +++ b/mlir/examples/standalone/python/StandaloneExtensionPybind11.cpp @@ -12,9 +12,11 @@ #include "Standalone-c/Dialects.h" #include "mlir/Bindings/Python/PybindAdaptors.h" +namespace py = pybind11; + using namespace mlir::python::adaptors; -PYBIND11_MODULE(_standaloneDialectsPybind11, m) { +PYBIND11_MODULE(_standaloneDialectsPybind11, m, py::mod_gil_not_used()) { //===--------------------------------------------------------------------===// // standalone dialect //===--------------------------------------------------------------------===// diff --git a/mlir/lib/Bindings/Python/Globals.h b/mlir/lib/Bindings/Python/Globals.h index 0ec522d14f74b..826a34a535176 100644 --- a/mlir/lib/Bindings/Python/Globals.h +++ b/mlir/lib/Bindings/Python/Globals.h @@ -24,6 +24,7 @@ namespace mlir { namespace python { /// Globals that are always accessible once the extension has been initialized. +/// Methods of this class are thread-safe. class PyGlobals { public: PyGlobals(); @@ -37,12 +38,18 @@ class PyGlobals { /// Get and set the list of parent modules to search for dialect /// implementation classes. 
- std::vector &getDialectSearchPrefixes() { + std::vector getDialectSearchPrefixes() { + nanobind::ft_lock_guard lock(mutex); return dialectSearchPrefixes; } void setDialectSearchPrefixes(std::vector newValues) { + nanobind::ft_lock_guard lock(mutex); dialectSearchPrefixes.swap(newValues); } + void addDialectSearchPrefix(std::string value) { + nanobind::ft_lock_guard lock(mutex); + dialectSearchPrefixes.push_back(std::move(value)); + } /// Loads a python module corresponding to the given dialect namespace. /// No-ops if the module has already been loaded or is not found. Raises @@ -109,6 +116,9 @@ class PyGlobals { private: static PyGlobals *instance; + + nanobind::ft_mutex mutex; + /// Module name prefixes to search under for dialect implementation modules. std::vector dialectSearchPrefixes; /// Map of dialect namespace to external dialect class object. diff --git a/mlir/lib/Bindings/Python/IRCore.cpp b/mlir/lib/Bindings/Python/IRCore.cpp index 453d4f7c7e8bc..463ebdebb3f3f 100644 --- a/mlir/lib/Bindings/Python/IRCore.cpp +++ b/mlir/lib/Bindings/Python/IRCore.cpp @@ -243,9 +243,15 @@ static MlirBlock createBlock(const nb::sequence &pyArgTypes, /// Wrapper for the global LLVM debugging flag. struct PyGlobalDebugFlag { - static void set(nb::object &o, bool enable) { mlirEnableGlobalDebug(enable); } + static void set(nb::object &o, bool enable) { + nb::ft_lock_guard lock(mutex); + mlirEnableGlobalDebug(enable); + } - static bool get(const nb::object &) { return mlirIsGlobalDebugEnabled(); } + static bool get(const nb::object &) { + nb::ft_lock_guard lock(mutex); + return mlirIsGlobalDebugEnabled(); + } static void bind(nb::module_ &m) { // Debug flags. @@ -255,6 +261,7 @@ struct PyGlobalDebugFlag { .def_static( "set_types", [](const std::string &type) { + nb::ft_lock_guard lock(mutex); mlirSetGlobalDebugType(type.c_str()); }, "types"_a, "Sets specific debug types to be produced by LLVM") @@ -263,11 +270,17 @@ struct PyGlobalDebugFlag { pointers.reserve(types.size()); for (const std::string &str : types) pointers.push_back(str.c_str()); + nb::ft_lock_guard lock(mutex); mlirSetGlobalDebugTypes(pointers.data(), pointers.size()); }); } + +private: + static nb::ft_mutex mutex; }; +nb::ft_mutex PyGlobalDebugFlag::mutex; + struct PyAttrBuilderMap { static bool dunderContains(const std::string &attributeKind) { return PyGlobals::get().lookupAttributeBuilder(attributeKind).has_value(); @@ -606,6 +619,7 @@ class PyOpOperandIterator { PyMlirContext::PyMlirContext(MlirContext context) : context(context) { nb::gil_scoped_acquire acquire; + nb::ft_lock_guard lock(live_contexts_mutex); auto &liveContexts = getLiveContexts(); liveContexts[context.ptr] = this; } @@ -615,7 +629,10 @@ PyMlirContext::~PyMlirContext() { // forContext method, which always puts the associated handle into // liveContexts. 
nb::gil_scoped_acquire acquire; - getLiveContexts().erase(context.ptr); + { + nb::ft_lock_guard lock(live_contexts_mutex); + getLiveContexts().erase(context.ptr); + } mlirContextDestroy(context); } @@ -632,6 +649,7 @@ nb::object PyMlirContext::createFromCapsule(nb::object capsule) { PyMlirContextRef PyMlirContext::forContext(MlirContext context) { nb::gil_scoped_acquire acquire; + nb::ft_lock_guard lock(live_contexts_mutex); auto &liveContexts = getLiveContexts(); auto it = liveContexts.find(context.ptr); if (it == liveContexts.end()) { @@ -647,12 +665,17 @@ PyMlirContextRef PyMlirContext::forContext(MlirContext context) { return PyMlirContextRef(it->second, std::move(pyRef)); } +nb::ft_mutex PyMlirContext::live_contexts_mutex; + PyMlirContext::LiveContextMap &PyMlirContext::getLiveContexts() { static LiveContextMap liveContexts; return liveContexts; } -size_t PyMlirContext::getLiveCount() { return getLiveContexts().size(); } +size_t PyMlirContext::getLiveCount() { + nb::ft_lock_guard lock(live_contexts_mutex); + return getLiveContexts().size(); +} size_t PyMlirContext::getLiveOperationCount() { return liveOperations.size(); } diff --git a/mlir/lib/Bindings/Python/IRModule.cpp b/mlir/lib/Bindings/Python/IRModule.cpp index f7bf77e5a7e04..e600f1bbd4493 100644 --- a/mlir/lib/Bindings/Python/IRModule.cpp +++ b/mlir/lib/Bindings/Python/IRModule.cpp @@ -38,8 +38,11 @@ PyGlobals::PyGlobals() { PyGlobals::~PyGlobals() { instance = nullptr; } bool PyGlobals::loadDialectModule(llvm::StringRef dialectNamespace) { - if (loadedDialectModules.contains(dialectNamespace)) - return true; + { + nb::ft_lock_guard lock(mutex); + if (loadedDialectModules.contains(dialectNamespace)) + return true; + } // Since re-entrancy is possible, make a copy of the search prefixes. std::vector localSearchPrefixes = dialectSearchPrefixes; nb::object loaded = nb::none(); @@ -62,12 +65,14 @@ bool PyGlobals::loadDialectModule(llvm::StringRef dialectNamespace) { return false; // Note: Iterator cannot be shared from prior to loading, since re-entrancy // may have occurred, which may do anything. 
+ nb::ft_lock_guard lock(mutex); loadedDialectModules.insert(dialectNamespace); return true; } void PyGlobals::registerAttributeBuilder(const std::string &attributeKind, nb::callable pyFunc, bool replace) { + nb::ft_lock_guard lock(mutex); nb::object &found = attributeBuilderMap[attributeKind]; if (found && !replace) { throw std::runtime_error((llvm::Twine("Attribute builder for '") + @@ -81,6 +86,7 @@ void PyGlobals::registerAttributeBuilder(const std::string &attributeKind, void PyGlobals::registerTypeCaster(MlirTypeID mlirTypeID, nb::callable typeCaster, bool replace) { + nb::ft_lock_guard lock(mutex); nb::object &found = typeCasterMap[mlirTypeID]; if (found && !replace) throw std::runtime_error("Type caster is already registered with caster: " + @@ -90,6 +96,7 @@ void PyGlobals::registerTypeCaster(MlirTypeID mlirTypeID, void PyGlobals::registerValueCaster(MlirTypeID mlirTypeID, nb::callable valueCaster, bool replace) { + nb::ft_lock_guard lock(mutex); nb::object &found = valueCasterMap[mlirTypeID]; if (found && !replace) throw std::runtime_error("Value caster is already registered: " + @@ -99,6 +106,7 @@ void PyGlobals::registerValueCaster(MlirTypeID mlirTypeID, void PyGlobals::registerDialectImpl(const std::string &dialectNamespace, nb::object pyClass) { + nb::ft_lock_guard lock(mutex); nb::object &found = dialectClassMap[dialectNamespace]; if (found) { throw std::runtime_error((llvm::Twine("Dialect namespace '") + @@ -110,6 +118,7 @@ void PyGlobals::registerDialectImpl(const std::string &dialectNamespace, void PyGlobals::registerOperationImpl(const std::string &operationName, nb::object pyClass, bool replace) { + nb::ft_lock_guard lock(mutex); nb::object &found = operationClassMap[operationName]; if (found && !replace) { throw std::runtime_error((llvm::Twine("Operation '") + operationName + @@ -121,6 +130,7 @@ void PyGlobals::registerOperationImpl(const std::string &operationName, std::optional PyGlobals::lookupAttributeBuilder(const std::string &attributeKind) { + nb::ft_lock_guard lock(mutex); const auto foundIt = attributeBuilderMap.find(attributeKind); if (foundIt != attributeBuilderMap.end()) { assert(foundIt->second && "attribute builder is defined"); @@ -133,6 +143,7 @@ std::optional PyGlobals::lookupTypeCaster(MlirTypeID mlirTypeID, MlirDialect dialect) { // Try to load dialect module. (void)loadDialectModule(unwrap(mlirDialectGetNamespace(dialect))); + nb::ft_lock_guard lock(mutex); const auto foundIt = typeCasterMap.find(mlirTypeID); if (foundIt != typeCasterMap.end()) { assert(foundIt->second && "type caster is defined"); @@ -145,6 +156,7 @@ std::optional PyGlobals::lookupValueCaster(MlirTypeID mlirTypeID, MlirDialect dialect) { // Try to load dialect module. (void)loadDialectModule(unwrap(mlirDialectGetNamespace(dialect))); + nb::ft_lock_guard lock(mutex); const auto foundIt = valueCasterMap.find(mlirTypeID); if (foundIt != valueCasterMap.end()) { assert(foundIt->second && "value caster is defined"); @@ -158,6 +170,7 @@ PyGlobals::lookupDialectClass(const std::string &dialectNamespace) { // Make sure dialect module is loaded. 
if (!loadDialectModule(dialectNamespace)) return std::nullopt; + nb::ft_lock_guard lock(mutex); const auto foundIt = dialectClassMap.find(dialectNamespace); if (foundIt != dialectClassMap.end()) { assert(foundIt->second && "dialect class is defined"); @@ -175,6 +188,7 @@ PyGlobals::lookupOperationClass(llvm::StringRef operationName) { if (!loadDialectModule(dialectNamespace)) return std::nullopt; + nb::ft_lock_guard lock(mutex); auto foundIt = operationClassMap.find(operationName); if (foundIt != operationClassMap.end()) { assert(foundIt->second && "OpView is defined"); diff --git a/mlir/lib/Bindings/Python/IRModule.h b/mlir/lib/Bindings/Python/IRModule.h index 8fb32a225e65f..f5fbb6c61b57e 100644 --- a/mlir/lib/Bindings/Python/IRModule.h +++ b/mlir/lib/Bindings/Python/IRModule.h @@ -260,6 +260,7 @@ class PyMlirContext { // Note that this holds a handle, which does not imply ownership. // Mappings will be removed when the context is destructed. using LiveContextMap = llvm::DenseMap; + static nanobind::ft_mutex live_contexts_mutex; static LiveContextMap &getLiveContexts(); // Interns all live modules associated with this context. Modules tracked diff --git a/mlir/lib/Bindings/Python/MainModule.cpp b/mlir/lib/Bindings/Python/MainModule.cpp index 7c4064262012e..6f49431006605 100644 --- a/mlir/lib/Bindings/Python/MainModule.cpp +++ b/mlir/lib/Bindings/Python/MainModule.cpp @@ -30,12 +30,8 @@ NB_MODULE(_mlir, m) { .def_prop_rw("dialect_search_modules", &PyGlobals::getDialectSearchPrefixes, &PyGlobals::setDialectSearchPrefixes) - .def( - "append_dialect_search_prefix", - [](PyGlobals &self, std::string moduleName) { - self.getDialectSearchPrefixes().push_back(std::move(moduleName)); - }, - "module_name"_a) + .def("append_dialect_search_prefix", &PyGlobals::addDialectSearchPrefix, + "module_name"_a) .def( "_check_dialect_module_loaded", [](PyGlobals &self, const std::string &dialectNamespace) { @@ -76,7 +72,6 @@ NB_MODULE(_mlir, m) { nanobind::cast(opClass.attr("OPERATION_NAME")); PyGlobals::get().registerOperationImpl(operationName, opClass, replace); - // Dict-stuff the new opClass by name onto the dialect class. nb::object opClassName = opClass.attr("__name__"); dialectClass.attr(opClassName) = opClass; diff --git a/mlir/python/requirements.txt b/mlir/python/requirements.txt index f240d6ef944ec..1a0075e829aef 100644 --- a/mlir/python/requirements.txt +++ b/mlir/python/requirements.txt @@ -2,4 +2,5 @@ nanobind>=2.4, <3.0 numpy>=1.19.5, <=2.1.2 pybind11>=2.10.0, <=2.13.6 PyYAML>=5.4.0, <=6.0.1 -ml_dtypes>=0.1.0, <=0.5.0 # provides several NumPy dtype extensions, including the bf16 +ml_dtypes>=0.1.0, <=0.6.0; python_version<"3.13" # provides several NumPy dtype extensions, including the bf16 +ml_dtypes>=0.5.0, <=0.6.0; python_version>="3.13" \ No newline at end of file diff --git a/mlir/test/python/multithreaded_tests.py b/mlir/test/python/multithreaded_tests.py new file mode 100644 index 0000000000000..6e1a668346872 --- /dev/null +++ b/mlir/test/python/multithreaded_tests.py @@ -0,0 +1,518 @@ +# RUN: %PYTHON %s +""" +This script generates multi-threaded tests to check free-threading mode using CPython compiled with TSAN. +Tests can be run using pytest: +```bash +python3.13t -mpytest -vvv multithreaded_tests.py +``` + +IMPORTANT. Running tests are not checking the correctness, but just the execution of the tests in multi-threaded context +and passing if no warnings reported by TSAN and failing otherwise. 
+ + +Details on the generated tests and execution: +1) Multi-threaded execution: all generated tests are executed independently by +a pool of threads, running each test multiple times, see @multi_threaded for details + +2) Tests generation: we use existing tests: test/python/ir/*.py, +test/python/dialects/*.py, etc to generate multi-threaded tests. +In details, we perform the following: +a) we define a list of source tests to be used to generate multi-threaded tests, see `TEST_MODULES`. +b) we define `TestAllMultiThreaded` class and add existing tests to the class. See `add_existing_tests` method. +c) for each test file, we copy and modify it: test/python/ir/affine_expr.py -> /tmp/ir/affine_expr.py. +In order to import the test file as python module, we remove all executing functions, like +`@run` or `run(testMethod)`. See `copy_and_update` and `add_existing_tests` methods for details. + + +Observed warnings reported by TSAN. + +CPython and free-threading known data-races: +1) ctypes related races: https://github.com/python/cpython/issues/127945 +2) LLVM related data-races, llvm::raw_ostream is not thread-safe +- mlir pass manager +- dialects/transform_interpreter.py +- ir/diagnostic_handler.py +- ir/module.py +3) Dialect gpu module-to-binary method is unsafe +""" +import concurrent.futures +import gc +import importlib.util +import os +import sys +import threading +import tempfile +import unittest + +from contextlib import contextmanager +from functools import partial +from pathlib import Path +from typing import Optional, List + +import mlir.dialects.arith as arith +from mlir.dialects import transform +from mlir.ir import Context, Location, Module, IntegerType, InsertionPoint + + +def import_from_path(module_name: str, file_path: Path): + spec = importlib.util.spec_from_file_location(module_name, file_path) + module = importlib.util.module_from_spec(spec) + sys.modules[module_name] = module + spec.loader.exec_module(module) + return module + + +def copy_and_update(src_filepath: Path, dst_filepath: Path): + # We should remove all calls like `run(testMethod)` + with open(src_filepath, "r") as reader, open(dst_filepath, "w") as writer: + while True: + src_line = reader.readline() + if len(src_line) == 0: + break + skip_lines = [ + "run(", + "@run", + "@constructAndPrintInModule", + "run_apply_patterns(", + "@run_apply_patterns", + "@test_in_context", + "@construct_and_print_in_module", + ] + if any(src_line.startswith(line) for line in skip_lines): + continue + writer.write(src_line) + + +# Helper run functions +def run(f): + f() + + +def run_with_context_and_location(f): + print("\nTEST:", f.__name__) + with Context(), Location.unknown(): + f() + return f + + +def run_with_insertion_point(f): + print("\nTEST:", f.__name__) + with Context() as ctx, Location.unknown(): + module = Module.create() + with InsertionPoint(module.body): + f(ctx) + print(module) + + +def run_with_insertion_point_v2(f): + print("\nTEST:", f.__name__) + with Context(), Location.unknown(): + module = Module.create() + with InsertionPoint(module.body): + f() + print(module) + return f + + +def run_with_insertion_point_v3(f): + with Context(), Location.unknown(): + module = Module.create() + with InsertionPoint(module.body): + print("\nTEST:", f.__name__) + f(module) + print(module) + return f + + +def run_with_insertion_point_v4(f): + print("\nTEST:", f.__name__) + with Context() as ctx, Location.unknown(): + ctx.allow_unregistered_dialects = True + module = Module.create() + with InsertionPoint(module.body): + f() + 
return f + + +def run_apply_patterns(f): + with Context(), Location.unknown(): + module = Module.create() + with InsertionPoint(module.body): + sequence = transform.SequenceOp( + transform.FailurePropagationMode.Propagate, + [], + transform.AnyOpType.get(), + ) + with InsertionPoint(sequence.body): + apply = transform.ApplyPatternsOp(sequence.bodyTarget) + with InsertionPoint(apply.patterns): + f() + transform.YieldOp() + print("\nTEST:", f.__name__) + print(module) + return f + + +def run_transform_tensor_ext(f): + print("\nTEST:", f.__name__) + with Context(), Location.unknown(): + module = Module.create() + with InsertionPoint(module.body): + sequence = transform.SequenceOp( + transform.FailurePropagationMode.Propagate, + [], + transform.AnyOpType.get(), + ) + with InsertionPoint(sequence.body): + f(sequence.bodyTarget) + transform.YieldOp() + print(module) + return f + + +def run_transform_structured_ext(f): + with Context(), Location.unknown(): + module = Module.create() + with InsertionPoint(module.body): + print("\nTEST:", f.__name__) + f() + module.operation.verify() + print(module) + return f + + +def run_construct_and_print_in_module(f): + print("\nTEST:", f.__name__) + with Context(), Location.unknown(): + module = Module.create() + with InsertionPoint(module.body): + module = f(module) + if module is not None: + print(module) + return f + + +TEST_MODULES = [ + ("execution_engine", run), + ("pass_manager", run), + ("dialects/affine", run_with_insertion_point_v2), + ("dialects/func", run_with_insertion_point_v2), + ("dialects/arith_dialect", run), + ("dialects/arith_llvm", run), + ("dialects/async_dialect", run), + ("dialects/builtin", run), + ("dialects/cf", run_with_insertion_point_v4), + ("dialects/complex_dialect", run), + ("dialects/func", run_with_insertion_point_v2), + ("dialects/index_dialect", run_with_insertion_point), + ("dialects/llvm", run_with_insertion_point_v2), + ("dialects/math_dialect", run), + ("dialects/memref", run), + ("dialects/ml_program", run_with_insertion_point_v2), + ("dialects/nvgpu", run_with_insertion_point_v2), + ("dialects/nvvm", run_with_insertion_point_v2), + ("dialects/ods_helpers", run), + ("dialects/openmp_ops", run_with_insertion_point_v2), + ("dialects/pdl_ops", run_with_insertion_point_v2), + # ("dialects/python_test", run), # TODO: Need to pass pybind11 or nanobind argv + ("dialects/quant", run), + ("dialects/rocdl", run_with_insertion_point_v2), + ("dialects/scf", run_with_insertion_point_v2), + ("dialects/shape", run), + ("dialects/spirv_dialect", run), + ("dialects/tensor", run), + # ("dialects/tosa", ), # Nothing to test + ("dialects/transform_bufferization_ext", run_with_insertion_point_v2), + # ("dialects/transform_extras", ), # Needs a more complicated execution schema + ("dialects/transform_gpu_ext", run_transform_tensor_ext), + ( + "dialects/transform_interpreter", + run_with_context_and_location, + ["print_", "transform_options", "failed", "include"], + ), + ( + "dialects/transform_loop_ext", + run_with_insertion_point_v2, + ["loopOutline"], + ), + ("dialects/transform_memref_ext", run_with_insertion_point_v2), + ("dialects/transform_nvgpu_ext", run_with_insertion_point_v2), + ("dialects/transform_sparse_tensor_ext", run_transform_tensor_ext), + ("dialects/transform_structured_ext", run_transform_structured_ext), + ("dialects/transform_tensor_ext", run_transform_tensor_ext), + ( + "dialects/transform_vector_ext", + run_apply_patterns, + ["configurable_patterns"], + ), + ("dialects/transform", run_with_insertion_point_v3), + 
("dialects/vector", run_with_context_and_location), + ("dialects/gpu/dialect", run_with_context_and_location), + ("dialects/gpu/module-to-binary-nvvm", run_with_context_and_location), + ("dialects/gpu/module-to-binary-rocdl", run_with_context_and_location), + ("dialects/linalg/ops", run), + # TO ADD: No proper tests in this dialects/linalg/opsdsl/* + # ("dialects/linalg/opsdsl/*", ...), + ("dialects/sparse_tensor/dialect", run), + ("dialects/sparse_tensor/passes", run), + ("integration/dialects/pdl", run_construct_and_print_in_module), + ("integration/dialects/transform", run_construct_and_print_in_module), + ("integration/dialects/linalg/opsrun", run), + ("ir/affine_expr", run), + ("ir/affine_map", run), + ("ir/array_attributes", run), + ("ir/attributes", run), + ("ir/blocks", run), + ("ir/builtin_types", run), + ("ir/context_managers", run), + ("ir/debug", run), + ("ir/diagnostic_handler", run), + ("ir/dialects", run), + ("ir/exception", run), + ("ir/insertion_point", run), + ("ir/integer_set", run), + ("ir/location", run), + ("ir/module", run), + ("ir/operation", run), + ("ir/symbol_table", run), + ("ir/value", run), +] + +TESTS_TO_SKIP = [ + "test_execution_engine__testNanoTime_multi_threaded", # testNanoTime can't run in multiple threads, even with GIL + "test_execution_engine__testSharedLibLoad_multi_threaded", # testSharedLibLoad can't run in multiple threads, even with GIL + "test_dialects_arith_dialect__testArithValue_multi_threaded", # RuntimeError: Value caster is already registered: .ArithValue'>, even with GIL + "test_ir_dialects__testAppendPrefixSearchPath_multi_threaded", # PyGlobals::setDialectSearchPrefixes is not thread-safe, even with GIL. Strange usage of static PyGlobals vs python exposed _cext.globals + "test_ir_value__testValueCasters_multi_threaded", # RuntimeError: Value caster is already registered: .dont_cast_int, even with GIL + # tests indirectly calling thread-unsafe llvm::raw_ostream + "test_execution_engine__testInvalidModule_multi_threaded", # mlirExecutionEngineCreate calls thread-unsafe llvm::raw_ostream + "test_pass_manager__testPrintIrAfterAll_multi_threaded", # IRPrinterInstrumentation::runAfterPass calls thread-unsafe llvm::raw_ostream + "test_pass_manager__testPrintIrBeforeAndAfterAll_multi_threaded", # IRPrinterInstrumentation::runBeforePass calls thread-unsafe llvm::raw_ostream + "test_pass_manager__testPrintIrLargeLimitElements_multi_threaded", # IRPrinterInstrumentation::runAfterPass calls thread-unsafe llvm::raw_ostream + "test_pass_manager__testPrintIrTree_multi_threaded", # IRPrinterInstrumentation::runAfterPass calls thread-unsafe llvm::raw_ostream + "test_pass_manager__testRunPipeline_multi_threaded", # PrintOpStatsPass::printSummary calls thread-unsafe llvm::raw_ostream + "test_dialects_transform_interpreter__include_multi_threaded", # mlir::transform::PrintOp::apply(mlir::transform::TransformRewriter...) calls thread-unsafe llvm::raw_ostream + "test_dialects_transform_interpreter__transform_options_multi_threaded", # mlir::transform::PrintOp::apply(mlir::transform::TransformRewriter...) calls thread-unsafe llvm::raw_ostream + "test_dialects_transform_interpreter__print_self_multi_threaded", # mlir::transform::PrintOp::apply(mlir::transform::TransformRewriter...) 
call thread-unsafe llvm::raw_ostream + "test_ir_diagnostic_handler__testDiagnosticCallbackException_multi_threaded", # mlirEmitError calls thread-unsafe llvm::raw_ostream + "test_ir_module__testParseSuccess_multi_threaded", # mlirOperationDump calls thread-unsafe llvm::raw_ostream + # False-positive TSAN detected race in llvm::RuntimeDyldELF::registerEHFrames() + # Details: https://github.com/llvm/llvm-project/pull/107103/files#r1905726947 + "test_execution_engine__testCapsule_multi_threaded", + "test_execution_engine__testDumpToObjectFile_multi_threaded", +] + +TESTS_TO_XFAIL = [ + # execution_engine tests: + # - ctypes related data-races: https://github.com/python/cpython/issues/127945 + "test_execution_engine__testBF16Memref_multi_threaded", + "test_execution_engine__testBasicCallback_multi_threaded", + "test_execution_engine__testComplexMemrefAdd_multi_threaded", + "test_execution_engine__testComplexUnrankedMemrefAdd_multi_threaded", + "test_execution_engine__testDynamicMemrefAdd2D_multi_threaded", + "test_execution_engine__testF16MemrefAdd_multi_threaded", + "test_execution_engine__testF8E5M2Memref_multi_threaded", + "test_execution_engine__testInvokeFloatAdd_multi_threaded", + "test_execution_engine__testInvokeVoid_multi_threaded", # a ctypes race + "test_execution_engine__testMemrefAdd_multi_threaded", + "test_execution_engine__testRankedMemRefCallback_multi_threaded", + "test_execution_engine__testRankedMemRefWithOffsetCallback_multi_threaded", + "test_execution_engine__testUnrankedMemRefCallback_multi_threaded", + "test_execution_engine__testUnrankedMemRefWithOffsetCallback_multi_threaded", + # dialects tests + "test_dialects_memref__testSubViewOpInferReturnTypeExtensiveSlicing_multi_threaded", # Related to ctypes data races + "test_dialects_transform_interpreter__print_other_multi_threaded", # Fatal Python error: Aborted or mlir::transform::PrintOp::apply(mlir::transform::TransformRewriter...) 
is not thread-safe + "test_dialects_gpu_module-to-binary-rocdl__testGPUToASMBin_multi_threaded", # Due to global llvm-project/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp::GCNTrackers variable mutation + "test_dialects_gpu_module-to-binary-nvvm__testGPUToASMBin_multi_threaded", + "test_dialects_gpu_module-to-binary-nvvm__testGPUToLLVMBin_multi_threaded", + "test_dialects_gpu_module-to-binary-rocdl__testGPUToLLVMBin_multi_threaded", + # integration tests + "test_integration_dialects_linalg_opsrun__test_elemwise_builtin_multi_threaded", # Related to ctypes data races + "test_integration_dialects_linalg_opsrun__test_elemwise_generic_multi_threaded", # Related to ctypes data races + "test_integration_dialects_linalg_opsrun__test_fill_builtin_multi_threaded", # ctypes + "test_integration_dialects_linalg_opsrun__test_fill_generic_multi_threaded", # ctypes + "test_integration_dialects_linalg_opsrun__test_fill_rng_builtin_multi_threaded", # ctypes + "test_integration_dialects_linalg_opsrun__test_fill_rng_generic_multi_threaded", # ctypes + "test_integration_dialects_linalg_opsrun__test_max_pooling_builtin_multi_threaded", # ctypes + "test_integration_dialects_linalg_opsrun__test_max_pooling_generic_multi_threaded", # ctypes + "test_integration_dialects_linalg_opsrun__test_min_pooling_builtin_multi_threaded", # ctypes + "test_integration_dialects_linalg_opsrun__test_min_pooling_generic_multi_threaded", # ctypes +] + + +def add_existing_tests(test_modules, test_prefix: str = "_original_test"): + def decorator(test_cls): + this_folder = Path(__file__).parent.absolute() + test_cls.output_folder = tempfile.TemporaryDirectory() + output_folder = Path(test_cls.output_folder.name) + + for test_mod_info in test_modules: + assert isinstance(test_mod_info, tuple) and len(test_mod_info) in (2, 3) + if len(test_mod_info) == 2: + test_module_name, exec_fn = test_mod_info + test_pattern = None + else: + test_module_name, exec_fn, test_pattern = test_mod_info + + src_filepath = this_folder / f"{test_module_name}.py" + dst_filepath = (output_folder / f"{test_module_name}.py").absolute() + if not dst_filepath.parent.exists(): + dst_filepath.parent.mkdir(parents=True) + copy_and_update(src_filepath, dst_filepath) + test_mod = import_from_path(test_module_name, dst_filepath) + for attr_name in dir(test_mod): + is_test_fn = test_pattern is None and attr_name.startswith("test") + is_test_fn |= test_pattern is not None and any( + [p in attr_name for p in test_pattern] + ) + if is_test_fn: + obj = getattr(test_mod, attr_name) + if callable(obj): + test_name = f"{test_prefix}_{test_module_name.replace('/', '_')}__{attr_name}" + + def wrapped_test_fn( + self, *args, __test_fn__=obj, __exec_fn__=exec_fn, **kwargs + ): + __exec_fn__(__test_fn__) + + setattr(test_cls, test_name, wrapped_test_fn) + return test_cls + + return decorator + + +@contextmanager +def _capture_output(fp): + # Inspired by jax test_utils.py capture_stderr method + # ``None`` means nothing has been captured yet. + captured = None + + def get_output() -> str: + if captured is None: + raise ValueError("get_output() called while the context is active.") + return captured + + with tempfile.NamedTemporaryFile(mode="w+", encoding="utf-8") as f: + original_fd = os.dup(fp.fileno()) + os.dup2(f.fileno(), fp.fileno()) + try: + yield get_output + finally: + # Python also has its own buffers, make sure everything is flushed.
+ fp.flush() + os.fsync(fp.fileno()) + f.seek(0) + captured = f.read() + os.dup2(original_fd, fp.fileno()) + + +capture_stdout = partial(_capture_output, sys.stdout) +capture_stderr = partial(_capture_output, sys.stderr) + + +def multi_threaded( + num_workers: int, + num_runs: int = 5, + skip_tests: Optional[List[str]] = None, + xfail_tests: Optional[List[str]] = None, + test_prefix: str = "_original_test", + multithreaded_test_postfix: str = "_multi_threaded", +): + """Decorator that runs a test in a multi-threaded environment.""" + + def decorator(test_cls): + for name, test_fn in test_cls.__dict__.copy().items(): + if not (name.startswith(test_prefix) and callable(test_fn)): + continue + + name = f"test{name[len(test_prefix):]}" + if skip_tests is not None: + if any( + test_name.replace(multithreaded_test_postfix, "") in name + for test_name in skip_tests + ): + continue + + def multi_threaded_test_fn(self, *args, __test_fn__=test_fn, **kwargs): + with capture_stdout(), capture_stderr() as get_output: + barrier = threading.Barrier(num_workers) + + def closure(): + barrier.wait() + for _ in range(num_runs): + __test_fn__(self, *args, **kwargs) + + with concurrent.futures.ThreadPoolExecutor( + max_workers=num_workers + ) as executor: + futures = [] + for _ in range(num_workers): + futures.append(executor.submit(closure)) + # We should call future.result() to re-raise an exception if test has + # failed + assert len(list(f.result() for f in futures)) == num_workers + + gc.collect() + assert Context._get_live_count() == 0 + + captured = get_output() + if len(captured) > 0 and "ThreadSanitizer" in captured: + raise RuntimeError( + f"ThreadSanitizer reported warnings:\n{captured}" + ) + + test_new_name = f"{name}{multithreaded_test_postfix}" + if xfail_tests is not None and test_new_name in xfail_tests: + multi_threaded_test_fn = unittest.expectedFailure( + multi_threaded_test_fn + ) + + setattr(test_cls, test_new_name, multi_threaded_test_fn) + + return test_cls + + return decorator + + +@multi_threaded( + num_workers=10, + num_runs=20, + skip_tests=TESTS_TO_SKIP, + xfail_tests=TESTS_TO_XFAIL, +) +@add_existing_tests(test_modules=TEST_MODULES, test_prefix="_original_test") +class TestAllMultiThreaded(unittest.TestCase): + @classmethod + def tearDownClass(cls): + if hasattr(cls, "output_folder"): + cls.output_folder.cleanup() + + def _original_test_create_context(self): + with Context() as ctx: + print(ctx._get_live_count()) + print(ctx._get_live_module_count()) + print(ctx._get_live_operation_count()) + print(ctx._get_live_operation_objects()) + print(ctx._get_context_again() is ctx) + print(ctx._clear_live_operations()) + + def _original_test_create_module_with_consts(self): + py_values = [123, 234, 345] + with Context() as ctx: + module = Module.create(loc=Location.file("foo.txt", 0, 0)) + + dtype = IntegerType.get_signless(64) + with InsertionPoint(module.body), Location.name("a"): + arith.constant(dtype, py_values[0]) + + with InsertionPoint(module.body), Location.name("b"): + arith.constant(dtype, py_values[1]) + + with InsertionPoint(module.body), Location.name("c"): + arith.constant(dtype, py_values[2]) + + +if __name__ == "__main__": + # Do not run the tests on CPython with GIL + if hasattr(sys, "_is_gil_enabled") and not sys._is_gil_enabled(): + unittest.main() From a5cd85afb1365e592d08202bb8b026362cf06668 Mon Sep 17 00:00:00 2001 From: Haojian Wu Date: Mon, 13 Jan 2025 12:03:06 +0100 Subject: [PATCH 029/102] Fix an unused-variable warning in release build. 
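Under NDEBUG, `assert` expands to nothing, so a variable whose only use is inside the assertion becomes unused and -Wunused-variable fires in release builds; `[[maybe_unused]]` suppresses the warning without changing debug-build behavior. A minimal standalone sketch of the pattern (illustrative example, not the TableGen code itself):

    #include <cassert>
    #include <vector>

    int main() {
      std::vector<int> Stack{1, 2, 3};
      // Read only by the assert below; with -DNDEBUG the assert is compiled
      // away and, without the attribute, the compiler warns that Top is unused.
      [[maybe_unused]] int Top = Stack.back();
      assert(Top == 3 && "unexpected top of stack");
      return 0;
    }

Casting the variable to void would have the same effect, but the attribute states the intent directly.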
--- llvm/lib/TableGen/TGLexer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/TableGen/TGLexer.cpp b/llvm/lib/TableGen/TGLexer.cpp index c423023077cd8..983242ade0fe5 100644 --- a/llvm/lib/TableGen/TGLexer.cpp +++ b/llvm/lib/TableGen/TGLexer.cpp @@ -814,7 +814,7 @@ tgtok::TokKind TGLexer::lexPreprocessor(tgtok::TokKind Kind, if (PrepIncludeStack.back().empty()) return ReturnError(TokStart, "#endif without #ifdef"); - auto &IfdefOrElseEntry = PrepIncludeStack.back().back(); + [[maybe_unused]] auto &IfdefOrElseEntry = PrepIncludeStack.back().back(); assert((IfdefOrElseEntry.Kind == tgtok::Ifdef || IfdefOrElseEntry.Kind == tgtok::Else) && From 25328a9321a20f661c0ba4687b03cfc53128fb96 Mon Sep 17 00:00:00 2001 From: Haojian Wu Date: Mon, 13 Jan 2025 12:04:21 +0100 Subject: [PATCH 030/102] Remove an extra trailing `` in Modules.rst, NFC --- clang/docs/Modules.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/docs/Modules.rst b/clang/docs/Modules.rst index 06294e3c58a4f..69a45b7fd9ace 100644 --- a/clang/docs/Modules.rst +++ b/clang/docs/Modules.rst @@ -152,7 +152,7 @@ first include path that would refer to the current file. ``#include_next`` is interpreted as if the current file had been found in that path. If this search finds a file named by a module map, the ``#include_next`` directive is translated into an import, just like for a ``#include`` -directive.`` +directive. Module maps ----------- From d04031ab54776d49603c296a2dcac1475ee138e9 Mon Sep 17 00:00:00 2001 From: quic_hchandel <165007698+hchandel@users.noreply.github.com> Date: Mon, 13 Jan 2025 16:36:05 +0530 Subject: [PATCH 031/102] [RISCV] Add Qualcomm uC Xqciint (Interrupts) extension (#122256) This extension adds eleven instructions to accelerate interrupt servicing. The current spec can be found at: https://github.com/quic/riscv-unified-db/releases/latest This patch adds assembler-only support.
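For reference, a rough sketch of how the new mnemonics are written (operands are illustrative and the comments are informal glosses of the mnemonics; see the added MC tests for the exact syntax and encodings):

    qc.c.ei            # enable interrupts
    qc.c.di            # disable interrupts
    qc.setinti 10      # set interrupt 10 pending
    qc.clrinti 10      # clear interrupt 10
    qc.c.mienter       # save state on machine-mode interrupt entry
    qc.c.mileaveret    # restore state and return from the handler

qc.setinti and qc.clrinti take a 10-bit immediate in [0, 1023]; the compressed forms take a single GPR or no operand at all.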
--------- Co-authored-by: Harsh Chandel --- .../Driver/print-supported-extensions-riscv.c | 1 + llvm/docs/RISCVUsage.rst | 3 + llvm/docs/ReleaseNotes.md | 2 + .../Target/RISCV/AsmParser/RISCVAsmParser.cpp | 3 + .../RISCV/Disassembler/RISCVDisassembler.cpp | 4 + .../Target/RISCV/MCTargetDesc/RISCVBaseInfo.h | 1 + llvm/lib/Target/RISCV/RISCVFeatures.td | 8 ++ llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 1 + llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td | 64 +++++++++++ llvm/lib/TargetParser/RISCVISAInfo.cpp | 4 +- llvm/test/CodeGen/RISCV/attributes.ll | 2 + llvm/test/MC/RISCV/xqciint-invalid.s | 105 ++++++++++++++++++ llvm/test/MC/RISCV/xqciint-valid.s | 81 ++++++++++++++ .../TargetParser/RISCVISAInfoTest.cpp | 3 +- 14 files changed, 279 insertions(+), 3 deletions(-) create mode 100644 llvm/test/MC/RISCV/xqciint-invalid.s create mode 100644 llvm/test/MC/RISCV/xqciint-valid.s diff --git a/clang/test/Driver/print-supported-extensions-riscv.c b/clang/test/Driver/print-supported-extensions-riscv.c index a8d9fcd8569cf..b28e0a07dad24 100644 --- a/clang/test/Driver/print-supported-extensions-riscv.c +++ b/clang/test/Driver/print-supported-extensions-riscv.c @@ -196,6 +196,7 @@ // CHECK-NEXT: xqcicm 0.2 'Xqcicm' (Qualcomm uC Conditional Move Extension) // CHECK-NEXT: xqcics 0.2 'Xqcics' (Qualcomm uC Conditional Select Extension) // CHECK-NEXT: xqcicsr 0.2 'Xqcicsr' (Qualcomm uC CSR Extension) +// CHECK-NEXT: xqciint 0.2 'Xqciint' (Qualcomm uC Interrupts Extension) // CHECK-NEXT: xqcilsm 0.2 'Xqcilsm' (Qualcomm uC Load Store Multiple Extension) // CHECK-NEXT: xqcisls 0.2 'Xqcisls' (Qualcomm uC Scaled Load Store Extension) // CHECK-EMPTY: diff --git a/llvm/docs/RISCVUsage.rst b/llvm/docs/RISCVUsage.rst index 0dc63f34806b4..a1df0f7d686e6 100644 --- a/llvm/docs/RISCVUsage.rst +++ b/llvm/docs/RISCVUsage.rst @@ -447,6 +447,9 @@ The current vendor extensions supported are: ``experimental-Xqcicsr`` LLVM implements `version 0.2 of the Qualcomm uC CSR extension specification `__ by Qualcomm. All instructions are prefixed with `qc.` as described in the specification. These instructions are only available for riscv32. +``experimental-Xqciint`` + LLVM implements `version 0.2 of the Qualcomm uC Interrupts extension specification `__ by Qualcomm. All instructions are prefixed with `qc.` as described in the specification. These instructions are only available for riscv32. + ``experimental-Xqcilsm`` LLVM implements `version 0.2 of the Qualcomm uC Load Store Multiple extension specification `__ by Qualcomm. All instructions are prefixed with `qc.` as described in the specification. These instructions are only available for riscv32. diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md index a3febf27ae833..d1032138a9db0 100644 --- a/llvm/docs/ReleaseNotes.md +++ b/llvm/docs/ReleaseNotes.md @@ -235,6 +235,8 @@ Changes to the RISC-V Backend extension. * Adds experimental assembler support for the Qualcomm uC 'Xqcicm` (Conditional Move) extension. +* Adds experimental assembler support for the Qualcomm uC 'Xqciint` (Interrupts) + extension. * Added ``Sdext`` and ``Sdtrig`` extensions.
Changes to the WebAssembly Backend diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index 2205c67c2d21b..8177280044bf4 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -717,6 +717,7 @@ struct RISCVOperand final : public MCParsedAsmOperand { bool isUImm6() const { return IsUImm<6>(); } bool isUImm7() const { return IsUImm<7>(); } bool isUImm8() const { return IsUImm<8>(); } + bool isUImm10() const { return IsUImm<10>(); } bool isUImm11() const { return IsUImm<11>(); } bool isUImm16() const { return IsUImm<16>(); } bool isUImm20() const { return IsUImm<20>(); } @@ -1590,6 +1591,8 @@ bool RISCVAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, return generateImmOutOfRangeError( Operands, ErrorInfo, -(1 << 9), (1 << 9) - 16, "immediate must be a multiple of 16 bytes and non-zero in the range"); + case Match_InvalidUImm10: + return generateImmOutOfRangeError(Operands, ErrorInfo, 0, (1 << 10) - 1); case Match_InvalidUImm11: return generateImmOutOfRangeError(Operands, ErrorInfo, 0, (1 << 11) - 1); case Match_InvalidSImm12: diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp index a490910154eb4..971ef90c63327 100644 --- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp +++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp @@ -700,6 +700,8 @@ DecodeStatus RISCVDisassembler::getInstruction32(MCInst &MI, uint64_t &Size, "Qualcomm uC Conditional Load Immediate custom opcode table"); TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXqcicm, DecoderTableXqcicm32, "Qualcomm uC Conditional Move custom opcode table"); + TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXqciint, DecoderTableXqciint32, + "Qualcomm uC Interrupts custom opcode table"); TRY_TO_DECODE(true, DecoderTable32, "RISCV32 table"); return MCDisassembler::Fail; @@ -732,6 +734,8 @@ DecodeStatus RISCVDisassembler::getInstruction16(MCInst &MI, uint64_t &Size, TRY_TO_DECODE_FEATURE( RISCV::FeatureVendorXqcicm, DecoderTableXqcicm16, "Qualcomm uC Conditional Move custom 16bit opcode table"); + TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXqciint, DecoderTableXqciint16, + "Qualcomm uC Interrupts custom 16bit opcode table"); TRY_TO_DECODE_AND_ADD_SP(STI.hasFeature(RISCV::FeatureVendorXwchc), DecoderTableXwchc16, "WCH QingKe XW custom opcode table"); diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h index 7048e40822342..ab04b09a7ad15 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h @@ -313,6 +313,7 @@ enum OperandType : unsigned { OPERAND_UIMM8_LSB000, OPERAND_UIMM8_GE32, OPERAND_UIMM9_LSB000, + OPERAND_UIMM10, OPERAND_UIMM10_LSB00_NONZERO, OPERAND_UIMM11, OPERAND_UIMM12, diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index 01bc5387e672e..f721d7148526b 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -1302,6 +1302,14 @@ def HasVendorXqcicm AssemblerPredicate<(all_of FeatureVendorXqcicm), "'Xqcicm' (Qualcomm uC Conditional Move Extension)">; +def FeatureVendorXqciint + : RISCVExperimentalExtension<0, 2, "Qualcomm uC Interrupts Extension", + [FeatureStdExtZca]>; +def HasVendorXqciint + : Predicate<"Subtarget->hasVendorXqciint()">, + AssemblerPredicate<(all_of FeatureVendorXqciint), + 
"'Xqciint' (Qualcomm uC Interrupts Extension)">; + //===----------------------------------------------------------------------===// // LLVM specific features and extensions //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index f24940795e433..1f7e8d87a11b0 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -2473,6 +2473,7 @@ bool RISCVInstrInfo::verifyInstruction(const MachineInstr &MI, CASE_OPERAND_UIMM(6) CASE_OPERAND_UIMM(7) CASE_OPERAND_UIMM(8) + CASE_OPERAND_UIMM(10) CASE_OPERAND_UIMM(12) CASE_OPERAND_UIMM(20) // clang-format on diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td index 6f15646852f91..ce8c0c0a3d4e5 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td @@ -28,6 +28,8 @@ def uimm5gt3 : RISCVOp, ImmLeaf; + def uimm11 : RISCVUImmLeafOp<11>; //===----------------------------------------------------------------------===// @@ -166,6 +168,36 @@ class QCIMVCCI funct3, string opcodestr, DAGOperand immType> let rs2 = imm; } +let hasSideEffects = 1, mayLoad = 0, mayStore = 0 in +class QCIRVInst16CI_RS1 funct5, string OpcodeStr> + : RVInst16CI<0b000, 0b10, (outs), (ins GPRNoX0:$rs1), OpcodeStr, "$rs1"> { + bits<5> rs1; + + let Inst{12} = 0b1; + let Inst{11-7} = rs1; + let Inst{6-2} = funct5{4-0}; +} + +let hasSideEffects = 1 in +class QCIRVInst16CI_NONE funct5, string OpcodeStr> + : RVInst16CI<0b000, 0b10, (outs), (ins), OpcodeStr, ""> { + let Inst{12} = 0b1; + let Inst{11-7} = funct5; + let Inst{6-2} = 0b00100; +} + +let hasSideEffects = 1, mayLoad = 0, mayStore = 0 in +class QCIInt_IMM funct1, string opcodestr> + : RVInstIBase<0b000, OPC_SYSTEM, (outs), (ins uimm10:$imm10), opcodestr, + "$imm10"> { + bits<10> imm10; + + let rd = 0; + let rs1 = imm10{4-0}; + let Inst{31-25} = {0b110011, funct1}; + let Inst{24-20} = imm10{9-5}; +} + //===----------------------------------------------------------------------===// // Instructions //===----------------------------------------------------------------------===// @@ -312,6 +344,38 @@ let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in def QC_MVGEUI : QCIMVCCI<0b111, "qc.mvgeui", uimm5>; } // Predicates = [HasVendorXqcicm, IsRV32], DecoderNamespace = "Xqcicm" +let Predicates = [HasVendorXqciint, IsRV32], DecoderNamespace = "Xqciint" in { + let hasSideEffects = 1, mayLoad = 0, mayStore = 0 in + def QC_C_DIR : RVInst16CI<0b000, 0b10, (outs GPRNoX0:$rd), (ins), + "qc.c.dir", "$rd"> { + bits<5> rd; + + let Inst{12} = 0b1; + let Inst{11-7} = rd; + let Inst{6-2} = 0b00000; + } + + def QC_SETINTI : QCIInt_IMM<0b0, "qc.setinti">; + def QC_CLRINTI : QCIInt_IMM<0b1, "qc.clrinti">; + + def QC_C_EIR : QCIRVInst16CI_RS1<0b00001, "qc.c.eir">; + def QC_C_SETINT : QCIRVInst16CI_RS1<0b00010, "qc.c.setint">; + def QC_C_CLRINT : QCIRVInst16CI_RS1<0b00011, "qc.c.clrint">; + + let mayLoad = 0, mayStore = 0 in { + def QC_C_DI : QCIRVInst16CI_NONE<0b10110, "qc.c.di">; + def QC_C_EI : QCIRVInst16CI_NONE<0b10111, "qc.c.ei">; + } // mayLoad =0, mayStore = 0 + + let mayLoad = 1, mayStore = 1 in { + def QC_C_MIENTER : QCIRVInst16CI_NONE<0b10000, "qc.c.mienter">; + def QC_C_MIENTER_NEST : QCIRVInst16CI_NONE<0b10001, "qc.c.mienter.nest">; + } // mayLoad = 1, mayStore = 1 + + let mayLoad = 1, mayStore = 1, isReturn = 1, isTerminator = 1 in + def QC_C_MILEAVERET : 
QCIRVInst16CI_NONE<0b10100, "qc.c.mileaveret">; +} // Predicates = [HasVendorXqciint, IsRV32], DecoderNamespace = "Xqciint" + //===----------------------------------------------------------------------===// // Aliases //===----------------------------------------------------------------------===// diff --git a/llvm/lib/TargetParser/RISCVISAInfo.cpp b/llvm/lib/TargetParser/RISCVISAInfo.cpp index d6e1eac0d85af..1995931abfe41 100644 --- a/llvm/lib/TargetParser/RISCVISAInfo.cpp +++ b/llvm/lib/TargetParser/RISCVISAInfo.cpp @@ -742,8 +742,8 @@ Error RISCVISAInfo::checkDependency() { bool HasZvl = MinVLen != 0; bool HasZcmt = Exts.count("zcmt") != 0; static constexpr StringLiteral XqciExts[] = { - {"xqcia"}, {"xqciac"}, {"xqcicli"}, {"xqcicm"}, - {"xqcics"}, {"xqcicsr"}, {"xqcilsm"}, {"xqcisls"}}; + {"xqcia"}, {"xqciac"}, {"xqcicli"}, {"xqcicm"}, {"xqcics"}, + {"xqcicsr"}, {"xqciint"}, {"xqcilsm"}, {"xqcisls"}}; if (HasI && HasE) return getIncompatibleError("i", "e"); diff --git a/llvm/test/CodeGen/RISCV/attributes.ll b/llvm/test/CodeGen/RISCV/attributes.ll index c0fcc6f611111..a09261609d844 100644 --- a/llvm/test/CodeGen/RISCV/attributes.ll +++ b/llvm/test/CodeGen/RISCV/attributes.ll @@ -87,6 +87,7 @@ ; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcicm %s -o - | FileCheck --check-prefix=RV32XQCICM %s ; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcics %s -o - | FileCheck --check-prefix=RV32XQCICS %s ; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcicsr %s -o - | FileCheck --check-prefix=RV32XQCICSR %s +; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqciint %s -o - | FileCheck --check-prefix=RV32XQCIINT %s ; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcilsm %s -o - | FileCheck --check-prefix=RV32XQCILSM %s ; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcisls %s -o - | FileCheck --check-prefix=RV32XQCISLS %s ; RUN: llc -mtriple=riscv32 -mattr=+zaamo %s -o - | FileCheck --check-prefix=RV32ZAAMO %s @@ -401,6 +402,7 @@ ; RV32XQCICM: .attribute 5, "rv32i2p1_zca1p0_xqcicm0p2" ; RV32XQCICS: .attribute 5, "rv32i2p1_xqcics0p2" ; RV32XQCICSR: .attribute 5, "rv32i2p1_xqcicsr0p2" +; RV32XQCIINT: .attribute 5, "rv32i2p1_zca1p0_xqciint0p2" ; RV32XQCILSM: .attribute 5, "rv32i2p1_xqcilsm0p2" ; RV32XQCISLS: .attribute 5, "rv32i2p1_xqcisls0p2" ; RV32ZAAMO: .attribute 5, "rv32i2p1_zaamo1p0" diff --git a/llvm/test/MC/RISCV/xqciint-invalid.s b/llvm/test/MC/RISCV/xqciint-invalid.s new file mode 100644 index 0000000000000..e748109f41d82 --- /dev/null +++ b/llvm/test/MC/RISCV/xqciint-invalid.s @@ -0,0 +1,105 @@ +# Xqciint - Qualcomm uC Interrupts extension +# RUN: not llvm-mc -triple riscv32 -mattr=+experimental-xqciint < %s 2>&1 \ +# RUN: | FileCheck -check-prefixes=CHECK,CHECK-IMM %s +# RUN: not llvm-mc -triple riscv32 -mattr=-experimental-xqciint < %s 2>&1 \ +# RUN: | FileCheck -check-prefixes=CHECK,CHECK-EXT %s + +# CHECK-IMM: :[[@LINE+1]]:12: error: immediate must be an integer in the range [0, 1023] +qc.setinti 1025 + +# CHECK: :[[@LINE+1]]:16: error: invalid operand for instruction +qc.setinti 11, 12 + +# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction +qc.setinti + +# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqciint' (Qualcomm uC Interrupts Extension) +qc.setinti 10 + + +# CHECK-IMM: :[[@LINE+1]]:12: error: immediate must be an integer in the range [0, 1023] +qc.clrinti 2000 + +# CHECK: :[[@LINE+1]]:16: error: invalid operand for instruction +qc.clrinti 22, x4 + +# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction +qc.clrinti + +# 
CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqciint' (Qualcomm uC Interrupts Extension) +qc.clrinti 8 + + +# CHECK: :[[@LINE+1]]:13: error: invalid operand for instruction +qc.c.clrint 22 + +# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction +qc.c.clrint + +# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqciint' (Qualcomm uC Interrupts Extension) +qc.c.clrint x8 + + +# CHECK: :[[@LINE+1]]:9: error: invalid operand for instruction +qc.c.di 22 + +# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqciint' (Qualcomm uC Interrupts Extension) +qc.c.di + + +# CHECK: :[[@LINE+1]]:10: error: invalid operand for instruction +qc.c.dir 22 + +# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction +qc.c.dir + +# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqciint' (Qualcomm uC Interrupts Extension) +qc.c.dir x8 + + +# CHECK: :[[@LINE+1]]:9: error: invalid operand for instruction +qc.c.ei 22 + +# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqciint' (Qualcomm uC Interrupts Extension) +qc.c.ei + + +# CHECK: :[[@LINE+1]]:10: error: invalid operand for instruction +qc.c.eir 22 + +# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction +qc.c.eir + +# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqciint' (Qualcomm uC Interrupts Extension) +qc.c.eir x8 + + +# CHECK: :[[@LINE+1]]:19: error: invalid operand for instruction +qc.c.mienter.nest 22 + +# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqciint' (Qualcomm uC Interrupts Extension) +qc.c.mienter.nest + + +# CHECK: :[[@LINE+1]]:14: error: invalid operand for instruction +qc.c.mienter 22 + +# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqciint' (Qualcomm uC Interrupts Extension) +qc.c.mienter + + +# CHECK: :[[@LINE+1]]:17: error: invalid operand for instruction +qc.c.mileaveret 22 + +# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqciint' (Qualcomm uC Interrupts Extension) +qc.c.mileaveret + + +# CHECK: :[[@LINE+1]]:13: error: invalid operand for instruction +qc.c.setint 22 + +# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction +qc.c.setint + +# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqciint' (Qualcomm uC Interrupts Extension) +qc.c.setint x8 diff --git a/llvm/test/MC/RISCV/xqciint-valid.s b/llvm/test/MC/RISCV/xqciint-valid.s new file mode 100644 index 0000000000000..c05a402b5b14a --- /dev/null +++ b/llvm/test/MC/RISCV/xqciint-valid.s @@ -0,0 +1,81 @@ +# Xqciint - Qualcomm uC Interrupts extension +# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqciint -riscv-no-aliases -show-encoding \ +# RUN: | FileCheck -check-prefixes=CHECK-ENC,CHECK-INST %s +# RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+experimental-xqciint < %s \ +# RUN: | llvm-objdump --mattr=+experimental-xqciint -M no-aliases --no-print-imm-hex -d - \ +# RUN: | FileCheck -check-prefix=CHECK-INST %s +# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqciint -show-encoding \ +# RUN: | FileCheck -check-prefixes=CHECK-ENC,CHECK-INST %s +# RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+experimental-xqciint < %s \ +# RUN: | llvm-objdump --mattr=+experimental-xqciint --no-print-imm-hex -d - \ +# RUN: | FileCheck -check-prefix=CHECK-INST %s + +# CHECK-INST: qc.setinti 500 +# CHECK-ENC: encoding: [0x73,0x00,0xfa,0xcc] +qc.setinti 500 + +# CHECK-INST: qc.setinti 0 +# CHECK-ENC: 
encoding: [0x73,0x00,0x00,0xcc] +qc.setinti 0 + +# CHECK-INST: qc.setinti 1023 +# CHECK-ENC: encoding: [0x73,0x80,0xff,0xcd] +qc.setinti 1023 + + +# CHECK-INST: qc.clrinti 500 +# CHECK-ENC: encoding: [0x73,0x00,0xfa,0xce] +qc.clrinti 500 + +# CHECK-INST: qc.clrinti 1023 +# CHECK-ENC: encoding: [0x73,0x80,0xff,0xcf] +qc.clrinti 1023 + +# CHECK-INST: qc.clrinti 0 +# CHECK-ENC: encoding: [0x73,0x00,0x00,0xce] +qc.clrinti 0 + + +# CHECK-INST: qc.c.clrint a0 +# CHECK-ENC: encoding: [0x0e,0x15] +qc.c.clrint x10 + + +# CHECK-INST: qc.c.di +# CHECK-ENC: encoding: [0x12,0x1b] +qc.c.di + + +# CHECK-INST: qc.c.dir a0 +# CHECK-ENC: encoding: [0x02,0x15] +qc.c.dir x10 + + +# CHECK-INST: qc.c.ei +# CHECK-ENC: encoding: [0x92,0x1b] +qc.c.ei + + +# CHECK-INST: qc.c.eir a0 +# CHECK-ENC: encoding: [0x06,0x15] +qc.c.eir x10 + + +# CHECK-INST: qc.c.mienter.nest +# CHECK-ENC: encoding: [0x92,0x18] +qc.c.mienter.nest + + +# CHECK-INST: qc.c.mienter +# CHECK-ENC: encoding: [0x12,0x18] +qc.c.mienter + + +# CHECK-INST: qc.c.mileaveret +# CHECK-ENC: encoding: [0x12,0x1a] +qc.c.mileaveret + + +# CHECK-INST: qc.c.setint a0 +# CHECK-ENC: encoding: [0x0a,0x15] +qc.c.setint x10 diff --git a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp index 3955d36fce896..3a7ea4550d417 100644 --- a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp +++ b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp @@ -656,7 +656,7 @@ TEST(ParseArchString, RejectsConflictingExtensions) { for (StringRef Input : {"rv64i_xqcisls0p2", "rv64i_xqcia0p2", "rv64i_xqciac0p2", "rv64i_xqcicsr0p2", "rv64i_xqcilsm0p2", "rv64i_xqcicm0p2", - "rv64i_xqcics0p2", "rv64i_xqcicli0p2"}) { + "rv64i_xqcics0p2", "rv64i_xqcicli0p2", "rv64i_xqciint0p2"}) { EXPECT_THAT( toString(RISCVISAInfo::parseArchString(Input, true).takeError()), ::testing::EndsWith(" is only supported for 'rv32'")); @@ -1121,6 +1121,7 @@ Experimental extensions xqcicm 0.2 xqcics 0.2 xqcicsr 0.2 + xqciint 0.2 xqcilsm 0.2 xqcisls 0.2 From 6b23b48ccab6d599bceeaf5358490e24e515b79d Mon Sep 17 00:00:00 2001 From: Sam Tebbs Date: Mon, 13 Jan 2025 11:20:35 +0000 Subject: [PATCH 032/102] Reland "[LoopVectorizer] Add support for partial reductions" with non-phi operand fix. (#121744) This relands the reverted #120721 with a fix for cases where neither reduction operand is the reduction phi. Only commit 63114239cc8d26225a0ef9920baacfc7cc00fc58 is new on top of the reverted PR.
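As a rough sketch (abbreviated from the dot-product tests added below; names and types are illustrative), the vectorizer now turns an extend-multiply-accumulate chain into a partial reduction of the wide product into a narrower accumulator:

    %a.wide = zext <16 x i8> %a to <16 x i32>
    %b.wide = zext <16 x i8> %b to <16 x i32>
    %mul = mul <16 x i32> %b.wide, %a.wide
    %partial.reduce = call <4 x i32>
        @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(
            <4 x i32> %acc, <16 x i32> %mul)

After the loop, the narrow accumulator is reduced to a scalar with an ordinary vector reduction.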
--------- Co-authored-by: Nicholas Guy --- .../llvm/Analysis/TargetTransformInfo.h | 44 + .../llvm/Analysis/TargetTransformInfoImpl.h | 9 + llvm/lib/Analysis/TargetTransformInfo.cpp | 18 + .../AArch64/AArch64TargetTransformInfo.h | 63 + .../Transforms/Vectorize/LoopVectorize.cpp | 141 +- .../Transforms/Vectorize/VPRecipeBuilder.h | 59 +- llvm/lib/Transforms/Vectorize/VPlan.h | 63 +- .../Transforms/Vectorize/VPlanAnalysis.cpp | 8 +- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 80 +- llvm/lib/Transforms/Vectorize/VPlanValue.h | 1 + .../AArch64/fully-unrolled-cost.ll | 20 +- .../partial-reduce-dot-product-epilogue.ll | 213 ++ .../partial-reduce-dot-product-mixed.ll | 206 ++ .../partial-reduce-dot-product-neon.ll | 1375 +++++++++++ .../AArch64/partial-reduce-dot-product.ll | 2164 +++++++++++++++++ .../AArch64/partial-reduce-no-dotprod.ll | 61 + .../LoopVectorize/AArch64/vplan-printing.ll | 94 + 17 files changed, 4588 insertions(+), 31 deletions(-) create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-no-dotprod.ll create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index 752313ab15858..fe13fc676e303 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -211,6 +211,12 @@ typedef TargetTransformInfo TTI; /// for IR-level transformations. class TargetTransformInfo { public: + enum PartialReductionExtendKind { PR_None, PR_SignExtend, PR_ZeroExtend }; + + /// Get the kind of extension that an instruction represents. + static PartialReductionExtendKind + getPartialReductionExtendKind(Instruction *I); + /// Construct a TTI object using a type implementing the \c Concept /// API below. /// @@ -1280,6 +1286,20 @@ class TargetTransformInfo { /// \return if target want to issue a prefetch in address space \p AS. bool shouldPrefetchAddressSpace(unsigned AS) const; + /// \return The cost of a partial reduction, which is a reduction from a + /// vector to another vector with fewer elements of larger size. They are + /// represented by the llvm.experimental.partial.reduce.add intrinsic, which + /// takes an accumulator and a binary operation operand that itself is fed by + /// two extends. An example of an operation that uses a partial reduction is a + /// dot product, which reduces two vectors to another of 4 times fewer and 4 + /// times larger elements. + InstructionCost + getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, + Type *AccumType, ElementCount VF, + PartialReductionExtendKind OpAExtend, + PartialReductionExtendKind OpBExtend, + std::optional BinOp = std::nullopt) const; + /// \return The maximum interleave factor that any transform should try to /// perform for this target. This number depends on the level of parallelism /// and the number of execution units in the CPU. @@ -2107,6 +2127,20 @@ class TargetTransformInfo::Concept { /// \return if target want to issue a prefetch in address space \p AS. 
virtual bool shouldPrefetchAddressSpace(unsigned AS) const = 0; + /// \return The cost of a partial reduction, which is a reduction from a + /// vector to another vector with fewer elements of larger size. They are + /// represented by the llvm.experimental.partial.reduce.add intrinsic, which + /// takes an accumulator and a binary operation operand that itself is fed by + /// two extends. An example of an operation that uses a partial reduction is a + /// dot product, which reduces two vectors to another of 4 times fewer and 4 + /// times larger elements. + virtual InstructionCost + getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, + Type *AccumType, ElementCount VF, + PartialReductionExtendKind OpAExtend, + PartialReductionExtendKind OpBExtend, + std::optional BinOp) const = 0; + virtual unsigned getMaxInterleaveFactor(ElementCount VF) = 0; virtual InstructionCost getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, @@ -2786,6 +2820,16 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { return Impl.shouldPrefetchAddressSpace(AS); } + InstructionCost getPartialReductionCost( + unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, + ElementCount VF, PartialReductionExtendKind OpAExtend, + PartialReductionExtendKind OpBExtend, + std::optional BinOp = std::nullopt) const override { + return Impl.getPartialReductionCost(Opcode, InputTypeA, InputTypeB, + AccumType, VF, OpAExtend, OpBExtend, + BinOp); + } + unsigned getMaxInterleaveFactor(ElementCount VF) override { return Impl.getMaxInterleaveFactor(VF); } diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 9c74b2a0c31df..7ac3063ca9a37 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -585,6 +585,15 @@ class TargetTransformInfoImplBase { bool enableWritePrefetching() const { return false; } bool shouldPrefetchAddressSpace(unsigned AS) const { return !AS; } + InstructionCost + getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, + Type *AccumType, ElementCount VF, + TTI::PartialReductionExtendKind OpAExtend, + TTI::PartialReductionExtendKind OpBExtend, + std::optional BinOp = std::nullopt) const { + return InstructionCost::getInvalid(); + } + unsigned getMaxInterleaveFactor(ElementCount VF) const { return 1; } InstructionCost getArithmeticInstrCost( diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index b32dffa9f0fe8..df42dc2746daf 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -863,6 +863,15 @@ bool TargetTransformInfo::shouldPrefetchAddressSpace(unsigned AS) const { return TTIImpl->shouldPrefetchAddressSpace(AS); } +InstructionCost TargetTransformInfo::getPartialReductionCost( + unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, + ElementCount VF, PartialReductionExtendKind OpAExtend, + PartialReductionExtendKind OpBExtend, std::optional BinOp) const { + return TTIImpl->getPartialReductionCost(Opcode, InputTypeA, InputTypeB, + AccumType, VF, OpAExtend, OpBExtend, + BinOp); +} + unsigned TargetTransformInfo::getMaxInterleaveFactor(ElementCount VF) const { return TTIImpl->getMaxInterleaveFactor(VF); } @@ -974,6 +983,15 @@ InstructionCost TargetTransformInfo::getShuffleCost( return Cost; } +TargetTransformInfo::PartialReductionExtendKind 
+TargetTransformInfo::getPartialReductionExtendKind(Instruction *I) { + if (isa(I)) + return PR_SignExtend; + if (isa(I)) + return PR_ZeroExtend; + return PR_None; +} + TTI::CastContextHint TargetTransformInfo::getCastContextHint(const Instruction *I) { if (!I) diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index 214fb4e352eeb..8e7e590c173ff 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -23,6 +23,7 @@ #include "llvm/CodeGen/BasicTTIImpl.h" #include "llvm/IR/Function.h" #include "llvm/IR/Intrinsics.h" +#include "llvm/Support/InstructionCost.h" #include #include @@ -357,6 +358,68 @@ class AArch64TTIImpl : public BasicTTIImplBase { return BaseT::isLegalNTLoad(DataType, Alignment); } + InstructionCost + getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, + Type *AccumType, ElementCount VF, + TTI::PartialReductionExtendKind OpAExtend, + TTI::PartialReductionExtendKind OpBExtend, + std::optional BinOp) const { + + InstructionCost Invalid = InstructionCost::getInvalid(); + InstructionCost Cost(TTI::TCC_Basic); + + if (Opcode != Instruction::Add) + return Invalid; + + if (InputTypeA != InputTypeB) + return Invalid; + + EVT InputEVT = EVT::getEVT(InputTypeA); + EVT AccumEVT = EVT::getEVT(AccumType); + + if (VF.isScalable() && !ST->isSVEorStreamingSVEAvailable()) + return Invalid; + if (VF.isFixed() && (!ST->isNeonAvailable() || !ST->hasDotProd())) + return Invalid; + + if (InputEVT == MVT::i8) { + switch (VF.getKnownMinValue()) { + default: + return Invalid; + case 8: + if (AccumEVT == MVT::i32) + Cost *= 2; + else if (AccumEVT != MVT::i64) + return Invalid; + break; + case 16: + if (AccumEVT == MVT::i64) + Cost *= 2; + else if (AccumEVT != MVT::i32) + return Invalid; + break; + } + } else if (InputEVT == MVT::i16) { + // FIXME: Allow i32 accumulator but increase cost, as we would extend + // it to i64. + if (VF.getKnownMinValue() != 8 || AccumEVT != MVT::i64) + return Invalid; + } else + return Invalid; + + // AArch64 supports lowering mixed extensions to a usdot but only if the + // i8mm or sve/streaming features are available. + if (OpAExtend == TTI::PR_None || OpBExtend == TTI::PR_None || + (OpAExtend != OpBExtend && !ST->hasMatMulInt8() && + !ST->isSVEorStreamingSVEAvailable())) + return Invalid; + + if (!BinOp || *BinOp != Instruction::Mul) + return Invalid; + + return Cost; + } + bool enableOrderedReductions() const { return true; } InstructionCost getInterleavedMemoryOpCost( diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index d32a463a996c4..0a13ce902795e 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7531,6 +7531,10 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan, } continue; } + // The VPlan-based cost model is more accurate for partial reduction and + // comparing against the legacy cost isn't desirable. + if (isa(&R)) + return true; if (Instruction *UI = GetInstructionForCost(&R)) SeenInstrs.insert(UI); } @@ -8751,6 +8755,105 @@ VPReplicateRecipe *VPRecipeBuilder::handleReplication(Instruction *I, return Recipe; } +/// Find all possible partial reductions in the loop and track all of those that +/// are valid so recipes can be formed later. +void VPRecipeBuilder::collectScaledReductions(VFRange &Range) { + // Find all possible partial reductions. 
+ SmallVector, 1> + PartialReductionChains; + for (const auto &[Phi, RdxDesc] : Legal->getReductionVars()) + if (std::optional> Pair = + getScaledReduction(Phi, RdxDesc, Range)) + PartialReductionChains.push_back(*Pair); + + // A partial reduction is invalid if any of its extends are used by + // something that isn't another partial reduction. This is because the + // extends are intended to be lowered along with the reduction itself. + + // Build up a set of partial reduction bin ops for efficient use checking. + SmallSet PartialReductionBinOps; + for (const auto &[PartialRdx, _] : PartialReductionChains) + PartialReductionBinOps.insert(PartialRdx.BinOp); + + auto ExtendIsOnlyUsedByPartialReductions = + [&PartialReductionBinOps](Instruction *Extend) { + return all_of(Extend->users(), [&](const User *U) { + return PartialReductionBinOps.contains(U); + }); + }; + + // Check if each use of a chain's two extends is a partial reduction + // and only add those that don't have non-partial reduction users. + for (auto Pair : PartialReductionChains) { + PartialReductionChain Chain = Pair.first; + if (ExtendIsOnlyUsedByPartialReductions(Chain.ExtendA) && + ExtendIsOnlyUsedByPartialReductions(Chain.ExtendB)) + ScaledReductionExitInstrs.insert(std::make_pair(Chain.Reduction, Pair)); + } +} + +std::optional> +VPRecipeBuilder::getScaledReduction(PHINode *PHI, + const RecurrenceDescriptor &Rdx, + VFRange &Range) { + // TODO: Allow scaling reductions when predicating. The select at + // the end of the loop chooses between the phi value and most recent + // reduction result, both of which have different VFs to the active lane + // mask when scaling. + if (CM.blockNeedsPredicationForAnyReason(Rdx.getLoopExitInstr()->getParent())) + return std::nullopt; + + auto *Update = dyn_cast(Rdx.getLoopExitInstr()); + if (!Update) + return std::nullopt; + + Value *Op = Update->getOperand(0); + Value *PhiOp = Update->getOperand(1); + if (Op == PHI) { + Op = Update->getOperand(1); + PhiOp = Update->getOperand(0); + } + if (PhiOp != PHI) + return std::nullopt; + + auto *BinOp = dyn_cast(Op); + if (!BinOp || !BinOp->hasOneUse()) + return std::nullopt; + + using namespace llvm::PatternMatch; + Value *A, *B; + if (!match(BinOp->getOperand(0), m_ZExtOrSExt(m_Value(A))) || + !match(BinOp->getOperand(1), m_ZExtOrSExt(m_Value(B)))) + return std::nullopt; + + Instruction *ExtA = cast(BinOp->getOperand(0)); + Instruction *ExtB = cast(BinOp->getOperand(1)); + + TTI::PartialReductionExtendKind OpAExtend = + TargetTransformInfo::getPartialReductionExtendKind(ExtA); + TTI::PartialReductionExtendKind OpBExtend = + TargetTransformInfo::getPartialReductionExtendKind(ExtB); + + PartialReductionChain Chain(Rdx.getLoopExitInstr(), ExtA, ExtB, BinOp); + + unsigned TargetScaleFactor = + PHI->getType()->getPrimitiveSizeInBits().getKnownScalarFactor( + A->getType()->getPrimitiveSizeInBits()); + + if (LoopVectorizationPlanner::getDecisionAndClampRange( + [&](ElementCount VF) { + InstructionCost Cost = TTI->getPartialReductionCost( + Update->getOpcode(), A->getType(), B->getType(), PHI->getType(), + VF, OpAExtend, OpBExtend, + std::make_optional(BinOp->getOpcode())); + return Cost.isValid(); + }, + Range)) + return std::make_pair(Chain, TargetScaleFactor); + + return std::nullopt; +} + VPRecipeBase * VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, ArrayRef Operands, @@ -8775,9 +8878,14 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, Legal->getReductionVars().find(Phi)->second; assert(RdxDesc.getRecurrenceStartValue() 
== Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); - PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV, - CM.isInLoopReduction(Phi), - CM.useOrderedReductions(RdxDesc)); + + // If the PHI is used by a partial reduction, set the scale factor. + std::optional> Pair = + getScaledReductionForInstr(RdxDesc.getLoopExitInstr()); + unsigned ScaleFactor = Pair ? Pair->second : 1; + PhiRecipe = new VPReductionPHIRecipe( + Phi, RdxDesc, *StartV, CM.isInLoopReduction(Phi), + CM.useOrderedReductions(RdxDesc), ScaleFactor); } else { // TODO: Currently fixed-order recurrences are modeled as chains of // first-order recurrences. If there are no users of the intermediate @@ -8809,6 +8917,9 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, if (isa(Instr) || isa(Instr)) return tryToWidenMemory(Instr, Operands, Range); + if (getScaledReductionForInstr(Instr)) + return tryToCreatePartialReduction(Instr, Operands); + if (!shouldWiden(Instr, Range)) return nullptr; @@ -8829,6 +8940,21 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, return tryToWiden(Instr, Operands, VPBB); } +VPRecipeBase * +VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction, + ArrayRef Operands) { + assert(Operands.size() == 2 && + "Unexpected number of operands for partial reduction"); + + VPValue *BinOp = Operands[0]; + VPValue *Phi = Operands[1]; + if (isa(BinOp->getDefiningRecipe())) + std::swap(BinOp, Phi); + + return new VPPartialReductionRecipe(Reduction->getOpcode(), BinOp, Phi, + Reduction); +} + void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF) { assert(OrigLoop->isInnermost() && "Inner loop expected."); @@ -9252,7 +9378,8 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { bool HasNUW = !IVUpdateMayOverflow || Style == TailFoldingStyle::None; addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL); - VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder); + VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE, + Builder); // --------------------------------------------------------------------------- // Pre-construction: record ingredients whose recipes we'll need to further @@ -9298,6 +9425,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { bool NeedsBlends = BB != HeaderBB && !BB->phis().empty(); return Legal->blockNeedsPredication(BB) || NeedsBlends; }); + + RecipeBuilder.collectScaledReductions(Range); + auto *MiddleVPBB = Plan->getMiddleBlock(); VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi(); for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { @@ -9521,7 +9651,8 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { // Collect mapping of IR header phis to header phi recipes, to be used in // addScalarResumePhis. 
- VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder); + VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE, + Builder); for (auto &R : Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { if (isa(&R)) continue; diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h index 5d4a3b555981c..cf653e2d3e658 100644 --- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -21,8 +21,28 @@ namespace llvm { class LoopVectorizationLegality; class LoopVectorizationCostModel; class TargetLibraryInfo; +class TargetTransformInfo; struct HistogramInfo; +/// A chain of instructions that form a partial reduction. +/// Designed to match: reduction_bin_op (bin_op (extend (A), (extend (B))), +/// accumulator). +struct PartialReductionChain { + PartialReductionChain(Instruction *Reduction, Instruction *ExtendA, + Instruction *ExtendB, Instruction *BinOp) + : Reduction(Reduction), ExtendA(ExtendA), ExtendB(ExtendB), BinOp(BinOp) { + } + /// The top-level binary operation that forms the reduction to a scalar + /// after the loop body. + Instruction *Reduction; + /// The extension of each of the inner binary operation's operands. + Instruction *ExtendA; + Instruction *ExtendB; + + /// The binary operation using the extends that is then reduced. + Instruction *BinOp; +}; + /// Helper class to create VPRecipies from IR instructions. class VPRecipeBuilder { /// The VPlan new recipes are added to. @@ -34,6 +54,9 @@ class VPRecipeBuilder { /// Target Library Info. const TargetLibraryInfo *TLI; + // Target Transform Info. + const TargetTransformInfo *TTI; + /// The legality analysis. LoopVectorizationLegality *Legal; @@ -63,6 +86,11 @@ class VPRecipeBuilder { /// created. SmallVector PhisToFix; + /// The set of reduction exit instructions that will be scaled to + /// a smaller VF via partial reductions, paired with the scaling factor. + DenseMap> + ScaledReductionExitInstrs; + /// Check if \p I can be widened at the start of \p Range and possibly /// decrease the range such that the returned value holds for the entire \p /// Range. The function should not be called for memory instructions or calls. @@ -111,13 +139,35 @@ class VPRecipeBuilder { VPHistogramRecipe *tryToWidenHistogram(const HistogramInfo *HI, ArrayRef Operands); + /// Examines reduction operations to see if the target can use a cheaper + /// operation with a wider per-iteration input VF and narrower PHI VF. + /// Returns null if no scaled reduction was found, otherwise a pair with a + /// struct containing reduction information and the scaling factor between the + /// number of elements in the input and output. + std::optional> + getScaledReduction(PHINode *PHI, const RecurrenceDescriptor &Rdx, + VFRange &Range); + public: VPRecipeBuilder(VPlan &Plan, Loop *OrigLoop, const TargetLibraryInfo *TLI, + const TargetTransformInfo *TTI, LoopVectorizationLegality *Legal, LoopVectorizationCostModel &CM, PredicatedScalarEvolution &PSE, VPBuilder &Builder) - : Plan(Plan), OrigLoop(OrigLoop), TLI(TLI), Legal(Legal), CM(CM), - PSE(PSE), Builder(Builder) {} + : Plan(Plan), OrigLoop(OrigLoop), TLI(TLI), TTI(TTI), Legal(Legal), + CM(CM), PSE(PSE), Builder(Builder) {} + + std::optional> + getScaledReductionForInstr(const Instruction *ExitInst) { + auto It = ScaledReductionExitInstrs.find(ExitInst); + return It == ScaledReductionExitInstrs.end() + ? 
std::nullopt + : std::make_optional(It->second); + } + + /// Find all possible partial reductions in the loop and track all of those + /// that are valid so recipes can be formed later. + void collectScaledReductions(VFRange &Range); /// Create and return a widened recipe for \p I if one can be created within /// the given VF \p Range. @@ -125,6 +175,11 @@ class VPRecipeBuilder { ArrayRef Operands, VFRange &Range, VPBasicBlock *VPBB); + /// Create and return a partial reduction recipe for a reduction instruction + /// along with binary operation and reduction phi operands. + VPRecipeBase *tryToCreatePartialReduction(Instruction *Reduction, + ArrayRef Operands); + /// Set the recipe created for given ingredient. void setRecipe(Instruction *I, VPRecipeBase *R) { assert(!Ingredient2Recipe.contains(I) && diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index cfbb4ad32d681..1da185f9cfdf4 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -883,6 +883,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue { case VPRecipeBase::VPWidenPointerInductionSC: case VPRecipeBase::VPReductionPHISC: case VPRecipeBase::VPScalarCastSC: + case VPRecipeBase::VPPartialReductionSC: return true; case VPRecipeBase::VPBranchOnMaskSC: case VPRecipeBase::VPInterleaveSC: @@ -2384,23 +2385,28 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe, /// The phi is part of an ordered reduction. Requires IsInLoop to be true. bool IsOrdered; + /// When expanding the reduction PHI, the plan's VF element count is divided + /// by this factor to form the reduction phi's VF. + unsigned VFScaleFactor = 1; + public: /// Create a new VPReductionPHIRecipe for the reduction \p Phi described by \p /// RdxDesc. VPReductionPHIRecipe(PHINode *Phi, const RecurrenceDescriptor &RdxDesc, VPValue &Start, bool IsInLoop = false, - bool IsOrdered = false) + bool IsOrdered = false, unsigned VFScaleFactor = 1) : VPHeaderPHIRecipe(VPDef::VPReductionPHISC, Phi, &Start), - RdxDesc(RdxDesc), IsInLoop(IsInLoop), IsOrdered(IsOrdered) { + RdxDesc(RdxDesc), IsInLoop(IsInLoop), IsOrdered(IsOrdered), + VFScaleFactor(VFScaleFactor) { assert((!IsOrdered || IsInLoop) && "IsOrdered requires IsInLoop"); } ~VPReductionPHIRecipe() override = default; VPReductionPHIRecipe *clone() override { - auto *R = - new VPReductionPHIRecipe(cast(getUnderlyingInstr()), RdxDesc, - *getOperand(0), IsInLoop, IsOrdered); + auto *R = new VPReductionPHIRecipe(cast(getUnderlyingInstr()), + RdxDesc, *getOperand(0), IsInLoop, + IsOrdered, VFScaleFactor); R->addOperand(getBackedgeValue()); return R; } @@ -2431,6 +2437,51 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe, bool isInLoop() const { return IsInLoop; } }; +/// A recipe for forming partial reductions. In the loop, an accumulator and +/// vector operand are added together and passed to the next iteration as the +/// next accumulator. After the loop body, the accumulator is reduced to a +/// scalar value. 
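+/// For example (illustrative IR, assuming an input VF of 16 and a scale
+/// factor of 4):
+///   %acc = phi <4 x i32> [ zeroinitializer, %ph ], [ %partial.reduce, %loop ]
+///   %partial.reduce = call <4 x i32>
+///     @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(
+///       <4 x i32> %acc, <16 x i32> %val)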
+class VPPartialReductionRecipe : public VPSingleDefRecipe { + unsigned Opcode; + +public: + VPPartialReductionRecipe(Instruction *ReductionInst, VPValue *Op0, + VPValue *Op1) + : VPPartialReductionRecipe(ReductionInst->getOpcode(), Op0, Op1, + ReductionInst) {} + VPPartialReductionRecipe(unsigned Opcode, VPValue *Op0, VPValue *Op1, + Instruction *ReductionInst = nullptr) + : VPSingleDefRecipe(VPDef::VPPartialReductionSC, + ArrayRef({Op0, Op1}), ReductionInst), + Opcode(Opcode) { + assert(isa(getOperand(1)->getDefiningRecipe()) && + "Unexpected operand order for partial reduction recipe"); + } + ~VPPartialReductionRecipe() override = default; + + VPPartialReductionRecipe *clone() override { + return new VPPartialReductionRecipe(Opcode, getOperand(0), getOperand(1)); + } + + VP_CLASSOF_IMPL(VPDef::VPPartialReductionSC) + + /// Generate the reduction in the loop. + void execute(VPTransformState &State) override; + + /// Return the cost of this VPPartialReductionRecipe. + InstructionCost computeCost(ElementCount VF, + VPCostContext &Ctx) const override; + + /// Get the binary op's opcode. + unsigned getOpcode() const { return Opcode; } + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif +}; + /// A recipe for vectorizing a phi-node as a sequence of mask-based select /// instructions. class VPBlendRecipe : public VPSingleDefRecipe { @@ -2640,7 +2691,7 @@ class VPReductionRecipe : public VPSingleDefRecipe { return R && classof(R); } - /// Generate the reduction in the loop + /// Generate the reduction in the loop. void execute(VPTransformState &State) override; /// Return the cost of VPReductionRecipe. diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index 35497a7431f76..8fea2c6fd33b6 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -231,10 +231,10 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) { [](const auto *R) { return R->getScalarType(); }) .Case( - [this](const VPRecipeBase *R) { - return inferScalarType(R->getOperand(0)); - }) + VPReverseVectorPointerRecipe, VPWidenCanonicalIVRecipe, + VPPartialReductionRecipe>([this](const VPRecipeBase *R) { + return inferScalarType(R->getOperand(0)); + }) .Case( diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index e54df8bdeac55..4057a51155ece 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -277,6 +277,72 @@ InstructionCost VPRecipeBase::computeCost(ElementCount VF, llvm_unreachable("subclasses should implement computeCost"); } +InstructionCost +VPPartialReductionRecipe::computeCost(ElementCount VF, + VPCostContext &Ctx) const { + std::optional Opcode = std::nullopt; + VPRecipeBase *BinOpR = getOperand(0)->getDefiningRecipe(); + if (auto *WidenR = dyn_cast(BinOpR)) + Opcode = std::make_optional(WidenR->getOpcode()); + + VPRecipeBase *ExtAR = BinOpR->getOperand(0)->getDefiningRecipe(); + VPRecipeBase *ExtBR = BinOpR->getOperand(1)->getDefiningRecipe(); + + auto *PhiType = Ctx.Types.inferScalarType(getOperand(1)); + auto *InputTypeA = Ctx.Types.inferScalarType(ExtAR ? ExtAR->getOperand(0) + : BinOpR->getOperand(0)); + auto *InputTypeB = Ctx.Types.inferScalarType(ExtBR ? 
ExtBR->getOperand(0) + : BinOpR->getOperand(1)); + + auto GetExtendKind = [](VPRecipeBase *R) { + // The extend could come from outside the plan. + if (!R) + return TargetTransformInfo::PR_None; + auto *WidenCastR = dyn_cast(R); + if (!WidenCastR) + return TargetTransformInfo::PR_None; + if (WidenCastR->getOpcode() == Instruction::CastOps::ZExt) + return TargetTransformInfo::PR_ZeroExtend; + if (WidenCastR->getOpcode() == Instruction::CastOps::SExt) + return TargetTransformInfo::PR_SignExtend; + return TargetTransformInfo::PR_None; + }; + + return Ctx.TTI.getPartialReductionCost(getOpcode(), InputTypeA, InputTypeB, + PhiType, VF, GetExtendKind(ExtAR), + GetExtendKind(ExtBR), Opcode); +} + +void VPPartialReductionRecipe::execute(VPTransformState &State) { + State.setDebugLocFrom(getDebugLoc()); + auto &Builder = State.Builder; + + assert(getOpcode() == Instruction::Add && + "Unhandled partial reduction opcode"); + + Value *BinOpVal = State.get(getOperand(0)); + Value *PhiVal = State.get(getOperand(1)); + assert(PhiVal && BinOpVal && "Phi and Mul must be set"); + + Type *RetTy = PhiVal->getType(); + + CallInst *V = Builder.CreateIntrinsic( + RetTy, Intrinsic::experimental_vector_partial_reduce_add, + {PhiVal, BinOpVal}, nullptr, "partial.reduce"); + + State.set(this, V); +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPPartialReductionRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "PARTIAL-REDUCE "; + printAsOperand(O, SlotTracker); + O << " = " << Instruction::getOpcodeName(getOpcode()) << " "; + printOperands(O, SlotTracker); +} +#endif + FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const { assert(OpType == OperationType::FPMathOp && "recipe doesn't have fast math flags"); @@ -3356,6 +3422,10 @@ void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent, void VPReductionPHIRecipe::execute(VPTransformState &State) { auto &Builder = State.Builder; + // If this phi is fed by a scaled reduction then it should output a + // vector with fewer elements than the VF. + ElementCount VF = State.VF.divideCoefficientBy(VFScaleFactor); + // Reductions do not have to start at zero. They can start with // any loop invariant values. VPValue *StartVPV = getStartValue(); @@ -3366,8 +3436,8 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) { // stage #1: We create a new vector PHI node with no incoming edges. We'll use // this value when we vectorize all of the instructions that use the PHI. bool ScalarPHI = State.VF.isScalar() || IsInLoop; - Type *VecTy = ScalarPHI ? StartV->getType() - : VectorType::get(StartV->getType(), State.VF); + Type *VecTy = + ScalarPHI ? StartV->getType() : VectorType::get(StartV->getType(), VF); BasicBlock *HeaderBB = State.CFG.PrevBB; assert(State.CurrentParentLoop->getHeader() == HeaderBB && @@ -3417,13 +3487,13 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) { // Create start and identity vector values for the reduction in the // preheader. // TODO: Introduce recipes in VPlan preheader to create initial values. 
@@ -3417,13 +3487,13 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) {
       // Create start and identity vector values for the reduction in the
       // preheader.
       // TODO: Introduce recipes in VPlan preheader to create initial values.
-      Iden = Builder.CreateVectorSplat(State.VF, Iden);
+      Iden = Builder.CreateVectorSplat(VF, Iden);
       IRBuilderBase::InsertPointGuard IPBuilder(Builder);
       Builder.SetInsertPoint(VectorPH->getTerminator());
       Constant *Zero = Builder.getInt32(0);
       StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
     } else {
-      Iden = Builder.CreateVectorSplat(State.VF, Iden);
+      Iden = Builder.CreateVectorSplat(VF, Iden);
     }
   }
 }
@@ -3441,6 +3511,8 @@ void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent,
   printAsOperand(O, SlotTracker);
   O << " = phi ";
   printOperands(O, SlotTracker);
+  if (VFScaleFactor != 1)
+    O << " (VF scaled by 1/" << VFScaleFactor << ")";
 }
 #endif
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 957a602091c73..7aaf4002b8b3e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -329,6 +329,7 @@ class VPDef {
     VPInterleaveSC,
     VPReductionEVLSC,
     VPReductionSC,
+    VPPartialReductionSC,
     VPReplicateSC,
     VPScalarCastSC,
     VPScalarIVStepsSC,
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll
index 1cfb507a74344..c3e8c895fce24 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll
@@ -11,10 +11,10 @@ define i64 @test(ptr %a, ptr %b) #0 {
 ; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
 ; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16
 ; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
-; CHECK: Cost for VF 8: 26
+; CHECK: Cost for VF 8: 30
 ; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
 ; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
-; CHECK: Cost for VF 16: 48
+; CHECK: Cost for VF 16: 56
 ; CHECK: LV: Selecting VF: 16
 entry:
   br label %for.body
@@ -31,8 +31,8 @@ for.body:                                         ; preds = %entry, %for.body
   %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %i.iv
   %1 = load i8, ptr %arrayidx2, align 1
   %conv3 = zext i8 %1 to i64
-  %mul = mul nuw nsw i64 %conv3, %conv
-  %add = add i64 %mul, %sum
+  %div = udiv i64 %conv3, %conv
+  %add = add i64 %div, %sum
   %i.iv.next = add nuw nsw i64 %i.iv, 1
   %exitcond.not = icmp eq i64 %i.iv.next, 16
   br i1 %exitcond.not, label %exit, label %for.body
@@ -45,11 +45,11 @@ define i64 @test_external_iv_user(ptr %a, ptr %b) #0 {
 ; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
 ; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16
 ; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
-; CHECK: Cost for VF 8: 26
+; CHECK: Cost for VF 8: 30
 ; CHECK-NEXT: Cost of 1 for VF 16: induction instruction %i.iv.next = add nuw nsw i64 %i.iv, 1
 ; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
 ; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
-; CHECK: Cost for VF 16: 49
+; CHECK: Cost for VF 16: 57
 ; CHECK: LV: Selecting VF: vscale x 2
 entry:
   br label %for.body
@@ -64,8 +64,8 @@ for.body:                                         ; preds = %entry, %for.body
   %arrayidx2 = getelementptr inbounds nuw i8, ptr %b, i64
%i.iv.next %1 = load i8, ptr %arrayidx2, align 1 %conv3 = zext i8 %1 to i64 - %mul = mul nuw nsw i64 %conv3, %conv - %add = add i64 %sum, %mul + %div = udiv i64 %conv3, %conv + %add = add i64 %sum, %div %exitcond.not = icmp eq i64 %i.iv.next, 16 br i1 %exitcond.not, label %exit, label %for.body @@ -82,11 +82,11 @@ define i64 @test_two_ivs(ptr %a, ptr %b, i64 %start) #0 { ; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ] ; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16 ; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next> -; CHECK: Cost for VF 8: 27 +; CHECK: Cost for VF 8: 24 ; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ] ; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ] ; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next> -; CHECK: Cost for VF 16: 48 +; CHECK: Cost for VF 16: 42 ; CHECK: LV: Selecting VF: 16 entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll new file mode 100644 index 0000000000000..5cc00daab7ce5 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll @@ -0,0 +1,213 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -mattr=+dotprod -passes=loop-vectorize -force-vector-interleave=1 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-none-unknown-elf" + +define i32 @dotp(ptr %a, ptr %b) #0 { +; CHECK-LABEL: define i32 @dotp( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: iter.check: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; CHECK: vector.main.loop.iter.check: +; CHECK-NEXT: br i1 true, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[TMP3]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 +; CHECK-NEXT: [[TMP8:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[TMP8]], [[TMP5]] +; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> 
@llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP9]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
+; CHECK-NEXT:    br i1 true, label [[FOR_EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; CHECK:       vec.epilog.iter.check:
+; CHECK-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 4
+; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP13]]
+; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; CHECK:       vec.epilog.ph:
+; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 0, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP11]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:    [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP14]], 4
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 0, [[TMP15]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP17:%.*]] = mul i64 [[TMP16]], 4
+; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
+; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; CHECK:       vec.epilog.vector.body:
+; CHECK-NEXT:    [[INDEX2:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <vscale x 4 x i32> [ [[TMP18]], [[VEC_EPILOG_PH]] ], [ [[TMP27:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP19:%.*]] = add i64 [[INDEX2]], 0
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP19]]
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr i8, ptr [[TMP20]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 4 x i8>, ptr [[TMP21]], align 1
+; CHECK-NEXT:    [[TMP22:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD4]] to <vscale x 4 x i32>
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP19]]
+; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr i8, ptr [[TMP23]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD5:%.*]] = load <vscale x 4 x i8>, ptr [[TMP24]], align 1
+; CHECK-NEXT:    [[TMP25:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD5]] to <vscale x 4 x i32>
+; CHECK-NEXT:    [[TMP26:%.*]] = mul <vscale x 4 x i32> [[TMP25]], [[TMP22]]
+; CHECK-NEXT:    [[TMP27]] = add <vscale x 4 x i32> [[TMP26]], [[VEC_PHI3]]
+; CHECK-NEXT:    [[INDEX_NEXT6]] = add nuw i64 [[INDEX2]], [[TMP17]]
+; CHECK-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP28]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       vec.epilog.middle.block:
+; CHECK-NEXT:    [[TMP29:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP27]])
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 0, [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
+;
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %gep.a = getelementptr i8, ptr %a, i64 %iv
+  %load.a = load i8, ptr %gep.a, align 1
+  %ext.a = zext i8 %load.a to i32
+  %gep.b = getelementptr i8, ptr %b, i64 %iv
+  %load.b = load i8, ptr %gep.b, align 1
+  %ext.b = zext i8 %load.b to i32
+  %mul = mul i32 %ext.b, %ext.a
+  %add = add i32 %mul, %accum
+  %iv.next = add i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 0
+  br i1 %exitcond.not, label %for.exit, label %for.body
+
+for.exit:                                         ; preds = %for.body
+  ret i32 %add
+}
+
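The middle block above relies on the partial-reduction invariant: reducing
the narrow accumulator yields the same total as reducing the wide multiply
would have. A sketch in IR terms (names are illustrative; this is not one
of the checked lines):

    %partial = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(
                   <4 x i32> %acc, <16 x i32> %wide)
    %total   = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %partial)
    ; %total == reduce.add(%acc) + reduce.add(%wide), for any lane mapping.

The next function exercises the same mechanism when the epilogue VF is so
small that the accumulator degenerates to a single lane (<1 x i64>).
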
+define void @dotp_small_epilogue_vf(i64 %idx.neg, i8 %a) #1 {
+; CHECK-LABEL: define void @dotp_small_epilogue_vf(
+; CHECK-SAME: i64 [[IDX_NEG:%.*]], i8 [[A:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT:  iter.check:
+; CHECK-NEXT:    [[TMP0:%.*]] = sub i64 1, [[IDX_NEG]]
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; CHECK:       vector.main.loop.iter.check:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP0]], 16
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 16
+; CHECK-NEXT:    [[IV_NEXT:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[A]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = sext <16 x i8> [[BROADCAST_SPLAT]] to <16 x i64>
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr null, align 1
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <16 x i8> poison, i8 [[TMP2]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT3:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT2]], <16 x i8> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = sext <16 x i8> [[BROADCAST_SPLAT3]] to <16 x i64>
+; CHECK-NEXT:    [[TMP4:%.*]] = mul <16 x i64> [[TMP3]], [[TMP1]]
+; CHECK-NEXT:    [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.experimental.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP4]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[IV_NEXT]]
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[ADD:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]])
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[IV_NEXT]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[WHILE_END_LOOPEXIT:%.*]], label [[WHILE_BODY:%.*]]
+; CHECK:       vec.epilog.iter.check:
+; CHECK-NEXT:    [[IND_END6:%.*]] = add i64 [[IDX_NEG]], [[IV_NEXT]]
+; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[IV_NEXT]]
+; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
+; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; CHECK:       vec.epilog.ph:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT]], [[WHILE_BODY]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    [[ACCUM:%.*]] = phi i64 [ [[ADD]], [[WHILE_BODY]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    [[N_MOD_VF4:%.*]] = urem i64 [[TMP0]], 8
+; CHECK-NEXT:    [[N_VEC5:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF4]]
+; CHECK-NEXT:    [[IND_END:%.*]] = add i64 [[IDX_NEG]], [[N_VEC5]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <8 x i8>
poison, i8 [[A]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT7]], <8 x i8> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = sext <8 x i8> [[BROADCAST_SPLAT8]] to <8 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <1 x i64> zeroinitializer, i64 [[ACCUM]], i32 0 +; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX9:%.*]] = phi i64 [ [[IV]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT14:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI10:%.*]] = phi <1 x i64> [ [[TMP8]], [[VEC_EPILOG_PH]] ], [ [[PARTIAL_REDUCE13:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP9:%.*]] = load i8, ptr null, align 1 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT11:%.*]] = insertelement <8 x i8> poison, i8 [[TMP9]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT12:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT11]], <8 x i8> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = sext <8 x i8> [[BROADCAST_SPLAT12]] to <8 x i64> +; CHECK-NEXT: [[TMP11:%.*]] = mul <8 x i64> [[TMP10]], [[TMP7]] +; CHECK-NEXT: [[PARTIAL_REDUCE13]] = call <1 x i64> @llvm.experimental.vector.partial.reduce.add.v1i64.v8i64(<1 x i64> [[VEC_PHI10]], <8 x i64> [[TMP11]]) +; CHECK-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX9]], 8 +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT14]], [[N_VEC5]] +; CHECK-NEXT: br i1 [[TMP12]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> [[PARTIAL_REDUCE13]]) +; CHECK-NEXT: [[CMP_N15:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC5]] +; CHECK-NEXT: br i1 [[CMP_N15]], label [[WHILE_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IDX_NEG]], [[ITER_CHECK:%.*]] ], [ [[IND_END6]], [[WHILE_BODY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL16:%.*]] = phi i64 [ [[N_VEC5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[IV_NEXT]], [[WHILE_BODY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX17:%.*]] = phi i64 [ [[TMP13]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[ADD]], [[WHILE_BODY]] ] +; CHECK-NEXT: br label [[WHILE_BODY1:%.*]] +; CHECK: while.body: +; CHECK-NEXT: [[IV_NEG:%.*]] = phi i64 [ [[IV_NEG_NEXT:%.*]], [[WHILE_BODY1]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ] +; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[IV_NEXT1:%.*]], [[WHILE_BODY1]] ], [ [[BC_RESUME_VAL16]], [[VEC_EPILOG_SCALAR_PH]] ] +; CHECK-NEXT: [[ACCUM1:%.*]] = phi i64 [ [[ADD1:%.*]], [[WHILE_BODY1]] ], [ [[BC_MERGE_RDX17]], [[VEC_EPILOG_SCALAR_PH]] ] +; CHECK-NEXT: [[IV_NEG_NEXT]] = add i64 [[IV_NEG]], 1 +; CHECK-NEXT: [[EXT_A:%.*]] = sext i8 [[A]] to i64 +; CHECK-NEXT: [[IV_NEXT1]] = add i64 [[IV1]], 1 +; CHECK-NEXT: [[B:%.*]] = load i8, ptr null, align 1 +; CHECK-NEXT: [[EXT_B:%.*]] = sext i8 [[B]] to i64 +; CHECK-NEXT: [[MUL:%.*]] = mul i64 [[EXT_B]], [[EXT_A]] +; CHECK-NEXT: [[ADD1]] = add i64 [[MUL]], [[ACCUM1]] +; CHECK-NEXT: [[CMP_IV_NEG:%.*]] = icmp ugt i64 [[IV_NEG]], 0 +; CHECK-NEXT: [[CMP_IV:%.*]] = icmp ne i64 [[IV1]], -1 +; CHECK-NEXT: [[EXITCOND:%.*]] = and i1 [[CMP_IV_NEG]], [[CMP_IV]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[WHILE_BODY1]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK: while.end.loopexit: +; CHECK-NEXT: [[RESULT:%.*]] = phi i64 [ [[ADD1]], [[WHILE_BODY1]] ], [ 
[[ADD]], [[MIDDLE_BLOCK]] ], [ [[TMP13]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret void +; +entry: + br label %while.body + +while.body: ; preds = %while.body, %entry + %iv.neg = phi i64 [ %iv.neg.next, %while.body ], [ %idx.neg, %entry ] + %iv = phi i64 [ %iv.next, %while.body ], [ 0, %entry ] + %accum = phi i64 [ %add, %while.body ], [ 0, %entry ] + %iv.neg.next = add i64 %iv.neg, 1 + %ext.a = sext i8 %a to i64 + %iv.next = add i64 %iv, 1 + %b = load i8, ptr null, align 1 + %ext.b = sext i8 %b to i64 + %mul = mul i64 %ext.b, %ext.a + %add = add i64 %mul, %accum + %cmp.iv.neg = icmp ugt i64 %iv.neg, 0 + %cmp.iv = icmp ne i64 %iv, -1 + %exitcond = and i1 %cmp.iv.neg, %cmp.iv + br i1 %exitcond, label %while.body, label %while.end.loopexit + +while.end.loopexit: ; preds = %while.body + %result = phi i64 [ %add, %while.body ] + ret void +} + +attributes #0 = { vscale_range(1,16) "target-features"="+sve" } +attributes #1 = { "target-cpu"="apple-m1" } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll new file mode 100644 index 0000000000000..74db8683d5df8 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll @@ -0,0 +1,206 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -mattr=+i8mm,+dotprod -S < %s | FileCheck %s +; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -mattr=+dotprod -S < %s | FileCheck %s --check-prefix=CHECK-NOI8MM + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-none-unknown-elf" + +define i32 @dotp_z_s(ptr %a, ptr %b) #0 { +; CHECK-LABEL: define i32 @dotp_z_s( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 +; CHECK-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP6]], i32 16 +; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 +; CHECK-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> +; CHECK-NEXT: 
[[TMP11:%.*]] = mul <16 x i32> [[TMP9]], [[TMP4]] +; CHECK-NEXT: [[TMP12:%.*]] = mul <16 x i32> [[TMP10]], [[TMP5]] +; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]]) +; CHECK-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP12]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]] +; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) +; CHECK-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; +; CHECK-NOI8MM-LABEL: define i32 @dotp_z_s( +; CHECK-NOI8MM-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NOI8MM-NEXT: entry: +; CHECK-NOI8MM-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NOI8MM: vector.ph: +; CHECK-NOI8MM-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NOI8MM: vector.body: +; CHECK-NOI8MM-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NOI8MM-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-NOI8MM-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NOI8MM-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NOI8MM-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-NOI8MM-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 +; CHECK-NOI8MM-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16 +; CHECK-NOI8MM-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-NOI8MM-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 +; CHECK-NOI8MM-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-NOI8MM-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-NOI8MM-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-NOI8MM-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0 +; CHECK-NOI8MM-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP6]], i32 16 +; CHECK-NOI8MM-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 +; CHECK-NOI8MM-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 +; CHECK-NOI8MM-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> +; CHECK-NOI8MM-NEXT: [[TMP10:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> +; CHECK-NOI8MM-NEXT: [[TMP11:%.*]] = mul <16 x i32> [[TMP9]], [[TMP4]] +; CHECK-NOI8MM-NEXT: [[TMP12:%.*]] = mul <16 x i32> [[TMP10]], [[TMP5]] +; CHECK-NOI8MM-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]]) +; CHECK-NOI8MM-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP12]]) +; CHECK-NOI8MM-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-NOI8MM-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-NOI8MM-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NOI8MM: 
middle.block: +; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]] +; CHECK-NOI8MM-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) +; CHECK-NOI8MM-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = zext i8 %load.a to i32 + %gep.b = getelementptr i8, ptr %b, i64 %iv + %load.b = load i8, ptr %gep.b, align 1 + %ext.b = sext i8 %load.b to i32 + %mul = mul i32 %ext.b, %ext.a + %add = add i32 %mul, %accum + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 0 + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + ret i32 %add +} + +define i32 @dotp_s_z(ptr %a, ptr %b) #0 { +; CHECK-LABEL: define i32 @dotp_s_z( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 +; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP6]], i32 16 +; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 +; CHECK-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = mul <16 x i32> [[TMP9]], [[TMP4]] +; CHECK-NEXT: [[TMP12:%.*]] = mul <16 x i32> [[TMP10]], [[TMP5]] +; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]]) +; CHECK-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP12]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]] +; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 
x i32> [[BIN_RDX]]) +; CHECK-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; +; CHECK-NOI8MM-LABEL: define i32 @dotp_s_z( +; CHECK-NOI8MM-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NOI8MM-NEXT: entry: +; CHECK-NOI8MM-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NOI8MM: vector.ph: +; CHECK-NOI8MM-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NOI8MM: vector.body: +; CHECK-NOI8MM-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NOI8MM-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-NOI8MM-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NOI8MM-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NOI8MM-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-NOI8MM-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 +; CHECK-NOI8MM-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16 +; CHECK-NOI8MM-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-NOI8MM-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 +; CHECK-NOI8MM-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-NOI8MM-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-NOI8MM-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-NOI8MM-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0 +; CHECK-NOI8MM-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP6]], i32 16 +; CHECK-NOI8MM-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 +; CHECK-NOI8MM-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 +; CHECK-NOI8MM-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> +; CHECK-NOI8MM-NEXT: [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> +; CHECK-NOI8MM-NEXT: [[TMP11:%.*]] = mul <16 x i32> [[TMP9]], [[TMP4]] +; CHECK-NOI8MM-NEXT: [[TMP12:%.*]] = mul <16 x i32> [[TMP10]], [[TMP5]] +; CHECK-NOI8MM-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]]) +; CHECK-NOI8MM-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP12]]) +; CHECK-NOI8MM-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-NOI8MM-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-NOI8MM-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NOI8MM: middle.block: +; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]] +; CHECK-NOI8MM-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) +; CHECK-NOI8MM-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = sext i8 %load.a to i32 + %gep.b = getelementptr i8, ptr %b, i64 %iv + %load.b = load i8, ptr %gep.b, align 1 + %ext.b = zext i8 %load.b to i32 + %mul = mul i32 %ext.b, %ext.a + %add = add i32 %mul, %accum + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 0 + 
br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + ret i32 %add +} + +!7 = distinct !{!7, !8, !9, !10} +!8 = !{!"llvm.loop.mustprogress"} +!9 = !{!"llvm.loop.vectorize.predicate.enable", i1 true} +!10 = !{!"llvm.loop.vectorize.enable", i1 true} +attributes #0 = { vscale_range(1,16) "target-features"="+sve" } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll new file mode 100644 index 0000000000000..c66695f1b50f0 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll @@ -0,0 +1,1375 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -mattr=+neon,+dotprod -force-vector-interleave=1 -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVE1 +; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -mattr=+neon,+dotprod -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVED +; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -mattr=+neon,+dotprod -force-vector-interleave=1 -vectorizer-maximize-bandwidth -S < %s | FileCheck %s --check-prefixes=CHECK-MAXBW + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-none-unknown-elf" + +define i32 @dotp(ptr %a, ptr %b) { +; CHECK-INTERLEAVE1-LABEL: define i32 @dotp( +; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-INTERLEAVE1-NEXT: entry: +; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1: vector.ph: +; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVE1: vector.body: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP7]]) +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-INTERLEAVE1: middle.block: +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) +; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[FOR_EXIT:%.*]], 
label [[SCALAR_PH]] +; +; CHECK-INTERLEAVED-LABEL: define i32 @dotp( +; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-INTERLEAVED-NEXT: entry: +; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED: vector.ph: +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED: vector.body: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP6]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = mul <16 x i32> [[TMP9]], [[TMP4]] +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = mul <16 x i32> [[TMP10]], [[TMP5]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP12]]) +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-INTERLEAVED: middle.block: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]] +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; +; CHECK-MAXBW-LABEL: define i32 @dotp( +; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-MAXBW-NEXT: entry: +; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW: vector.ph: +; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-MAXBW: vector.body: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; 
CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP7]]) +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-MAXBW-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-MAXBW: middle.block: +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) +; CHECK-MAXBW-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = zext i8 %load.a to i32 + %gep.b = getelementptr i8, ptr %b, i64 %iv + %load.b = load i8, ptr %gep.b, align 1 + %ext.b = zext i8 %load.b to i32 + %mul = mul i32 %ext.b, %ext.a + %add = add i32 %mul, %accum + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 0 + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + ret i32 %add +} + +define i32 @not_dotp_different_types(ptr %a, ptr %b) { +; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_different_types( +; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVE1-NEXT: entry: +; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1: vector.ph: +; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVE1: vector.body: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP69:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8 +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9 +; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10 +; 
CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11 +; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12 +; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 +; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 +; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 +; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP16]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]] +; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]] +; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP4]] +; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP5]] +; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]] +; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP7]] +; CHECK-INTERLEAVE1-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP8]] +; CHECK-INTERLEAVE1-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]] +; CHECK-INTERLEAVE1-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP10]] +; CHECK-INTERLEAVE1-NEXT: [[TMP30:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP11]] +; CHECK-INTERLEAVE1-NEXT: [[TMP31:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]] +; CHECK-INTERLEAVE1-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP13]] +; CHECK-INTERLEAVE1-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP14]] +; CHECK-INTERLEAVE1-NEXT: [[TMP34:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP15]] +; CHECK-INTERLEAVE1-NEXT: [[TMP35:%.*]] = load i16, ptr [[TMP19]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP36:%.*]] = load i16, ptr [[TMP20]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP37:%.*]] = load i16, ptr [[TMP21]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP38:%.*]] = load i16, ptr [[TMP22]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP39:%.*]] = load i16, ptr [[TMP23]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP40:%.*]] = load i16, ptr [[TMP24]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP41:%.*]] = load i16, ptr [[TMP25]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP42:%.*]] = load i16, ptr [[TMP26]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP43:%.*]] = load i16, ptr [[TMP27]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP44:%.*]] = load i16, ptr [[TMP28]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP45:%.*]] = load i16, ptr [[TMP29]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP46:%.*]] = load i16, ptr [[TMP30]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP47:%.*]] = load i16, ptr [[TMP31]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP48:%.*]] = load i16, ptr [[TMP32]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP49:%.*]] = load i16, ptr [[TMP33]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP50:%.*]] = load i16, ptr [[TMP34]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP51:%.*]] = insertelement <16 x i16> poison, i16 [[TMP35]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP52:%.*]] = insertelement <16 x i16> [[TMP51]], i16 [[TMP36]], i32 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP53:%.*]] = insertelement <16 x i16> [[TMP52]], i16 [[TMP37]], i32 2 +; 
CHECK-INTERLEAVE1-NEXT: [[TMP54:%.*]] = insertelement <16 x i16> [[TMP53]], i16 [[TMP38]], i32 3 +; CHECK-INTERLEAVE1-NEXT: [[TMP55:%.*]] = insertelement <16 x i16> [[TMP54]], i16 [[TMP39]], i32 4 +; CHECK-INTERLEAVE1-NEXT: [[TMP56:%.*]] = insertelement <16 x i16> [[TMP55]], i16 [[TMP40]], i32 5 +; CHECK-INTERLEAVE1-NEXT: [[TMP57:%.*]] = insertelement <16 x i16> [[TMP56]], i16 [[TMP41]], i32 6 +; CHECK-INTERLEAVE1-NEXT: [[TMP58:%.*]] = insertelement <16 x i16> [[TMP57]], i16 [[TMP42]], i32 7 +; CHECK-INTERLEAVE1-NEXT: [[TMP59:%.*]] = insertelement <16 x i16> [[TMP58]], i16 [[TMP43]], i32 8 +; CHECK-INTERLEAVE1-NEXT: [[TMP60:%.*]] = insertelement <16 x i16> [[TMP59]], i16 [[TMP44]], i32 9 +; CHECK-INTERLEAVE1-NEXT: [[TMP61:%.*]] = insertelement <16 x i16> [[TMP60]], i16 [[TMP45]], i32 10 +; CHECK-INTERLEAVE1-NEXT: [[TMP62:%.*]] = insertelement <16 x i16> [[TMP61]], i16 [[TMP46]], i32 11 +; CHECK-INTERLEAVE1-NEXT: [[TMP63:%.*]] = insertelement <16 x i16> [[TMP62]], i16 [[TMP47]], i32 12 +; CHECK-INTERLEAVE1-NEXT: [[TMP64:%.*]] = insertelement <16 x i16> [[TMP63]], i16 [[TMP48]], i32 13 +; CHECK-INTERLEAVE1-NEXT: [[TMP65:%.*]] = insertelement <16 x i16> [[TMP64]], i16 [[TMP49]], i32 14 +; CHECK-INTERLEAVE1-NEXT: [[TMP66:%.*]] = insertelement <16 x i16> [[TMP65]], i16 [[TMP50]], i32 15 +; CHECK-INTERLEAVE1-NEXT: [[TMP67:%.*]] = zext <16 x i16> [[TMP66]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP68:%.*]] = mul <16 x i32> [[TMP67]], [[TMP18]] +; CHECK-INTERLEAVE1-NEXT: [[TMP69]] = add <16 x i32> [[TMP68]], [[VEC_PHI]] +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVE1-NEXT: [[TMP70:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP70]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; +; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_different_types( +; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVED-NEXT: entry: +; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED: vector.ph: +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED: vector.body: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP137:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP138:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8 +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9 +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10 +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11 +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12 +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 +; CHECK-INTERLEAVED-NEXT: 
[[TMP15:%.*]] = add i64 [[INDEX]], 15 +; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = add i64 [[INDEX]], 16 +; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 17 +; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = add i64 [[INDEX]], 18 +; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], 19 +; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = add i64 [[INDEX]], 20 +; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 21 +; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = add i64 [[INDEX]], 22 +; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = add i64 [[INDEX]], 23 +; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = add i64 [[INDEX]], 24 +; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = add i64 [[INDEX]], 25 +; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = add i64 [[INDEX]], 26 +; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = add i64 [[INDEX]], 27 +; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = add i64 [[INDEX]], 28 +; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = add i64 [[INDEX]], 29 +; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = add i64 [[INDEX]], 30 +; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = add i64 [[INDEX]], 31 +; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr [[TMP32]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = getelementptr i8, ptr [[TMP32]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP33]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP34]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]] +; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]] +; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP4]] +; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP5]] +; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]] +; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP7]] +; CHECK-INTERLEAVED-NEXT: [[TMP45:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP8]] +; CHECK-INTERLEAVED-NEXT: [[TMP46:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]] +; CHECK-INTERLEAVED-NEXT: [[TMP47:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP10]] +; CHECK-INTERLEAVED-NEXT: [[TMP48:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP11]] +; CHECK-INTERLEAVED-NEXT: [[TMP49:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]] +; CHECK-INTERLEAVED-NEXT: [[TMP50:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP13]] +; CHECK-INTERLEAVED-NEXT: [[TMP51:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP14]] +; CHECK-INTERLEAVED-NEXT: [[TMP52:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP15]] +; CHECK-INTERLEAVED-NEXT: [[TMP53:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP16]] +; CHECK-INTERLEAVED-NEXT: [[TMP54:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP17]] +; CHECK-INTERLEAVED-NEXT: [[TMP55:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP18]] +; CHECK-INTERLEAVED-NEXT: [[TMP56:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP19]] +; CHECK-INTERLEAVED-NEXT: [[TMP57:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP20]] +; CHECK-INTERLEAVED-NEXT: [[TMP58:%.*]] = getelementptr i8, ptr [[B]], i64 
[[TMP21]] +; CHECK-INTERLEAVED-NEXT: [[TMP59:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP22]] +; CHECK-INTERLEAVED-NEXT: [[TMP60:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP23]] +; CHECK-INTERLEAVED-NEXT: [[TMP61:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP24]] +; CHECK-INTERLEAVED-NEXT: [[TMP62:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP25]] +; CHECK-INTERLEAVED-NEXT: [[TMP63:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP26]] +; CHECK-INTERLEAVED-NEXT: [[TMP64:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP27]] +; CHECK-INTERLEAVED-NEXT: [[TMP65:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP28]] +; CHECK-INTERLEAVED-NEXT: [[TMP66:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP29]] +; CHECK-INTERLEAVED-NEXT: [[TMP67:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP30]] +; CHECK-INTERLEAVED-NEXT: [[TMP68:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP31]] +; CHECK-INTERLEAVED-NEXT: [[TMP69:%.*]] = load i16, ptr [[TMP37]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP70:%.*]] = load i16, ptr [[TMP38]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP71:%.*]] = load i16, ptr [[TMP39]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP72:%.*]] = load i16, ptr [[TMP40]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP73:%.*]] = load i16, ptr [[TMP41]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP74:%.*]] = load i16, ptr [[TMP42]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP75:%.*]] = load i16, ptr [[TMP43]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP76:%.*]] = load i16, ptr [[TMP44]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP77:%.*]] = load i16, ptr [[TMP45]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP78:%.*]] = load i16, ptr [[TMP46]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP79:%.*]] = load i16, ptr [[TMP47]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP80:%.*]] = load i16, ptr [[TMP48]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP81:%.*]] = load i16, ptr [[TMP49]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP82:%.*]] = load i16, ptr [[TMP50]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP83:%.*]] = load i16, ptr [[TMP51]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP84:%.*]] = load i16, ptr [[TMP52]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP85:%.*]] = insertelement <16 x i16> poison, i16 [[TMP69]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP86:%.*]] = insertelement <16 x i16> [[TMP85]], i16 [[TMP70]], i32 1 +; CHECK-INTERLEAVED-NEXT: [[TMP87:%.*]] = insertelement <16 x i16> [[TMP86]], i16 [[TMP71]], i32 2 +; CHECK-INTERLEAVED-NEXT: [[TMP88:%.*]] = insertelement <16 x i16> [[TMP87]], i16 [[TMP72]], i32 3 +; CHECK-INTERLEAVED-NEXT: [[TMP89:%.*]] = insertelement <16 x i16> [[TMP88]], i16 [[TMP73]], i32 4 +; CHECK-INTERLEAVED-NEXT: [[TMP90:%.*]] = insertelement <16 x i16> [[TMP89]], i16 [[TMP74]], i32 5 +; CHECK-INTERLEAVED-NEXT: [[TMP91:%.*]] = insertelement <16 x i16> [[TMP90]], i16 [[TMP75]], i32 6 +; CHECK-INTERLEAVED-NEXT: [[TMP92:%.*]] = insertelement <16 x i16> [[TMP91]], i16 [[TMP76]], i32 7 +; CHECK-INTERLEAVED-NEXT: [[TMP93:%.*]] = insertelement <16 x i16> [[TMP92]], i16 [[TMP77]], i32 8 +; CHECK-INTERLEAVED-NEXT: [[TMP94:%.*]] = insertelement <16 x i16> [[TMP93]], i16 [[TMP78]], i32 9 +; CHECK-INTERLEAVED-NEXT: [[TMP95:%.*]] = insertelement <16 x i16> [[TMP94]], i16 [[TMP79]], i32 10 +; CHECK-INTERLEAVED-NEXT: [[TMP96:%.*]] = insertelement <16 x i16> [[TMP95]], i16 [[TMP80]], i32 11 +; CHECK-INTERLEAVED-NEXT: [[TMP97:%.*]] = insertelement <16 x i16> [[TMP96]], i16 [[TMP81]], i32 12 +; CHECK-INTERLEAVED-NEXT: [[TMP98:%.*]] = insertelement <16 x i16> [[TMP97]], i16 [[TMP82]], i32 13 +; CHECK-INTERLEAVED-NEXT: [[TMP99:%.*]] = insertelement <16 x i16> 
[[TMP98]], i16 [[TMP83]], i32 14 +; CHECK-INTERLEAVED-NEXT: [[TMP100:%.*]] = insertelement <16 x i16> [[TMP99]], i16 [[TMP84]], i32 15 +; CHECK-INTERLEAVED-NEXT: [[TMP101:%.*]] = load i16, ptr [[TMP53]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP102:%.*]] = load i16, ptr [[TMP54]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP103:%.*]] = load i16, ptr [[TMP55]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP104:%.*]] = load i16, ptr [[TMP56]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP105:%.*]] = load i16, ptr [[TMP57]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP106:%.*]] = load i16, ptr [[TMP58]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP107:%.*]] = load i16, ptr [[TMP59]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP108:%.*]] = load i16, ptr [[TMP60]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP109:%.*]] = load i16, ptr [[TMP61]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP110:%.*]] = load i16, ptr [[TMP62]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP111:%.*]] = load i16, ptr [[TMP63]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP112:%.*]] = load i16, ptr [[TMP64]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP113:%.*]] = load i16, ptr [[TMP65]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP114:%.*]] = load i16, ptr [[TMP66]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP115:%.*]] = load i16, ptr [[TMP67]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP116:%.*]] = load i16, ptr [[TMP68]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP117:%.*]] = insertelement <16 x i16> poison, i16 [[TMP101]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP118:%.*]] = insertelement <16 x i16> [[TMP117]], i16 [[TMP102]], i32 1 +; CHECK-INTERLEAVED-NEXT: [[TMP119:%.*]] = insertelement <16 x i16> [[TMP118]], i16 [[TMP103]], i32 2 +; CHECK-INTERLEAVED-NEXT: [[TMP120:%.*]] = insertelement <16 x i16> [[TMP119]], i16 [[TMP104]], i32 3 +; CHECK-INTERLEAVED-NEXT: [[TMP121:%.*]] = insertelement <16 x i16> [[TMP120]], i16 [[TMP105]], i32 4 +; CHECK-INTERLEAVED-NEXT: [[TMP122:%.*]] = insertelement <16 x i16> [[TMP121]], i16 [[TMP106]], i32 5 +; CHECK-INTERLEAVED-NEXT: [[TMP123:%.*]] = insertelement <16 x i16> [[TMP122]], i16 [[TMP107]], i32 6 +; CHECK-INTERLEAVED-NEXT: [[TMP124:%.*]] = insertelement <16 x i16> [[TMP123]], i16 [[TMP108]], i32 7 +; CHECK-INTERLEAVED-NEXT: [[TMP125:%.*]] = insertelement <16 x i16> [[TMP124]], i16 [[TMP109]], i32 8 +; CHECK-INTERLEAVED-NEXT: [[TMP126:%.*]] = insertelement <16 x i16> [[TMP125]], i16 [[TMP110]], i32 9 +; CHECK-INTERLEAVED-NEXT: [[TMP127:%.*]] = insertelement <16 x i16> [[TMP126]], i16 [[TMP111]], i32 10 +; CHECK-INTERLEAVED-NEXT: [[TMP128:%.*]] = insertelement <16 x i16> [[TMP127]], i16 [[TMP112]], i32 11 +; CHECK-INTERLEAVED-NEXT: [[TMP129:%.*]] = insertelement <16 x i16> [[TMP128]], i16 [[TMP113]], i32 12 +; CHECK-INTERLEAVED-NEXT: [[TMP130:%.*]] = insertelement <16 x i16> [[TMP129]], i16 [[TMP114]], i32 13 +; CHECK-INTERLEAVED-NEXT: [[TMP131:%.*]] = insertelement <16 x i16> [[TMP130]], i16 [[TMP115]], i32 14 +; CHECK-INTERLEAVED-NEXT: [[TMP132:%.*]] = insertelement <16 x i16> [[TMP131]], i16 [[TMP116]], i32 15 +; CHECK-INTERLEAVED-NEXT: [[TMP133:%.*]] = zext <16 x i16> [[TMP100]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP134:%.*]] = zext <16 x i16> [[TMP132]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP135:%.*]] = mul <16 x i32> [[TMP133]], [[TMP35]] +; CHECK-INTERLEAVED-NEXT: [[TMP136:%.*]] = mul <16 x i32> [[TMP134]], [[TMP36]] +; CHECK-INTERLEAVED-NEXT: [[TMP137]] = add <16 x i32> [[TMP135]], [[VEC_PHI]] +; CHECK-INTERLEAVED-NEXT: [[TMP138]] = add <16 x i32> [[TMP136]], [[VEC_PHI1]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = 
add nuw i64 [[INDEX]], 32 +; CHECK-INTERLEAVED-NEXT: [[TMP139:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP139]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; +; CHECK-MAXBW-LABEL: define i32 @not_dotp_different_types( +; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-MAXBW-NEXT: entry: +; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW: vector.ph: +; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-MAXBW: vector.body: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP69:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8 +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9 +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10 +; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11 +; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12 +; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 +; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 +; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 +; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP16]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1 +; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]] +; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]] +; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP3]] +; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP4]] +; CHECK-MAXBW-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP5]] +; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]] +; CHECK-MAXBW-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP7]] +; CHECK-MAXBW-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP8]] +; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]] +; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP10]] +; CHECK-MAXBW-NEXT: [[TMP30:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP11]] +; CHECK-MAXBW-NEXT: [[TMP31:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]] +; CHECK-MAXBW-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP13]] +; CHECK-MAXBW-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP14]] +; CHECK-MAXBW-NEXT: [[TMP34:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP15]] +; CHECK-MAXBW-NEXT: [[TMP35:%.*]] = load i16, ptr [[TMP19]], align 2 +; CHECK-MAXBW-NEXT: [[TMP36:%.*]] = load i16, ptr [[TMP20]], align 2 +; CHECK-MAXBW-NEXT: [[TMP37:%.*]] = load i16, ptr [[TMP21]], align 2 +; CHECK-MAXBW-NEXT: [[TMP38:%.*]] 
= load i16, ptr [[TMP22]], align 2 +; CHECK-MAXBW-NEXT: [[TMP39:%.*]] = load i16, ptr [[TMP23]], align 2 +; CHECK-MAXBW-NEXT: [[TMP40:%.*]] = load i16, ptr [[TMP24]], align 2 +; CHECK-MAXBW-NEXT: [[TMP41:%.*]] = load i16, ptr [[TMP25]], align 2 +; CHECK-MAXBW-NEXT: [[TMP42:%.*]] = load i16, ptr [[TMP26]], align 2 +; CHECK-MAXBW-NEXT: [[TMP43:%.*]] = load i16, ptr [[TMP27]], align 2 +; CHECK-MAXBW-NEXT: [[TMP44:%.*]] = load i16, ptr [[TMP28]], align 2 +; CHECK-MAXBW-NEXT: [[TMP45:%.*]] = load i16, ptr [[TMP29]], align 2 +; CHECK-MAXBW-NEXT: [[TMP46:%.*]] = load i16, ptr [[TMP30]], align 2 +; CHECK-MAXBW-NEXT: [[TMP47:%.*]] = load i16, ptr [[TMP31]], align 2 +; CHECK-MAXBW-NEXT: [[TMP48:%.*]] = load i16, ptr [[TMP32]], align 2 +; CHECK-MAXBW-NEXT: [[TMP49:%.*]] = load i16, ptr [[TMP33]], align 2 +; CHECK-MAXBW-NEXT: [[TMP50:%.*]] = load i16, ptr [[TMP34]], align 2 +; CHECK-MAXBW-NEXT: [[TMP51:%.*]] = insertelement <16 x i16> poison, i16 [[TMP35]], i32 0 +; CHECK-MAXBW-NEXT: [[TMP52:%.*]] = insertelement <16 x i16> [[TMP51]], i16 [[TMP36]], i32 1 +; CHECK-MAXBW-NEXT: [[TMP53:%.*]] = insertelement <16 x i16> [[TMP52]], i16 [[TMP37]], i32 2 +; CHECK-MAXBW-NEXT: [[TMP54:%.*]] = insertelement <16 x i16> [[TMP53]], i16 [[TMP38]], i32 3 +; CHECK-MAXBW-NEXT: [[TMP55:%.*]] = insertelement <16 x i16> [[TMP54]], i16 [[TMP39]], i32 4 +; CHECK-MAXBW-NEXT: [[TMP56:%.*]] = insertelement <16 x i16> [[TMP55]], i16 [[TMP40]], i32 5 +; CHECK-MAXBW-NEXT: [[TMP57:%.*]] = insertelement <16 x i16> [[TMP56]], i16 [[TMP41]], i32 6 +; CHECK-MAXBW-NEXT: [[TMP58:%.*]] = insertelement <16 x i16> [[TMP57]], i16 [[TMP42]], i32 7 +; CHECK-MAXBW-NEXT: [[TMP59:%.*]] = insertelement <16 x i16> [[TMP58]], i16 [[TMP43]], i32 8 +; CHECK-MAXBW-NEXT: [[TMP60:%.*]] = insertelement <16 x i16> [[TMP59]], i16 [[TMP44]], i32 9 +; CHECK-MAXBW-NEXT: [[TMP61:%.*]] = insertelement <16 x i16> [[TMP60]], i16 [[TMP45]], i32 10 +; CHECK-MAXBW-NEXT: [[TMP62:%.*]] = insertelement <16 x i16> [[TMP61]], i16 [[TMP46]], i32 11 +; CHECK-MAXBW-NEXT: [[TMP63:%.*]] = insertelement <16 x i16> [[TMP62]], i16 [[TMP47]], i32 12 +; CHECK-MAXBW-NEXT: [[TMP64:%.*]] = insertelement <16 x i16> [[TMP63]], i16 [[TMP48]], i32 13 +; CHECK-MAXBW-NEXT: [[TMP65:%.*]] = insertelement <16 x i16> [[TMP64]], i16 [[TMP49]], i32 14 +; CHECK-MAXBW-NEXT: [[TMP66:%.*]] = insertelement <16 x i16> [[TMP65]], i16 [[TMP50]], i32 15 +; CHECK-MAXBW-NEXT: [[TMP67:%.*]] = zext <16 x i16> [[TMP66]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP68:%.*]] = mul <16 x i32> [[TMP67]], [[TMP18]] +; CHECK-MAXBW-NEXT: [[TMP69]] = add <16 x i32> [[TMP68]], [[VEC_PHI]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-MAXBW-NEXT: [[TMP70:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-MAXBW-NEXT: br i1 [[TMP70]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = zext i8 %load.a to i32 + %gep.b = getelementptr i8, ptr %b, i64 %iv + %load.b = load i16, ptr %gep.b, align 2 + %ext.b = zext i16 %load.b to i32 + %mul = mul i32 %ext.b, %ext.a + %add = add i32 %mul, %accum + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 0 + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + ret i32 %add +} + +define i32 
@not_dotp_not_loop_carried(ptr %a, ptr %b) {
+; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_not_loop_carried(
+; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVE1-NEXT: entry:
+; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVE1: vector.ph:
+; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: vector.body:
+; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP7]] = mul <16 x i32> [[TMP6]], [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP7]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
+; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = add <16 x i32> [[TMP7]], [[TMP8]]
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+;
+; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_not_loop_carried(
+; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVED-NEXT: entry:
+; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED: vector.body:
+; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP7]] = mul <16 x i32> [[TMP6]], [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP7]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add <16 x i32> [[TMP7]], [[TMP8]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+;
+; CHECK-MAXBW-LABEL: define i32 @not_dotp_not_loop_carried(
+; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-MAXBW-NEXT: entry:
+; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-MAXBW: vector.ph:
+; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-MAXBW: vector.body:
+; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP7]] = mul <16 x i32> [[TMP6]], [[TMP3]]
+; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP7]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
+; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = add <16 x i32> [[TMP7]], [[TMP8]]
+; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; CHECK-MAXBW-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+;
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %accum = phi i32 [ 0, %entry ], [ %mul, %for.body ]
+ %gep.a = getelementptr i8, ptr %a, i64 %iv
+ %load.a = load i8, ptr %gep.a, align 1
+ %ext.a = zext i8 %load.a to i32
+ %gep.b = getelementptr i8, ptr %b, i64 %iv
+ %load.b = load i8, ptr %gep.b, align 1
+ %ext.b = zext i8 %load.b to i32
+ %mul = mul i32 %ext.b, %ext.a
+ %add = add i32 %mul, %accum
+ %iv.next = add i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, 0
+ br i1 %exitcond.not, label %for.exit, label %for.body
+
+for.exit: ; preds = %for.body
+ ret i32 %add
+}
+
+define i32 @not_dotp_not_phi(ptr %a, ptr %b) {
+; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_not_phi(
+; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVE1-NEXT: entry:
+; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVE1: vector.ph:
+; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: vector.body:
+; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP8]] = add <16 x i32> [[TMP7]], [[TMP6]]
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+;
+; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_not_phi(
+; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVED-NEXT: entry:
+; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED: vector.body:
+; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[TMP8]] = add <16 x i32> [[TMP7]], [[TMP6]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+;
+; CHECK-MAXBW-LABEL: define i32 @not_dotp_not_phi(
+; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-MAXBW-NEXT: entry:
+; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-MAXBW: vector.ph:
+; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-MAXBW: vector.body:
+; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = 
zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[TMP8]] = add <16 x i32> [[TMP7]], [[TMP6]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-MAXBW-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = zext i8 %load.a to i32 + %gep.b = getelementptr i8, ptr %b, i64 %iv + %load.b = load i8, ptr %gep.b, align 1 + %ext.b = zext i8 %load.b to i32 + %mul = mul i32 %ext.b, %ext.a + %add = add i32 %mul, %ext.b + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 0 + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + ret i32 %add +} + +define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) { +; CHECK-INTERLEAVE1-LABEL: define i32 @dotp_unrolled( +; CHECK-INTERLEAVE1-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVE1-NEXT: entry: +; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], 16 +; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1: vector.ph: +; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], 16 +; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]] +; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVE1: vector.body: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE13:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE10:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE7:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = or disjoint i64 [[TMP0]], 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = or disjoint i64 [[TMP0]], 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]] +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[B]], 
i64 [[TMP6]] +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = or disjoint i64 [[TMP0]], 3 +; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]] +; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]] +; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP12]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP14]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP15]], [[TMP13]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP16]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP19]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD6]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = mul nsw <16 x i32> [[TMP18]], [[TMP20]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP21]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP22]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP24]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP25]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP26]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD11:%.*]] = load <16 x i8>, ptr [[TMP27]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP29]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP30:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP28]], [[TMP30]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP31]]) +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVE1-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; 
CHECK-INTERLEAVE1-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-INTERLEAVE1: middle.block: +; CHECK-INTERLEAVE1-NEXT: [[TMP33:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE13]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP34:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE10]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE7]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) +; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; +; CHECK-INTERLEAVED-LABEL: define i32 @dotp_unrolled( +; CHECK-INTERLEAVED-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVED-NEXT: entry: +; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], 16 +; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED: vector.ph: +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], 16 +; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]] +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED: vector.body: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE13:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE10:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE7:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = or disjoint i64 [[TMP0]], 1 +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = or disjoint i64 [[TMP0]], 2 +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]] +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]] +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = or disjoint i64 [[TMP0]], 3 +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]] +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]] +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP12]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr 
[[TMP14]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP15]], [[TMP13]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP16]]) +; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP19]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD6]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = mul nsw <16 x i32> [[TMP18]], [[TMP20]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP21]]) +; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP22]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP24]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP25]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP26]]) +; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD11:%.*]] = load <16 x i8>, ptr [[TMP27]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP29]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP28]], [[TMP30]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP31]]) +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-INTERLEAVED: middle.block: +; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE13]]) +; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE10]]) +; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE7]]) +; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) +; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]] +; 
CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; +; CHECK-MAXBW-LABEL: define i32 @dotp_unrolled( +; CHECK-MAXBW-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-MAXBW-NEXT: entry: +; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], 16 +; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW: vector.ph: +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], 16 +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-MAXBW: vector.body: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE13:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE10:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE7:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]] +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]] +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = or disjoint i64 [[TMP0]], 1 +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]] +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]] +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = or disjoint i64 [[TMP0]], 2 +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]] +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]] +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = or disjoint i64 [[TMP0]], 3 +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]] +; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]] +; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP12]], align 1 +; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP14]], align 1 +; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP15]], [[TMP13]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP16]]) +; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1 +; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP19]], align 1 +; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD6]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = mul nsw <16 x i32> 
[[TMP18]], [[TMP20]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP21]]) +; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP22]], align 1 +; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP24]], align 1 +; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP25]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP26]]) +; CHECK-MAXBW-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD11:%.*]] = load <16 x i8>, ptr [[TMP27]], align 1 +; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP29]], align 1 +; CHECK-MAXBW-NEXT: [[TMP30:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP28]], [[TMP30]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP31]]) +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-MAXBW-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-MAXBW: middle.block: +; CHECK-MAXBW-NEXT: [[TMP33:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE13]]) +; CHECK-MAXBW-NEXT: [[TMP34:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE10]]) +; CHECK-MAXBW-NEXT: [[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE7]]) +; CHECK-MAXBW-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) +; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum3 = phi i32 [ 0, %entry ], [ %add.a3, %for.body ] + %accum2 = phi i32 [ 0, %entry ], [ %add.a2, %for.body ] + %accum1 = phi i32 [ 0, %entry ], [ %add.a1, %for.body ] + %accum0 = phi i32 [ 0, %entry ], [ %add.a0, %for.body ] + %gep.a0 = getelementptr inbounds i8, ptr %a, i64 %iv + %gep.b0 = getelementptr inbounds i8, ptr %b, i64 %iv + %offset.1 = or disjoint i64 %iv, 1 + %gep.a1 = getelementptr inbounds i8, ptr %a, i64 %offset.1 + %gep.b1 = getelementptr inbounds i8, ptr %b, i64 %offset.1 + %offset.2 = or disjoint i64 %iv, 2 + %gep.a2 = getelementptr inbounds i8, ptr %a, i64 %offset.2 + %gep.b2 = getelementptr inbounds i8, ptr %b, i64 %offset.2 + %offset.3 = or disjoint i64 %iv, 3 + %gep.a3 = getelementptr inbounds i8, ptr %a, i64 %offset.3 + %gep.b3 = getelementptr inbounds i8, ptr %b, i64 %offset.3 + %load.a0 = load i8, ptr %gep.a0, align 1 + %ext.a0 = sext i8 %load.a0 to i32 + %load.b0 
= load i8, ptr %gep.b0, align 1 + %ext.b0 = sext i8 %load.b0 to i32 + %mul.a0 = mul nsw i32 %ext.b0, %ext.a0 + %add.a0 = add nsw i32 %mul.a0, %accum0 + %load.a1 = load i8, ptr %gep.a1, align 1 + %ext.a1 = sext i8 %load.a1 to i32 + %load.b1 = load i8, ptr %gep.b1, align 1 + %ext.b1 = sext i8 %load.b1 to i32 + %mul.a1 = mul nsw i32 %ext.a1, %ext.b1 + %add.a1 = add nsw i32 %mul.a1, %accum1 + %load.a2 = load i8, ptr %gep.a2, align 1 + %ext.a2 = sext i8 %load.a2 to i32 + %load.b2 = load i8, ptr %gep.b2, align 1 + %ext.b2 = sext i8 %load.b2 to i32 + %mul.a2 = mul nsw i32 %ext.a2, %ext.b2 + %add.a2 = add nsw i32 %mul.a2, %accum2 + %load.a3 = load i8, ptr %gep.a3, align 1 + %ext.a3 = sext i8 %load.a3 to i32 + %load.b3 = load i8, ptr %gep.b3, align 1 + %ext.b3 = sext i8 %load.b3 to i32 + %mul.a3 = mul nsw i32 %ext.a3, %ext.b3 + %add.a3 = add nsw i32 %mul.a3, %accum3 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %num_in + br i1 %exitcond.not, label %exit, label %for.body + +exit: ; preds = %for.body + %result0 = add nsw i32 %add.a0, %add.a1 + %result1 = add nsw i32 %add.a2, %add.a3 + %result = add nsw i32 %result0, %result1 + ret i32 %result +} + +define i32 @not_dotp_predicated(i64 %N, ptr %a, ptr %b) { +; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_predicated( +; CHECK-INTERLEAVE1-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVE1-NEXT: entry: +; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16 +; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1: vector.ph: +; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16 +; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVE1: vector.body: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul nsw <16 x i32> [[TMP6]], [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP7]]) +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-INTERLEAVE1: middle.block: +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) +; 
CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; +; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_predicated( +; CHECK-INTERLEAVED-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVED-NEXT: entry: +; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 32 +; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED: vector.ph: +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 32 +; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED: vector.body: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = mul nsw <16 x i32> [[TMP9]], [[TMP4]] +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP10]], [[TMP5]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP12]]) +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-INTERLEAVED: middle.block: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]] +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; 
CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; +; CHECK-MAXBW-LABEL: define i32 @not_dotp_predicated( +; CHECK-MAXBW-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-MAXBW-NEXT: entry: +; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16 +; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW: vector.ph: +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16 +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-MAXBW: vector.body: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]] +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]] +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = mul nsw <16 x i32> [[TMP6]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP7]]) +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-MAXBW: middle.block: +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) +; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr inbounds i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = sext i8 %load.a to i32 + %gep.b = getelementptr inbounds i8, ptr %b, i64 %iv + %load.b = load i8, ptr %gep.b, align 1 + %ext.b = sext i8 %load.b to i32 + %mul = mul nsw i32 %ext.b, %ext.a + %add = add nsw i32 %mul, %accum + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %N + br i1 %exitcond.not, label %exit, label %for.body + +exit: ; preds = %for.body + ret i32 %add +} + +define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) { +; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_predicated_pragma( +; CHECK-INTERLEAVE1-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVE1-NEXT: entry: +; CHECK-INTERLEAVE1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1: vector.ph: +; CHECK-INTERLEAVE1-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 15 +; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 
[[N_RND_UP]], 16
+; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-INTERLEAVE1-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
+; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT]], <16 x i64> poison, <16 x i32> zeroinitializer
+; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: vector.body:
+; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE62:%.*]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE62]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP180:%.*]], [[PRED_LOAD_CONTINUE62]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5
+; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6
+; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7
+; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8
+; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9
+; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10
+; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11
+; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12
+; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13
+; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14
+; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15
+; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = icmp ule <16 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP17]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+;
+; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_predicated_pragma(
+; CHECK-INTERLEAVED-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVED-NEXT: entry:
+; CHECK-INTERLEAVED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 15
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16
+; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-INTERLEAVED-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
+; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT]], <16 x i64> poison, <16 x i32> zeroinitializer
+; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED: vector.body:
+; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE62:%.*]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE62]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP180:%.*]], [[PRED_LOAD_CONTINUE62]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = icmp ule <16 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP17]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+;
+; CHECK-MAXBW-LABEL: define i32 @not_dotp_predicated_pragma(
+; CHECK-MAXBW-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-MAXBW-NEXT: entry:
+; CHECK-MAXBW-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-MAXBW: vector.ph:
+; CHECK-MAXBW-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 15
+; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16
+; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-MAXBW-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
+; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT]], <16 x i64> poison, <16 x i32> zeroinitializer
+; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-MAXBW: vector.body:
+; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE62:%.*]] ]
+; CHECK-MAXBW-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE62]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP180:%.*]], [[PRED_LOAD_CONTINUE62]] ]
+; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
+; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3
+; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4
+; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5
+; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6
+; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7
+; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8
+; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9
+; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10
+; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11
+; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12
+; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13
+; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14
+; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = add i64 
[[INDEX]], 15 +; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = icmp ule <16 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0 +; CHECK-MAXBW-NEXT: br i1 [[TMP17]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr inbounds i8, ptr %b, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = sext i8 %load.a to i32 + %gep.a2 = getelementptr inbounds i8, ptr %a, i64 %iv + %load.b = load i8, ptr %gep.a2, align 1 + %ext.b = sext i8 %load.b to i32 + %mul = mul nsw i32 %ext.b, %ext.a + %add = add nsw i32 %mul, %accum + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %N + br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !7 + +exit: ; preds = %for.body + ret i32 %add +} + +define i32 @not_dotp_extend_user(ptr %a, ptr %b) { +; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_extend_user( +; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVE1-NEXT: entry: +; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1: vector.ph: +; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVE1: vector.body: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[TMP8]] = add <16 x i32> [[TMP7]], [[VEC_PHI]] +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK-INTERLEAVE1: middle.block: +; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP8]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = extractelement <16 x i32> [[TMP6]], i32 15 +; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVE1: scalar.ph: +; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVE1: for.body: +; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ 
[[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 0 +; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-INTERLEAVE1: for.exit: +; CHECK-INTERLEAVE1-NEXT: [[EXT_B_LCSSA:%.*]] = phi i32 [ [[EXT_B]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVE1-NEXT: [[RESULT:%.*]] = add i32 [[ADD_LCSSA]], [[EXT_B_LCSSA]] +; CHECK-INTERLEAVE1-NEXT: ret i32 [[RESULT]] +; +; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_extend_user( +; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVED-NEXT: entry: +; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED: vector.ph: +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED: vector.body: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP6]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = mul <16 x i32> [[TMP9]], [[TMP4]] +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = mul <16 x i32> [[TMP10]], [[TMP5]] +; 
CHECK-INTERLEAVED-NEXT: [[TMP13]] = add <16 x i32> [[TMP11]], [[VEC_PHI]] +; CHECK-INTERLEAVED-NEXT: [[TMP14]] = add <16 x i32> [[TMP12]], [[VEC_PHI1]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK-INTERLEAVED: middle.block: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP14]], [[TMP13]] +; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = extractelement <16 x i32> [[TMP10]], i32 15 +; CHECK-INTERLEAVED-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVED: scalar.ph: +; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP16]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVED: for.body: +; CHECK-INTERLEAVED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; CHECK-INTERLEAVED-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32 +; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-INTERLEAVED-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 0 +; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-INTERLEAVED: for.exit: +; CHECK-INTERLEAVED-NEXT: [[EXT_B_LCSSA:%.*]] = phi i32 [ [[EXT_B]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVED-NEXT: [[RESULT:%.*]] = add i32 [[ADD_LCSSA]], [[EXT_B_LCSSA]] +; CHECK-INTERLEAVED-NEXT: ret i32 [[RESULT]] +; +; CHECK-MAXBW-LABEL: define i32 @not_dotp_extend_user( +; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-MAXBW-NEXT: entry: +; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW: vector.ph: +; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-MAXBW: vector.body: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x 
i32> +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[TMP8]] = add <16 x i32> [[TMP7]], [[VEC_PHI]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-MAXBW-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK-MAXBW: middle.block: +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP8]]) +; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = extractelement <16 x i32> [[TMP6]], i32 15 +; CHECK-MAXBW-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-MAXBW: scalar.ph: +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-MAXBW: for.body: +; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; CHECK-MAXBW-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32 +; CHECK-MAXBW-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-MAXBW-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] +; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 0 +; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-MAXBW: for.exit: +; CHECK-MAXBW-NEXT: [[EXT_B_LCSSA:%.*]] = phi i32 [ [[EXT_B]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: [[RESULT:%.*]] = add i32 [[ADD_LCSSA]], [[EXT_B_LCSSA]] +; CHECK-MAXBW-NEXT: ret i32 [[RESULT]] +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = zext i8 %load.a to i32 + %gep.b = getelementptr i8, ptr %b, i64 %iv + %load.b = load i8, ptr %gep.b, align 1 + %ext.b = zext i8 %load.b to i32 + %mul = mul i32 %ext.b, %ext.a + %add = add i32 %mul, %accum + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 0 + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + %result = add i32 %add, %ext.b + ret i32 %result +} + +!7 = distinct !{!7, !8, !9, !10} +!8 = !{!"llvm.loop.mustprogress"} +!9 = !{!"llvm.loop.vectorize.predicate.enable", i1 true} +!10 = !{!"llvm.loop.vectorize.enable", i1 true} diff --git 
a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
new file mode 100644
index 0000000000000..af2a7b966f700
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
@@ -0,0 +1,2164 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVE1
+; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVED
+; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -vectorizer-maximize-bandwidth -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-MAXBW
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-none-unknown-elf"
+
+define i32 @dotp(ptr %a, ptr %b) #0 {
+; CHECK-INTERLEAVE1-LABEL: define i32 @dotp(
+; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-INTERLEAVE1-NEXT: entry:
+; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH1:%.*]]
+; CHECK-INTERLEAVE1: vector.ph:
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY1:%.*]]
+; CHECK-INTERLEAVE1: vector.body:
+; CHECK-INTERLEAVE1-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT1:%.*]], [[VECTOR_BODY1]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[TMP14:%.*]], [[VECTOR_BODY1]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = add i64 [[INDEX1]], 0
+; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP12]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP16]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP17]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP20]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP21]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = mul <vscale x 4 x i32> [[TMP18]], [[TMP9]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP14]] = add <vscale x 4 x i32> [[TMP13]], [[VEC_PHI]]
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT1]] = add nuw i64 [[INDEX1]], [[TMP5]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT1]], [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP15]], label [[VEC_EPILOG_ITER_CHECK:%.*]], label [[VECTOR_BODY1]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-INTERLEAVE1: middle.block:
+; CHECK-INTERLEAVE1-NEXT: [[TMP27:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP14]])
+; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 0, [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[VEC_EPILOG_PH]]
+; CHECK-INTERLEAVE1: scalar.ph:
+; CHECK-INTERLEAVE1-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH:%.*]] ]
+; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP27]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ]
+; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: for.body:
+; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[VEC_EPILOG_PH]] ], [ [[ADD:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
+; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32
+; CHECK-INTERLEAVE1-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
+; CHECK-INTERLEAVE1-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32
+; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]]
+; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]]
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-INTERLEAVE1: for.exit:
+; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[VECTOR_BODY]] ], [ [[TMP27]], [[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-INTERLEAVE1-NEXT: ret i32 [[ADD_LCSSA]]
+;
+; CHECK-INTERLEAVED-LABEL: define i32 @dotp(
+; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-INTERLEAVED-NEXT: entry:
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-INTERLEAVED-NEXT: br i1 true, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH1:%.*]]
+; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY1:%.*]]
+; CHECK-INTERLEAVED: vector.body:
+; CHECK-INTERLEAVED-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT1:%.*]], [[VECTOR_BODY1]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[TMP23:%.*]], [[VECTOR_BODY1]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[TMP24:%.*]], [[VECTOR_BODY1]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = add i64 [[INDEX1]], 0
+; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP17]]
+; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP20]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP14]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP20]], i64 [[TMP10]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP21]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP17]]
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP28]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = mul i64 [[TMP26]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP28]], i64 [[TMP27]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 4 x i8>, ptr [[TMP18]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD3]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD4]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = mul <vscale x 4 x i32> [[TMP19]], [[TMP12]]
+; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = mul <vscale x 4 x i32> [[TMP29]], [[TMP13]]
+; CHECK-INTERLEAVED-NEXT: [[TMP23]] = add <vscale x 4 x i32> [[TMP30]], [[VEC_PHI]]
+; CHECK-INTERLEAVED-NEXT: [[TMP24]] = add <vscale x 4 x i32> [[TMP22]], [[VEC_PHI1]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT1]] = add nuw i64 [[INDEX1]], [[TMP5]]
+; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT1]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP25]], label [[VEC_EPILOG_ITER_CHECK:%.*]], label [[VECTOR_BODY1]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-INTERLEAVED: middle.block:
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[TMP24]], [[TMP23]]
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 0, [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[VEC_EPILOG_PH]]
+; CHECK-INTERLEAVED: scalar.ph:
+; CHECK-INTERLEAVED-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH:%.*]] ]
+; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP16]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ]
+; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED: for.body:
+; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[VEC_EPILOG_PH]] ], [ [[ADD:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
+; CHECK-INTERLEAVED-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
+; CHECK-INTERLEAVED-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32
+; CHECK-INTERLEAVED-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
+; CHECK-INTERLEAVED-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1
+; CHECK-INTERLEAVED-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32
+; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]]
+; CHECK-INTERLEAVED-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-INTERLEAVED: for.exit:
+; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[VECTOR_BODY]] ], [ [[TMP16]], [[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-INTERLEAVED-NEXT: ret i32 [[ADD_LCSSA]]
+;
+; CHECK-MAXBW-LABEL: define i32 @dotp(
+; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-MAXBW-NEXT: entry:
+; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-MAXBW: vector.ph:
+; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
+; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-MAXBW: vector.body:
+; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP8]], align 1
+; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
+; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[TMP14]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x i8>, ptr [[TMP15]], align 1
+; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i32>
+; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = mul <vscale x 8 x i32> [[TMP20]], [[TMP13]]
+; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE5]] = call <vscale x 2 x i32> @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI1]], <vscale x 8 x i32> [[TMP22]])
+; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+;
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
+ %gep.a = getelementptr i8, ptr %a, i64 %iv
+ %load.a = load i8, ptr %gep.a, align 1
+ %ext.a = zext i8 %load.a to i32
+ %gep.b = getelementptr i8, ptr %b, i64 %iv
+ %load.b = load i8, ptr %gep.b, align 1
+ %ext.b = zext i8 %load.b to i32
+ %mul = mul i32 %ext.b, %ext.a
+ %add = add i32 %mul, %accum
+ %iv.next = add i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, 0
+ br i1 %exitcond.not, label %for.exit, label %for.body
+
+for.exit: ; preds = %for.body
+ ret i32 %add
+}
+
+define i32 @not_dotp_different_types(ptr %a, ptr %b) #0 {
+; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_different_types(
+; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVE1-NEXT: entry:
+; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVE1: vector.ph:
+; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: vector.body:
+; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP69:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+;
CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8 +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9 +; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10 +; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11 +; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12 +; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 +; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 +; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 +; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP16]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]] +; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]] +; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP4]] +; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP5]] +; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]] +; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP7]] +; CHECK-INTERLEAVE1-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP8]] +; CHECK-INTERLEAVE1-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]] +; CHECK-INTERLEAVE1-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP10]] +; CHECK-INTERLEAVE1-NEXT: [[TMP30:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP11]] +; CHECK-INTERLEAVE1-NEXT: [[TMP31:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]] +; CHECK-INTERLEAVE1-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP13]] +; CHECK-INTERLEAVE1-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP14]] +; CHECK-INTERLEAVE1-NEXT: [[TMP34:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP15]] +; CHECK-INTERLEAVE1-NEXT: [[TMP35:%.*]] = load i16, ptr [[TMP19]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP36:%.*]] = load i16, ptr [[TMP20]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP37:%.*]] = load i16, ptr [[TMP21]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP38:%.*]] = load i16, ptr [[TMP22]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP41:%.*]] = load i16, ptr [[TMP23]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP42:%.*]] = load i16, ptr [[TMP24]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP43:%.*]] = load i16, ptr [[TMP25]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP44:%.*]] = load i16, ptr [[TMP26]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP45:%.*]] = load i16, ptr [[TMP27]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP46:%.*]] = load i16, ptr [[TMP28]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP55:%.*]] = load i16, ptr [[TMP29]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP56:%.*]] = load i16, ptr 
[[TMP30]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP47:%.*]] = load i16, ptr [[TMP31]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP48:%.*]] = load i16, ptr [[TMP32]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP49:%.*]] = load i16, ptr [[TMP33]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP50:%.*]] = load i16, ptr [[TMP34]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP51:%.*]] = insertelement <16 x i16> poison, i16 [[TMP35]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP52:%.*]] = insertelement <16 x i16> [[TMP51]], i16 [[TMP36]], i32 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP53:%.*]] = insertelement <16 x i16> [[TMP52]], i16 [[TMP37]], i32 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP54:%.*]] = insertelement <16 x i16> [[TMP53]], i16 [[TMP38]], i32 3 +; CHECK-INTERLEAVE1-NEXT: [[TMP57:%.*]] = insertelement <16 x i16> [[TMP54]], i16 [[TMP41]], i32 4 +; CHECK-INTERLEAVE1-NEXT: [[TMP58:%.*]] = insertelement <16 x i16> [[TMP57]], i16 [[TMP42]], i32 5 +; CHECK-INTERLEAVE1-NEXT: [[TMP59:%.*]] = insertelement <16 x i16> [[TMP58]], i16 [[TMP43]], i32 6 +; CHECK-INTERLEAVE1-NEXT: [[TMP60:%.*]] = insertelement <16 x i16> [[TMP59]], i16 [[TMP44]], i32 7 +; CHECK-INTERLEAVE1-NEXT: [[TMP61:%.*]] = insertelement <16 x i16> [[TMP60]], i16 [[TMP45]], i32 8 +; CHECK-INTERLEAVE1-NEXT: [[TMP96:%.*]] = insertelement <16 x i16> [[TMP61]], i16 [[TMP46]], i32 9 +; CHECK-INTERLEAVE1-NEXT: [[TMP97:%.*]] = insertelement <16 x i16> [[TMP96]], i16 [[TMP55]], i32 10 +; CHECK-INTERLEAVE1-NEXT: [[TMP62:%.*]] = insertelement <16 x i16> [[TMP97]], i16 [[TMP56]], i32 11 +; CHECK-INTERLEAVE1-NEXT: [[TMP63:%.*]] = insertelement <16 x i16> [[TMP62]], i16 [[TMP47]], i32 12 +; CHECK-INTERLEAVE1-NEXT: [[TMP64:%.*]] = insertelement <16 x i16> [[TMP63]], i16 [[TMP48]], i32 13 +; CHECK-INTERLEAVE1-NEXT: [[TMP65:%.*]] = insertelement <16 x i16> [[TMP64]], i16 [[TMP49]], i32 14 +; CHECK-INTERLEAVE1-NEXT: [[TMP66:%.*]] = insertelement <16 x i16> [[TMP65]], i16 [[TMP50]], i32 15 +; CHECK-INTERLEAVE1-NEXT: [[TMP67:%.*]] = zext <16 x i16> [[TMP66]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP68:%.*]] = mul <16 x i32> [[TMP67]], [[TMP18]] +; CHECK-INTERLEAVE1-NEXT: [[TMP69]] = add <16 x i32> [[TMP68]], [[VEC_PHI]] +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVE1-NEXT: [[TMP70:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP70]], label [[VEC_EPILOG_ITER_CHECK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-INTERLEAVE1: middle.block: +; CHECK-INTERLEAVE1-NEXT: [[TMP71:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP69]]) +; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVE1: scalar.ph: +; CHECK-INTERLEAVE1-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 0, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] ] +; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP71]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVE1: for.body: +; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; 
CHECK-INTERLEAVE1-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_B:%.*]] = load i16, ptr [[GEP_B]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[EXT_B:%.*]] = zext i16 [[LOAD_B]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 0 +; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-INTERLEAVE1: for.exit: +; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP71]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-INTERLEAVE1-NEXT: ret i32 [[ADD_LCSSA]] +; +; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_different_types( +; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVED-NEXT: entry: +; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED: vector.ph: +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED: vector.body: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP137:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP138:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8 +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9 +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10 +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11 +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12 +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 +; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = add i64 [[INDEX]], 16 +; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 17 +; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = add i64 [[INDEX]], 18 +; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], 19 +; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = add i64 [[INDEX]], 20 +; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 21 +; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = add i64 [[INDEX]], 22 +; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = add i64 [[INDEX]], 23 +; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = add i64 [[INDEX]], 24 +; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = add i64 [[INDEX]], 25 +; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = add i64 [[INDEX]], 26 +; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = add i64 [[INDEX]], 27 +; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = add i64 [[INDEX]], 28 +; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = add i64 [[INDEX]], 29 +; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] 
= add i64 [[INDEX]], 30 +; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = add i64 [[INDEX]], 31 +; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr [[TMP32]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = getelementptr i8, ptr [[TMP32]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP33]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP34]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]] +; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]] +; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP4]] +; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP5]] +; CHECK-INTERLEAVED-NEXT: [[TMP45:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]] +; CHECK-INTERLEAVED-NEXT: [[TMP46:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP7]] +; CHECK-INTERLEAVED-NEXT: [[TMP47:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP8]] +; CHECK-INTERLEAVED-NEXT: [[TMP48:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]] +; CHECK-INTERLEAVED-NEXT: [[TMP49:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP10]] +; CHECK-INTERLEAVED-NEXT: [[TMP50:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP11]] +; CHECK-INTERLEAVED-NEXT: [[TMP51:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]] +; CHECK-INTERLEAVED-NEXT: [[TMP52:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP13]] +; CHECK-INTERLEAVED-NEXT: [[TMP53:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP14]] +; CHECK-INTERLEAVED-NEXT: [[TMP54:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP15]] +; CHECK-INTERLEAVED-NEXT: [[TMP55:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP16]] +; CHECK-INTERLEAVED-NEXT: [[TMP56:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP17]] +; CHECK-INTERLEAVED-NEXT: [[TMP57:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP18]] +; CHECK-INTERLEAVED-NEXT: [[TMP58:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP19]] +; CHECK-INTERLEAVED-NEXT: [[TMP59:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP20]] +; CHECK-INTERLEAVED-NEXT: [[TMP60:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP21]] +; CHECK-INTERLEAVED-NEXT: [[TMP61:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP22]] +; CHECK-INTERLEAVED-NEXT: [[TMP62:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP23]] +; CHECK-INTERLEAVED-NEXT: [[TMP63:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP24]] +; CHECK-INTERLEAVED-NEXT: [[TMP64:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP25]] +; CHECK-INTERLEAVED-NEXT: [[TMP65:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP26]] +; CHECK-INTERLEAVED-NEXT: [[TMP66:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP27]] +; CHECK-INTERLEAVED-NEXT: [[TMP67:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP28]] +; CHECK-INTERLEAVED-NEXT: [[TMP68:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP29]] +; CHECK-INTERLEAVED-NEXT: [[TMP139:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP30]] +; CHECK-INTERLEAVED-NEXT: [[TMP140:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP31]] +; CHECK-INTERLEAVED-NEXT: [[TMP69:%.*]] = load i16, ptr [[TMP39]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP70:%.*]] = load 
i16, ptr [[TMP40]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP71:%.*]] = load i16, ptr [[TMP41]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP72:%.*]] = load i16, ptr [[TMP42]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP73:%.*]] = load i16, ptr [[TMP43]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP74:%.*]] = load i16, ptr [[TMP44]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP75:%.*]] = load i16, ptr [[TMP45]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP76:%.*]] = load i16, ptr [[TMP46]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP77:%.*]] = load i16, ptr [[TMP47]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP78:%.*]] = load i16, ptr [[TMP48]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP79:%.*]] = load i16, ptr [[TMP49]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP80:%.*]] = load i16, ptr [[TMP50]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP81:%.*]] = load i16, ptr [[TMP51]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP82:%.*]] = load i16, ptr [[TMP52]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP83:%.*]] = load i16, ptr [[TMP53]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP84:%.*]] = load i16, ptr [[TMP54]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP85:%.*]] = insertelement <16 x i16> poison, i16 [[TMP69]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP86:%.*]] = insertelement <16 x i16> [[TMP85]], i16 [[TMP70]], i32 1 +; CHECK-INTERLEAVED-NEXT: [[TMP87:%.*]] = insertelement <16 x i16> [[TMP86]], i16 [[TMP71]], i32 2 +; CHECK-INTERLEAVED-NEXT: [[TMP88:%.*]] = insertelement <16 x i16> [[TMP87]], i16 [[TMP72]], i32 3 +; CHECK-INTERLEAVED-NEXT: [[TMP89:%.*]] = insertelement <16 x i16> [[TMP88]], i16 [[TMP73]], i32 4 +; CHECK-INTERLEAVED-NEXT: [[TMP90:%.*]] = insertelement <16 x i16> [[TMP89]], i16 [[TMP74]], i32 5 +; CHECK-INTERLEAVED-NEXT: [[TMP91:%.*]] = insertelement <16 x i16> [[TMP90]], i16 [[TMP75]], i32 6 +; CHECK-INTERLEAVED-NEXT: [[TMP92:%.*]] = insertelement <16 x i16> [[TMP91]], i16 [[TMP76]], i32 7 +; CHECK-INTERLEAVED-NEXT: [[TMP93:%.*]] = insertelement <16 x i16> [[TMP92]], i16 [[TMP77]], i32 8 +; CHECK-INTERLEAVED-NEXT: [[TMP94:%.*]] = insertelement <16 x i16> [[TMP93]], i16 [[TMP78]], i32 9 +; CHECK-INTERLEAVED-NEXT: [[TMP95:%.*]] = insertelement <16 x i16> [[TMP94]], i16 [[TMP79]], i32 10 +; CHECK-INTERLEAVED-NEXT: [[TMP96:%.*]] = insertelement <16 x i16> [[TMP95]], i16 [[TMP80]], i32 11 +; CHECK-INTERLEAVED-NEXT: [[TMP97:%.*]] = insertelement <16 x i16> [[TMP96]], i16 [[TMP81]], i32 12 +; CHECK-INTERLEAVED-NEXT: [[TMP98:%.*]] = insertelement <16 x i16> [[TMP97]], i16 [[TMP82]], i32 13 +; CHECK-INTERLEAVED-NEXT: [[TMP99:%.*]] = insertelement <16 x i16> [[TMP98]], i16 [[TMP83]], i32 14 +; CHECK-INTERLEAVED-NEXT: [[TMP100:%.*]] = insertelement <16 x i16> [[TMP99]], i16 [[TMP84]], i32 15 +; CHECK-INTERLEAVED-NEXT: [[TMP101:%.*]] = load i16, ptr [[TMP55]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP102:%.*]] = load i16, ptr [[TMP56]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP103:%.*]] = load i16, ptr [[TMP57]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP104:%.*]] = load i16, ptr [[TMP58]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP105:%.*]] = load i16, ptr [[TMP59]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP106:%.*]] = load i16, ptr [[TMP60]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP107:%.*]] = load i16, ptr [[TMP61]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP108:%.*]] = load i16, ptr [[TMP62]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP109:%.*]] = load i16, ptr [[TMP63]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP110:%.*]] = load i16, ptr [[TMP64]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP111:%.*]] = load i16, ptr [[TMP65]], align 2 +; 
CHECK-INTERLEAVED-NEXT: [[TMP112:%.*]] = load i16, ptr [[TMP66]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP113:%.*]] = load i16, ptr [[TMP67]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP114:%.*]] = load i16, ptr [[TMP68]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP115:%.*]] = load i16, ptr [[TMP139]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP116:%.*]] = load i16, ptr [[TMP140]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP117:%.*]] = insertelement <16 x i16> poison, i16 [[TMP101]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP118:%.*]] = insertelement <16 x i16> [[TMP117]], i16 [[TMP102]], i32 1 +; CHECK-INTERLEAVED-NEXT: [[TMP119:%.*]] = insertelement <16 x i16> [[TMP118]], i16 [[TMP103]], i32 2 +; CHECK-INTERLEAVED-NEXT: [[TMP120:%.*]] = insertelement <16 x i16> [[TMP119]], i16 [[TMP104]], i32 3 +; CHECK-INTERLEAVED-NEXT: [[TMP121:%.*]] = insertelement <16 x i16> [[TMP120]], i16 [[TMP105]], i32 4 +; CHECK-INTERLEAVED-NEXT: [[TMP122:%.*]] = insertelement <16 x i16> [[TMP121]], i16 [[TMP106]], i32 5 +; CHECK-INTERLEAVED-NEXT: [[TMP123:%.*]] = insertelement <16 x i16> [[TMP122]], i16 [[TMP107]], i32 6 +; CHECK-INTERLEAVED-NEXT: [[TMP124:%.*]] = insertelement <16 x i16> [[TMP123]], i16 [[TMP108]], i32 7 +; CHECK-INTERLEAVED-NEXT: [[TMP125:%.*]] = insertelement <16 x i16> [[TMP124]], i16 [[TMP109]], i32 8 +; CHECK-INTERLEAVED-NEXT: [[TMP126:%.*]] = insertelement <16 x i16> [[TMP125]], i16 [[TMP110]], i32 9 +; CHECK-INTERLEAVED-NEXT: [[TMP127:%.*]] = insertelement <16 x i16> [[TMP126]], i16 [[TMP111]], i32 10 +; CHECK-INTERLEAVED-NEXT: [[TMP128:%.*]] = insertelement <16 x i16> [[TMP127]], i16 [[TMP112]], i32 11 +; CHECK-INTERLEAVED-NEXT: [[TMP129:%.*]] = insertelement <16 x i16> [[TMP128]], i16 [[TMP113]], i32 12 +; CHECK-INTERLEAVED-NEXT: [[TMP130:%.*]] = insertelement <16 x i16> [[TMP129]], i16 [[TMP114]], i32 13 +; CHECK-INTERLEAVED-NEXT: [[TMP131:%.*]] = insertelement <16 x i16> [[TMP130]], i16 [[TMP115]], i32 14 +; CHECK-INTERLEAVED-NEXT: [[TMP132:%.*]] = insertelement <16 x i16> [[TMP131]], i16 [[TMP116]], i32 15 +; CHECK-INTERLEAVED-NEXT: [[TMP133:%.*]] = zext <16 x i16> [[TMP100]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP134:%.*]] = zext <16 x i16> [[TMP132]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP135:%.*]] = mul <16 x i32> [[TMP133]], [[TMP35]] +; CHECK-INTERLEAVED-NEXT: [[TMP136:%.*]] = mul <16 x i32> [[TMP134]], [[TMP36]] +; CHECK-INTERLEAVED-NEXT: [[TMP137]] = add <16 x i32> [[TMP135]], [[VEC_PHI]] +; CHECK-INTERLEAVED-NEXT: [[TMP138]] = add <16 x i32> [[TMP136]], [[VEC_PHI1]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-INTERLEAVED-NEXT: [[TMP141:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP141]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-INTERLEAVED: middle.block: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP138]], [[TMP137]] +; CHECK-INTERLEAVED-NEXT: [[TMP142:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; +; CHECK-MAXBW-LABEL: define i32 @not_dotp_different_types( +; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-MAXBW-NEXT: entry: +; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW: vector.ph: +; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-MAXBW: vector.body: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] 
+; CHECK-MAXBW-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP138:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8 +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9 +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10 +; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11 +; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12 +; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 +; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 +; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 +; CHECK-MAXBW-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-MAXBW-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr [[TMP32]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP33]], align 1 +; CHECK-MAXBW-NEXT: [[TMP36:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP37:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-MAXBW-NEXT: [[TMP38:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]] +; CHECK-MAXBW-NEXT: [[TMP39:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]] +; CHECK-MAXBW-NEXT: [[TMP40:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP3]] +; CHECK-MAXBW-NEXT: [[TMP41:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP4]] +; CHECK-MAXBW-NEXT: [[TMP42:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP5]] +; CHECK-MAXBW-NEXT: [[TMP43:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]] +; CHECK-MAXBW-NEXT: [[TMP44:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP7]] +; CHECK-MAXBW-NEXT: [[TMP45:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP8]] +; CHECK-MAXBW-NEXT: [[TMP46:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]] +; CHECK-MAXBW-NEXT: [[TMP47:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP10]] +; CHECK-MAXBW-NEXT: [[TMP48:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP11]] +; CHECK-MAXBW-NEXT: [[TMP49:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP12]] +; CHECK-MAXBW-NEXT: [[TMP50:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP13]] +; CHECK-MAXBW-NEXT: [[TMP51:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP14]] +; CHECK-MAXBW-NEXT: [[TMP52:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP15]] +; CHECK-MAXBW-NEXT: [[TMP101:%.*]] = load i16, ptr [[TMP37]], align 2 +; CHECK-MAXBW-NEXT: [[TMP102:%.*]] = load i16, ptr [[TMP38]], align 2 +; CHECK-MAXBW-NEXT: [[TMP103:%.*]] = load i16, ptr [[TMP39]], align 2 +; CHECK-MAXBW-NEXT: [[TMP104:%.*]] = load i16, ptr [[TMP40]], align 2 +; CHECK-MAXBW-NEXT: [[TMP105:%.*]] = load i16, ptr [[TMP41]], align 2 +; CHECK-MAXBW-NEXT: [[TMP106:%.*]] = load i16, ptr [[TMP42]], align 2 +; CHECK-MAXBW-NEXT: [[TMP107:%.*]] = load i16, ptr [[TMP43]], align 2 +; CHECK-MAXBW-NEXT: [[TMP108:%.*]] = load i16, ptr [[TMP44]], align 2 +; CHECK-MAXBW-NEXT: [[TMP109:%.*]] = load i16, ptr [[TMP45]], align 2 +; CHECK-MAXBW-NEXT: [[TMP110:%.*]] = load i16, ptr [[TMP46]], align 2 +; CHECK-MAXBW-NEXT: [[TMP111:%.*]] = load i16, ptr [[TMP47]], align 2 +; CHECK-MAXBW-NEXT: [[TMP112:%.*]] = load i16, ptr [[TMP48]], align 2 +; CHECK-MAXBW-NEXT: [[TMP113:%.*]] = load i16, ptr [[TMP49]], 
align 2 +; CHECK-MAXBW-NEXT: [[TMP114:%.*]] = load i16, ptr [[TMP50]], align 2 +; CHECK-MAXBW-NEXT: [[TMP115:%.*]] = load i16, ptr [[TMP51]], align 2 +; CHECK-MAXBW-NEXT: [[TMP116:%.*]] = load i16, ptr [[TMP52]], align 2 +; CHECK-MAXBW-NEXT: [[TMP117:%.*]] = insertelement <16 x i16> poison, i16 [[TMP101]], i32 0 +; CHECK-MAXBW-NEXT: [[TMP118:%.*]] = insertelement <16 x i16> [[TMP117]], i16 [[TMP102]], i32 1 +; CHECK-MAXBW-NEXT: [[TMP119:%.*]] = insertelement <16 x i16> [[TMP118]], i16 [[TMP103]], i32 2 +; CHECK-MAXBW-NEXT: [[TMP120:%.*]] = insertelement <16 x i16> [[TMP119]], i16 [[TMP104]], i32 3 +; CHECK-MAXBW-NEXT: [[TMP121:%.*]] = insertelement <16 x i16> [[TMP120]], i16 [[TMP105]], i32 4 +; CHECK-MAXBW-NEXT: [[TMP122:%.*]] = insertelement <16 x i16> [[TMP121]], i16 [[TMP106]], i32 5 +; CHECK-MAXBW-NEXT: [[TMP123:%.*]] = insertelement <16 x i16> [[TMP122]], i16 [[TMP107]], i32 6 +; CHECK-MAXBW-NEXT: [[TMP124:%.*]] = insertelement <16 x i16> [[TMP123]], i16 [[TMP108]], i32 7 +; CHECK-MAXBW-NEXT: [[TMP125:%.*]] = insertelement <16 x i16> [[TMP124]], i16 [[TMP109]], i32 8 +; CHECK-MAXBW-NEXT: [[TMP126:%.*]] = insertelement <16 x i16> [[TMP125]], i16 [[TMP110]], i32 9 +; CHECK-MAXBW-NEXT: [[TMP127:%.*]] = insertelement <16 x i16> [[TMP126]], i16 [[TMP111]], i32 10 +; CHECK-MAXBW-NEXT: [[TMP128:%.*]] = insertelement <16 x i16> [[TMP127]], i16 [[TMP112]], i32 11 +; CHECK-MAXBW-NEXT: [[TMP129:%.*]] = insertelement <16 x i16> [[TMP128]], i16 [[TMP113]], i32 12 +; CHECK-MAXBW-NEXT: [[TMP130:%.*]] = insertelement <16 x i16> [[TMP129]], i16 [[TMP114]], i32 13 +; CHECK-MAXBW-NEXT: [[TMP131:%.*]] = insertelement <16 x i16> [[TMP130]], i16 [[TMP115]], i32 14 +; CHECK-MAXBW-NEXT: [[TMP132:%.*]] = insertelement <16 x i16> [[TMP131]], i16 [[TMP116]], i32 15 +; CHECK-MAXBW-NEXT: [[TMP134:%.*]] = zext <16 x i16> [[TMP132]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP136:%.*]] = mul <16 x i32> [[TMP134]], [[TMP36]] +; CHECK-MAXBW-NEXT: [[TMP138]] = add <16 x i32> [[TMP136]], [[VEC_PHI1]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-MAXBW-NEXT: [[TMP139:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-MAXBW-NEXT: br i1 [[TMP139]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = zext i8 %load.a to i32 + %gep.b = getelementptr i8, ptr %b, i64 %iv + %load.b = load i16, ptr %gep.b, align 2 + %ext.b = zext i16 %load.b to i32 + %mul = mul i32 %ext.b, %ext.a + %add = add i32 %mul, %accum + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 0 + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + ret i32 %add +} + +define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 { +; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_not_loop_carried( +; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVE1-NEXT: entry: +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 +; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1: vector.ph: +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 +; 
CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]] +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 8 +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1 +; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement poison, i32 0, i32 [[TMP8]] +; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVE1: vector.body: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR:%.*]] = phi [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]] +; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP11]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = zext [[WIDE_LOAD]] to +; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]] +; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP13]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP14]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = zext [[WIDE_LOAD1]] to +; CHECK-INTERLEAVE1-NEXT: [[TMP16]] = mul [[TMP15]], [[TMP12]] +; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = call @llvm.vector.splice.nxv8i32( [[VECTOR_RECUR]], [[TMP16]], i32 -1) +; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = add [[TMP16]], [[TMP17]] +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; +; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_not_loop_carried( +; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVED-NEXT: entry: +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16 +; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED: vector.ph: +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16 +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]] +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16 +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 8 +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1 +; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement poison, i32 0, i32 [[TMP8]] +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED: vector.body: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR:%.*]] = phi [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ] 
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]]
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 8
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP10]], i64 [[TMP13]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP11]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP14]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = zext [[WIDE_LOAD]] to
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = zext [[WIDE_LOAD1]] to
+; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP17]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 8
+; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP17]], i64 [[TMP20]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP18]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP21]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = zext [[WIDE_LOAD2]] to
+; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = zext [[WIDE_LOAD3]] to
+; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = mul [[TMP22]], [[TMP15]]
+; CHECK-INTERLEAVED-NEXT: [[TMP25]] = mul [[TMP23]], [[TMP16]]
+; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = call @llvm.vector.splice.nxv8i32( [[TMP24]], [[TMP25]], i32 -1)
+; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = add [[TMP25]], [[TMP26]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+;
+; CHECK-MAXBW-LABEL: define i32 @not_dotp_not_loop_carried(
+; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-MAXBW-NEXT: entry:
+; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-MAXBW: vector.ph:
+; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
+; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 8
+; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1
+; CHECK-MAXBW-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement poison, i32 0, i32 [[TMP8]]
+; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-MAXBW: vector.body:
+; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VECTOR_RECUR:%.*]] = phi [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0
+; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]]
+; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP11]], align 1
+; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = zext [[WIDE_LOAD1]] to
+; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP17]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP18]], align 1
+; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = zext [[WIDE_LOAD3]] to
+; CHECK-MAXBW-NEXT: [[TMP25]] = mul [[TMP23]], [[TMP16]]
+; CHECK-MAXBW-NEXT: [[TMP26:%.*]] = call @llvm.vector.splice.nxv8i32( [[VECTOR_RECUR]], [[TMP25]], i32 -1)
+; CHECK-MAXBW-NEXT: [[TMP27:%.*]] = add [[TMP25]], [[TMP26]]
+; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+;
+entry:
+  br label %for.body
+
+for.body: ; preds = %for.body, %entry
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %accum = phi i32 [ 0, %entry ], [ %mul, %for.body ]
+  %gep.a = getelementptr i8, ptr %a, i64 %iv
+  %load.a = load i8, ptr %gep.a, align 1
+  %ext.a = zext i8 %load.a to i32
+  %gep.b = getelementptr i8, ptr %b, i64 %iv
+  %load.b = load i8, ptr %gep.b, align 1
+  %ext.b = zext i8 %load.b to i32
+  %mul = mul i32 %ext.b, %ext.a
+  %add = add i32 %mul, %accum
+  %iv.next = add i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 0
+  br i1 %exitcond.not, label %for.exit, label %for.body
+
+for.exit: ; preds = %for.body
+  ret i32 %add
+}
+
+define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 {
+; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_not_phi(
+; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVE1-NEXT: entry:
+; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVE1: vector.ph:
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 8
+; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1
+; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement poison, i32 0, i32 [[TMP8]]
+; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: vector.body:
+; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR:%.*]] = phi [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP11]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = zext [[WIDE_LOAD]] to
+; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP13]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP14]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = zext [[WIDE_LOAD1]] to
+; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = mul [[TMP15]], [[TMP12]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP17]] = add [[TMP16]], [[TMP15]]
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+;
+; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_not_phi(
+; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVED-NEXT: entry:
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 8
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1
+; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement poison, i32 0, i32 [[TMP8]]
+; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED: vector.body:
+; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR:%.*]] = phi [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]]
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 8
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP10]], i64 [[TMP13]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP14]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = zext [[WIDE_LOAD]] to
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 8
+; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[TMP16]], i64 [[TMP19]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP20]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = zext [[WIDE_LOAD1]] to
+; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = mul [[TMP22]], [[TMP15]]
+; CHECK-INTERLEAVED-NEXT: [[TMP21]] = add [[TMP30]], [[TMP22]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+;
+; CHECK-MAXBW-LABEL: define i32 @not_dotp_not_phi(
+; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-MAXBW-NEXT: entry:
+; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-MAXBW: vector.ph:
+; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
+; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 8
+; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1
+; CHECK-MAXBW-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement poison, i32 0, i32 [[TMP8]]
+; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-MAXBW: vector.body:
+; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VECTOR_RECUR:%.*]] = phi [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0
+; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP9]]
+; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP13]], align 1
+; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = zext [[WIDE_LOAD]] to
+; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP15]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP18]], align 1
+; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = zext [[WIDE_LOAD1]] to
+; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = mul [[TMP19]], [[TMP14]]
+; CHECK-MAXBW-NEXT: [[TMP21]] = add [[TMP20]], [[TMP19]]
+; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+;
+entry:
+  br label %for.body
+
+for.body: ; preds = %for.body, %entry
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %gep.a = getelementptr i8, ptr %a, i64 %iv
+  %load.a = load i8, ptr %gep.a, align 1
+  %ext.a = zext i8 %load.a to i32
+  %gep.b = getelementptr i8, ptr %b, i64 %iv
+  %load.b = load i8, ptr %gep.b, align 1
+  %ext.b = zext i8 %load.b to i32
+  %mul = mul i32 %ext.b, %ext.a
+  %add = add i32 %mul, %ext.b
+  %iv.next = add i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 0
+  br i1 %exitcond.not, label %for.exit, label %for.body
+
+for.exit: ; preds = %for.body
+  ret i32 %add
+}
+
+define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
+; CHECK-INTERLEAVE1-LABEL: define i32 @dotp_unrolled(
+; CHECK-INTERLEAVE1-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVE1-NEXT: entry:
+; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = mul i64 [[TMP13]], 4
+; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], [[TMP15]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVE1: vector.ph:
+; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], 4
+; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], [[TMP18]]
+; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = mul i64 [[TMP20]], 4
+; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: vector.body:
+; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP41:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP35:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI2:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP30:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI3:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = or disjoint i64 [[TMP0]], 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = or disjoint i64 [[TMP0]], 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = or disjoint i64 [[TMP0]], 3
+; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP12]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP36:%.*]] = sext [[WIDE_LOAD]] to
+; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP14]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = sext [[WIDE_LOAD4]] to
+; CHECK-INTERLEAVE1-NEXT: [[TMP38:%.*]] = mul nsw [[TMP21]], [[TMP36]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP23]] = add [[TMP38]], [[VEC_PHI3]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD5:%.*]] = load , ptr [[TMP17]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = sext [[WIDE_LOAD5]] to
+; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD6:%.*]] = load , ptr [[TMP19]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP42:%.*]] = sext [[WIDE_LOAD6]] to
+; CHECK-INTERLEAVE1-NEXT: [[TMP28:%.*]] = mul nsw [[TMP25]], [[TMP42]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP30]] = add [[TMP28]], [[VEC_PHI2]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD7:%.*]] = load , ptr [[TMP22]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP31:%.*]] = sext [[WIDE_LOAD7]] to
+; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD8:%.*]] = load , ptr [[TMP24]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP33:%.*]] = sext [[WIDE_LOAD8]] to
+; CHECK-INTERLEAVE1-NEXT: [[TMP34:%.*]] = mul nsw [[TMP31]], [[TMP33]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP35]] = add [[TMP34]], [[VEC_PHI1]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD9:%.*]] = load , ptr [[TMP27]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP37:%.*]] = sext [[WIDE_LOAD9]] to
+; CHECK-INTERLEAVE1-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD10:%.*]] = load , ptr [[TMP29]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP39:%.*]] = sext [[WIDE_LOAD10]] to
+; CHECK-INTERLEAVE1-NEXT: [[TMP40:%.*]] = mul nsw [[TMP37]], [[TMP39]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP41]] = add [[TMP40]], [[VEC_PHI]]
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP26]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+;
+; CHECK-INTERLEAVED-LABEL: define i32 @dotp_unrolled(
+; CHECK-INTERLEAVED-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVED-NEXT: entry:
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = mul i64 [[TMP13]], 8
+; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], [[TMP15]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], 8
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], [[TMP18]]
+; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]]
+; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = mul i64 [[TMP34]], 8
+; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED: vector.body:
+; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP80:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP81:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP64:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP65:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI4:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP48:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI5:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP49:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI6:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP50:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI7:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP33:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = or disjoint i64 [[TMP0]], 1
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = or disjoint i64 [[TMP0]], 2
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = or disjoint i64 [[TMP0]], 3
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP56:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = mul i64 [[TMP56]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[TMP20]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP12]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD8:%.*]] = load , ptr [[TMP21]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP66:%.*]] = sext [[WIDE_LOAD]] to
+; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = sext [[WIDE_LOAD8]] to
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = mul i64 [[TMP25]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP72:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 [[TMP26]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD9:%.*]] = load , ptr [[TMP14]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD10:%.*]] = load , ptr [[TMP72]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = sext [[WIDE_LOAD9]] to
+; CHECK-INTERLEAVED-NEXT: [[TMP82:%.*]] = sext [[WIDE_LOAD10]] to
+; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = mul nsw [[TMP28]], [[TMP66]]
+; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = mul nsw [[TMP82]], [[TMP23]]
+; CHECK-INTERLEAVED-NEXT: [[TMP50]] = add [[TMP30]], [[VEC_PHI6]]
+; CHECK-INTERLEAVED-NEXT: [[TMP33]] = add [[TMP31]], [[VEC_PHI7]]
+; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = mul i64 [[TMP35]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i64 [[TMP36]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD11:%.*]] = load , ptr [[TMP17]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD12:%.*]] = load , ptr [[TMP37]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = sext [[WIDE_LOAD11]] to
+; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = sext [[WIDE_LOAD12]] to
+; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = mul i64 [[TMP41]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 [[TMP42]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD13:%.*]] = load , ptr [[TMP19]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD14:%.*]] = load , ptr [[TMP43]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = sext [[WIDE_LOAD13]] to
+; CHECK-INTERLEAVED-NEXT: [[TMP45:%.*]] = sext [[WIDE_LOAD14]] to
+; CHECK-INTERLEAVED-NEXT: [[TMP46:%.*]] = mul nsw [[TMP38]], [[TMP44]]
+; CHECK-INTERLEAVED-NEXT: [[TMP47:%.*]] = mul nsw [[TMP39]], [[TMP45]]
+; CHECK-INTERLEAVED-NEXT: [[TMP48]] = add [[TMP46]], [[VEC_PHI4]]
+; CHECK-INTERLEAVED-NEXT: [[TMP49]] = add [[TMP47]], [[VEC_PHI5]]
+; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP51:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP52:%.*]] = mul i64 [[TMP51]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP53:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP52]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD15:%.*]] = load , ptr [[TMP22]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD16:%.*]] = load , ptr [[TMP53]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP54:%.*]] = sext [[WIDE_LOAD15]] to
+; CHECK-INTERLEAVED-NEXT: [[TMP55:%.*]] = sext [[WIDE_LOAD16]] to
+; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP57:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP58:%.*]] = mul i64 [[TMP57]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP59:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i64 [[TMP58]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD17:%.*]] = load , ptr [[TMP24]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD18:%.*]] = load , ptr [[TMP59]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP60:%.*]] = sext [[WIDE_LOAD17]] to
+; CHECK-INTERLEAVED-NEXT: [[TMP61:%.*]] = sext [[WIDE_LOAD18]] to
+; CHECK-INTERLEAVED-NEXT: [[TMP62:%.*]] = mul nsw [[TMP54]], [[TMP60]]
+; CHECK-INTERLEAVED-NEXT: [[TMP63:%.*]] = mul nsw [[TMP55]], [[TMP61]]
+; CHECK-INTERLEAVED-NEXT: [[TMP64]] = add [[TMP62]], [[VEC_PHI2]]
+; CHECK-INTERLEAVED-NEXT: [[TMP65]] = add [[TMP63]], [[VEC_PHI3]]
+; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP67:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP68:%.*]] = mul i64 [[TMP67]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP69:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i64 [[TMP68]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD19:%.*]] = load , ptr [[TMP27]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD20:%.*]] = load , ptr [[TMP69]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP70:%.*]] = sext [[WIDE_LOAD19]] to
+; CHECK-INTERLEAVED-NEXT: [[TMP71:%.*]] = sext [[WIDE_LOAD20]] to
+; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP73:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP74:%.*]] = mul i64 [[TMP73]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP75:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i64 [[TMP74]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD21:%.*]] = load , ptr [[TMP29]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD22:%.*]] = load , ptr [[TMP75]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP76:%.*]] = sext [[WIDE_LOAD21]] to
+; CHECK-INTERLEAVED-NEXT: [[TMP77:%.*]] = sext [[WIDE_LOAD22]] to
+; CHECK-INTERLEAVED-NEXT: [[TMP78:%.*]] = mul nsw [[TMP70]], [[TMP76]]
+; CHECK-INTERLEAVED-NEXT: [[TMP79:%.*]] = mul nsw [[TMP71]], [[TMP77]]
+; CHECK-INTERLEAVED-NEXT: [[TMP80]] = add [[TMP78]], [[VEC_PHI]]
+; CHECK-INTERLEAVED-NEXT: [[TMP81]] = add [[TMP79]], [[VEC_PHI1]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP40]]
+; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+;
+; CHECK-MAXBW-LABEL: define i32 @dotp_unrolled(
+; CHECK-MAXBW-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-MAXBW-NEXT: entry:
+; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], [[TMP1]]
+; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-MAXBW: vector.ph:
+; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], [[TMP3]]
+; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]]
+; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-MAXBW: vector.body:
+; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI4:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE16:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI5:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE17:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI6:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI7:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE11:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = or disjoint i64 [[TMP6]], 1
+; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
+; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = or disjoint i64 [[TMP6]], 2
+; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]]
+; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]]
+; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = or disjoint i64 [[TMP6]], 3
+; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP15]]
+; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP15]]
+; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP18]], align 1
+; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = sext [[WIDE_LOAD]] to
+; CHECK-MAXBW-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD9:%.*]] = load , ptr [[TMP24]], align 1
+; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = sext [[WIDE_LOAD9]] to
+; CHECK-MAXBW-NEXT: [[TMP31:%.*]] = mul nsw [[TMP29]], [[TMP23]]
+; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE11]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI7]], [[TMP31]])
+; CHECK-MAXBW-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD12:%.*]] = load , ptr [[TMP32]], align 1
+; CHECK-MAXBW-NEXT: [[TMP37:%.*]] = sext [[WIDE_LOAD12]] to
+; CHECK-MAXBW-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD14:%.*]] = load , ptr [[TMP38]], align 1
+; CHECK-MAXBW-NEXT: [[TMP43:%.*]] = sext [[WIDE_LOAD14]] to
+; CHECK-MAXBW-NEXT: [[TMP45:%.*]] = mul nsw [[TMP37]], [[TMP43]]
+; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI6]], [[TMP45]])
+; CHECK-MAXBW-NEXT: [[TMP46:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD18:%.*]] = load , ptr [[TMP46]], align 1
+; CHECK-MAXBW-NEXT: [[TMP51:%.*]] = sext [[WIDE_LOAD18]] to
+; CHECK-MAXBW-NEXT: [[TMP52:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD20:%.*]] = load , ptr [[TMP52]], align 1
+; CHECK-MAXBW-NEXT: [[TMP57:%.*]] = sext [[WIDE_LOAD20]] to
+; CHECK-MAXBW-NEXT: [[TMP59:%.*]] = mul nsw [[TMP51]], [[TMP57]]
+; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE17]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI5]], [[TMP59]])
+; CHECK-MAXBW-NEXT: [[TMP60:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD24:%.*]] = load , ptr [[TMP60]], align 1
+; CHECK-MAXBW-NEXT: [[TMP65:%.*]] = sext [[WIDE_LOAD24]] to
+; CHECK-MAXBW-NEXT: [[TMP66:%.*]] = getelementptr inbounds i8, ptr [[TMP17]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD26:%.*]] = load , ptr [[TMP66]], align 1
+; CHECK-MAXBW-NEXT: [[TMP71:%.*]] = sext [[WIDE_LOAD26]] to
+; CHECK-MAXBW-NEXT: [[TMP73:%.*]] = mul nsw [[TMP65]], [[TMP71]]
+; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE16]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI4]], [[TMP73]])
+; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-MAXBW-NEXT: [[TMP74:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP74]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+;
+entry:
+  br label %for.body
+
+for.body: ; preds = %entry, %for.body
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %accum3 = phi i32 [ 0, %entry ], [ %add.a3, %for.body ]
+  %accum2 = phi i32 [ 0, %entry ], [ %add.a2, %for.body ]
+  %accum1 = phi i32 [ 0, %entry ], [ %add.a1, %for.body ]
+  %accum0 = phi i32 [ 0, %entry ], [ %add.a0, %for.body ]
+  %gep.a0 = getelementptr inbounds i8, ptr %a, i64 %iv
+  %gep.b0 = getelementptr inbounds i8, ptr %b, i64 %iv
+  %offset.1 = or disjoint i64 %iv, 1
+  %gep.a1 = getelementptr inbounds i8, ptr %a, i64 %offset.1
+  %gep.b1 = getelementptr inbounds i8, ptr %b, i64 %offset.1
+  %offset.2 = or disjoint i64 %iv, 2
+  %gep.a2 = getelementptr inbounds i8, ptr %a, i64 %offset.2
+  %gep.b2 = getelementptr inbounds i8, ptr %b, i64 %offset.2
+  %offset.3 = or disjoint i64 %iv, 3
+  %gep.a3 = getelementptr inbounds i8, ptr %a, i64 %offset.3
+  %gep.b3 = getelementptr inbounds i8, ptr %b, i64 %offset.3
+  %load.a0 = load i8, ptr %gep.a0, align 1
+  %ext.a0 = sext i8 %load.a0 to i32
+  %load.b0 = load i8, ptr %gep.b0, align 1
+  %ext.b0 = sext i8 %load.b0 to i32
+  %mul.a0 = mul nsw i32 %ext.b0, %ext.a0
+  %add.a0 = add nsw i32 %mul.a0, %accum0
+  %load.a1 = load i8, ptr %gep.a1, align 1
+  %ext.a1 = sext i8 %load.a1 to i32
+  %load.b1 = load i8, ptr %gep.b1, align 1
+  %ext.b1 = sext i8 %load.b1 to i32
+  %mul.a1 = mul nsw i32 %ext.a1, %ext.b1
+  %add.a1 = add nsw i32 %mul.a1, %accum1
+  %load.a2 = load i8, ptr %gep.a2, align 1
+  %ext.a2 = sext i8 %load.a2 to i32
+  %load.b2 = load i8, ptr %gep.b2, align 1
+  %ext.b2 = sext i8 %load.b2 to i32
+  %mul.a2 = mul nsw i32 %ext.a2, %ext.b2
+  %add.a2 = add nsw i32 %mul.a2, %accum2
+  %load.a3 = load i8, ptr %gep.a3, align 1
+  %ext.a3 = sext i8 %load.a3 to i32
+  %load.b3 = load i8, ptr %gep.b3, align 1
+  %ext.b3 = sext i8 %load.b3 to i32
+  %mul.a3 = mul nsw i32 %ext.a3, %ext.b3
+  %add.a3 = add nsw i32 %mul.a3, %accum3
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %num_in
+  br i1 %exitcond.not, label %exit, label %for.body
+
+exit: ; preds = %for.body
+  %result0 = add nsw i32 %add.a0, %add.a1
+  %result1 = add nsw i32 %add.a2, %add.a3
+  %result = add nsw i32 %result0, %result1
+  ret i32 %result
+}
+
+define i32 @not_dotp_predicated(i64 %N, ptr %a, ptr %b) #0 {
+; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_predicated(
+; CHECK-INTERLEAVE1-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVE1-NEXT: entry:
+; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], [[TMP7]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVE1: vector.ph:
+; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul i64 [[TMP10]], 4
+; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = mul i64 [[TMP11]], 4
+; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: vector.body:
+; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP2]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = sext [[WIDE_LOAD]] to
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP5]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = sext [[WIDE_LOAD1]] to
+; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = mul nsw [[TMP12]], [[TMP9]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP14]] = add [[TMP13]], [[VEC_PHI]]
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-INTERLEAVE1: middle.block:
+; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP14]])
+; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_PH]]
+;
+; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_predicated(
+; CHECK-INTERLEAVED-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVED-NEXT: entry:
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = mul i64 [[TMP8]], 8
+; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], [[TMP14]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP15]], 8
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED: vector.body:
+; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[TMP10]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP2]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP11]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = sext [[WIDE_LOAD]] to
+; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = sext [[WIDE_LOAD2]] to
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 [[TMP17]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP7]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP18]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = sext [[WIDE_LOAD3]] to
+; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = sext [[WIDE_LOAD4]] to
+; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = mul nsw [[TMP19]], [[TMP12]]
+; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = mul nsw [[TMP20]], [[TMP25]]
+; CHECK-INTERLEAVED-NEXT: [[TMP23]] = add [[TMP21]], [[VEC_PHI]]
+; CHECK-INTERLEAVED-NEXT: [[TMP24]] = add [[TMP22]], [[VEC_PHI1]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-INTERLEAVED: middle.block:
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add [[TMP24]], [[TMP23]]
+; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_PH]]
+;
+; CHECK-MAXBW-LABEL: define i32 @not_dotp_predicated(
+; CHECK-MAXBW-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-MAXBW-NEXT: entry:
+; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
+; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-MAXBW: vector.ph:
+; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-MAXBW: vector.body:
+; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP8]], align 1
+; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = sext [[WIDE_LOAD2]] to
+; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP15]], align 1
+; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = sext [[WIDE_LOAD4]] to
+; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = mul nsw [[TMP20]], [[TMP13]]
+; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE5]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI1]], [[TMP22]])
+; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+;
+entry:
+  br label %for.body
+
+for.body: ; preds = %entry, %for.body
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %gep.a = getelementptr inbounds i8, ptr %a, i64 %iv
+  %load.a = load i8, ptr %gep.a, align 1
+  %ext.a = sext i8 %load.a to i32
+  %gep.b = getelementptr inbounds i8, ptr %b, i64 %iv
+  %load.b = load i8, ptr %gep.b, align 1
+  %ext.b = sext i8 %load.b to i32
+  %mul = mul nsw i32 %ext.b, %ext.a
+  %add = add nsw i32 %mul, %accum
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %N
+  br i1 %exitcond.not, label %exit, label %for.body
+
+exit: ; preds = %for.body
+  ret i32 %add
+}
+
+define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) #0 {
+; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_predicated_pragma(
+; CHECK-INTERLEAVE1-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVE1-NEXT: entry:
+; CHECK-INTERLEAVE1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVE1: vector.ph:
+; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = sub i64 [[TMP10]], 1
+; CHECK-INTERLEAVE1-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP11]]
+; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP10]]
+; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = mul i64 [[TMP12]], 4
+; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = mul i64 [[TMP15]], 4
+; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = sub i64 [[N]], [[TMP6]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = icmp ugt i64 [[N]], [[TMP6]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i64 [[TMP0]], i64 0
+; CHECK-INTERLEAVE1-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
+; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: vector.body:
+; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr [[TMP5]], i32 1, [[ACTIVE_LANE_MASK]], poison)
+; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = sext [[WIDE_MASKED_LOAD]] to
+; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr [[TMP8]], i32 1, [[ACTIVE_LANE_MASK]], poison)
+; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = sext [[WIDE_MASKED_LOAD1]] to
+; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = mul nsw [[TMP16]], [[TMP13]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = add [[TMP17]], [[VEC_PHI]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP19]] = select [[ACTIVE_LANE_MASK]], [[TMP18]], [[VEC_PHI]]
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP14]]
+; CHECK-INTERLEAVE1-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP2]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = extractelement [[TMP20]], i32 0
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+;
+; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_predicated_pragma(
+; CHECK-INTERLEAVED-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVED-NEXT: entry:
+; CHECK-INTERLEAVED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = sub i64 [[TMP10]], 1
+; CHECK-INTERLEAVED-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP11]]
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP10]]
+; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = mul i64 [[TMP12]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = mul i64 [[TMP15]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = sub i64 [[N]], [[TMP6]]
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = icmp ugt i64 [[N]], [[TMP6]]
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i64 [[TMP0]], i64 0
+; CHECK-INTERLEAVED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
+; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED: vector.body:
+; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr [[TMP5]], i32 1, [[ACTIVE_LANE_MASK]], poison)
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = sext [[WIDE_MASKED_LOAD]] to
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr [[TMP8]], i32 1, [[ACTIVE_LANE_MASK]], poison)
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = sext [[WIDE_MASKED_LOAD1]] to
+; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = mul nsw [[TMP16]], [[TMP13]]
+; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = add [[TMP17]], [[VEC_PHI]]
+; CHECK-INTERLEAVED-NEXT: [[TMP19]] = select [[ACTIVE_LANE_MASK]], [[TMP18]], [[VEC_PHI]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP14]]
+; CHECK-INTERLEAVED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP2]])
+; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = extractelement [[TMP20]], i32 0
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+;
+; CHECK-MAXBW-LABEL: define i32 @not_dotp_predicated_pragma(
+; CHECK-MAXBW-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-MAXBW-NEXT: entry:
+; CHECK-MAXBW-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-MAXBW: vector.ph:
+; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; CHECK-MAXBW-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP2]]
+; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]]
+; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]]
+; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
+; CHECK-MAXBW-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
+; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-MAXBW: vector.body:
+; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0
+; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]]
+; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr [[TMP12]], i32 1, [[ACTIVE_LANE_MASK]], poison)
+; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = sext [[WIDE_MASKED_LOAD]] to
+; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP10]]
+; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr [[TMP15]], i32 1, [[ACTIVE_LANE_MASK]], poison)
+; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = sext [[WIDE_MASKED_LOAD1]] to
+; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw [[TMP16]], [[TMP13]]
+; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = add [[TMP17]], [[VEC_PHI]]
+; CHECK-MAXBW-NEXT: [[TMP19]] = select [[ACTIVE_LANE_MASK]], [[TMP18]], [[VEC_PHI]]
+; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
+; CHECK-MAXBW-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]])
+; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = extractelement [[TMP20]], i32 0
+; CHECK-MAXBW-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+;
+entry:
+  br label %for.body
+
+for.body: ; preds = %entry, %for.body
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %gep.a = getelementptr inbounds i8, ptr %b, i64 %iv
+  %load.a = load i8, ptr %gep.a, align 1
+  %ext.a = sext i8 %load.a to i32
+  %gep.a2 = getelementptr inbounds i8, ptr %a, i64 %iv
+  %load.b = load i8, ptr %gep.a2, align 1
+  %ext.b = sext i8 %load.b to i32
+  %mul = mul nsw i32 %ext.b, %ext.a
+  %add = add nsw i32 %mul, %accum
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %N
+  br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !7
+
+exit: ; preds = %for.body
+  ret i32 %add
+}
+
+define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 {
+; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_extend_user(
+; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVE1-NEXT: entry:
+; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVE1: vector.ph:
+; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = mul i64 [[TMP8]], 4
+; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP10]]
+; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = mul i64 [[TMP11]], 4
+; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: vector.body:
+; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[TMP3]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = zext [[WIDE_LOAD]] to
+; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP7]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = zext [[WIDE_LOAD1]] to
+; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = mul [[TMP12]], [[TMP9]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP14]] = add [[TMP13]], [[VEC_PHI]]
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+;
+; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_extend_user(
+; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVED-NEXT: entry:
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-INTERLEAVED-NEXT: br i1 true, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 8
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP7]]
+; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP14]], 8
+; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED: vector.body:
+; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[TMP3]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP15]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP10]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP11]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext [[WIDE_LOAD]] to
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = zext [[WIDE_LOAD2]] to
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP8]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP8]], i64 [[TMP17]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP9]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP18]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = zext [[WIDE_LOAD3]] to
+; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = zext [[WIDE_LOAD4]] to
+; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = mul [[TMP19]], [[TMP12]]
+; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = mul [[TMP20]], [[TMP13]]
+; CHECK-INTERLEAVED-NEXT: [[TMP23]] = add [[TMP21]], [[VEC_PHI]]
+; CHECK-INTERLEAVED-NEXT: [[TMP24]] = add [[TMP22]], [[VEC_PHI1]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+;
+; CHECK-MAXBW-LABEL: define i32 @not_dotp_extend_user(
+; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-MAXBW-NEXT: entry:
+; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-MAXBW: vector.ph:
+; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
+; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-MAXBW: vector.body:
+; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP8]], align 1
+; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = zext [[WIDE_LOAD2]] to
+; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[TMP14]], i32 0
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP15]], align 1
+; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = zext [[WIDE_LOAD4]] to
+; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = mul [[TMP20]], [[TMP13]]
+; CHECK-MAXBW-NEXT: [[TMP24]] = add [[TMP22]], [[VEC_PHI1]]
+; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+;
+entry:
+  br label %for.body
+
+for.body: ; preds = %for.body, %entry
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %gep.a = getelementptr i8, ptr %a, i64 %iv
+  %load.a = load i8, ptr %gep.a, align 1
+  %ext.a = zext i8 %load.a to i32
+  %gep.b = getelementptr i8, ptr %b, i64 %iv
+  %load.b = load i8, ptr %gep.b, align 1
+  %ext.b = zext i8 %load.b to i32
+  %mul = mul i32 %ext.b, %ext.a
+  %add = add i32 %mul, %accum
+  %iv.next = add i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 0
+  br i1 %exitcond.not, label %for.exit, label %for.body
+
+for.exit: ; preds = %for.body
+  %result = add i32 %add, %ext.b
+  ret i32 %result
+}
+
+define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 {
+; CHECK-INTERLEAVE1-LABEL: define i64 @dotp_cost_disagreement(
+; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVE1-NEXT: entry:
+; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 16, [[TMP1]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVE1: vector.ph:
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 16, [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 16, [[N_MOD_VF]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: vector.body:
+; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = zext [[WIDE_LOAD]] to
+; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = add nuw nsw i64 [[TMP6]], 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP10]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP11]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP12]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = zext [[WIDE_LOAD1]] to
+; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = mul nuw nsw [[TMP13]], [[TMP9]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP15]] = add [[VEC_PHI]], [[TMP14]]
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
+;
+; CHECK-INTERLEAVED-LABEL: define i64 @dotp_cost_disagreement(
+; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVED-NEXT: entry:
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 16, [[TMP1]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 16, [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 16, [[N_MOD_VF]]
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED: vector.body:
+; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i64 [[TMP10]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP11]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext [[WIDE_LOAD]] to
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = zext [[WIDE_LOAD2]] to
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = add nuw nsw i64 [[TMP6]], 1
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP14]]
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP15]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 2
+; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP15]], i64 [[TMP18]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP16]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP19]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = zext [[WIDE_LOAD3]] to
+; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = zext [[WIDE_LOAD4]] to
+; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = mul nuw nsw [[TMP20]], [[TMP12]]
+; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = mul nuw nsw [[TMP21]], [[TMP13]]
+; CHECK-INTERLEAVED-NEXT: [[TMP24]] = add [[VEC_PHI]], [[TMP22]]
+; CHECK-INTERLEAVED-NEXT: [[TMP25]] = add [[VEC_PHI1]], [[TMP23]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
+;
+; CHECK-MAXBW-LABEL: define i64 @dotp_cost_disagreement(
+; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-MAXBW-NEXT: entry:
+; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 16, [[TMP1]]
+; CHECK-MAXBW-NEXT:
br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW: vector.ph: +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 16, [[TMP3]] +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 16, [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-MAXBW: vector.body: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]] +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 1 +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = zext [[WIDE_LOAD]] to +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = add nuw nsw i64 [[TMP6]], 1 +; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP10]] +; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP11]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP12]], align 1 +; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = zext [[WIDE_LOAD1]] to +; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = mul nuw nsw [[TMP13]], [[TMP9]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call @llvm.experimental.vector.partial.reduce.add.nxv1i64.nxv8i64( [[VEC_PHI]], [[TMP14]]) +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] +; CHECK-MAXBW: middle.block: +; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = call i64 @llvm.vector.reduce.add.nxv1i64( [[PARTIAL_REDUCE]]) +; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 16, [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ] + %sum = phi i64 [ 0, %entry ], [ %add, %for.body ] + %arrayidx = getelementptr inbounds nuw i8, ptr %a, i64 %i.iv + %0 = load i8, ptr %arrayidx, align 1 + %conv = zext i8 %0 to i64 + %i.iv.next = add nuw nsw i64 %i.iv, 1 + %arrayidx2 = getelementptr inbounds nuw i8, ptr %b, i64 %i.iv.next + %1 = load i8, ptr %arrayidx2, align 1 + %conv3 = zext i8 %1 to i64 + %mul = mul nuw nsw i64 %conv3, %conv + %add = add i64 %sum, %mul + %exitcond.not = icmp eq i64 %i.iv.next, 16 + br i1 %exitcond.not, label %exit, label %for.body + +exit: ; preds = %for.body + ret i64 %add +} + +define void @not_dotp_not_phi2(ptr %matrix, i32 %n) #0 { +; CHECK-INTERLEAVE1-LABEL: define void @not_dotp_not_phi2( +; CHECK-INTERLEAVE1-SAME: ptr [[MATRIX:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVE1-NEXT: entry: +; CHECK-INTERLEAVE1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP]], label [[FOR_PREHEADER:%.*]], label [[EXIT:%.*]] +; CHECK-INTERLEAVE1: for.preheader: +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load i8, ptr null, align 1 +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A1:%.*]] = load i8, ptr inttoptr (i64 1 to ptr), align 1 +; 
CHECK-INTERLEAVE1-NEXT: [[A_EXT:%.*]] = sext i8 [[LOAD_A]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[A_EXT1:%.*]] = sext i8 [[LOAD_A1]] to i32 +; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVE1: for.body: +; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_PREHEADER]] ] +; CHECK-INTERLEAVE1-NEXT: [[PTR:%.*]] = phi ptr [ [[SCEVGEP:%.*]], [[FOR_BODY]] ], [ [[MATRIX]], [[FOR_PREHEADER]] ] +; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi i32 [ [[ADD_1:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_PREHEADER]] ] +; CHECK-INTERLEAVE1-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[PTR]], i64 1 +; CHECK-INTERLEAVE1-NEXT: [[GEP_B1:%.*]] = getelementptr i8, ptr [[PTR]], i64 2 +; CHECK-INTERLEAVE1-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[B_EXT:%.*]] = sext i8 [[LOAD_B]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]] +; CHECK-INTERLEAVE1-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_B1:%.*]] = load i8, ptr [[GEP_B1]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[B_EXT1:%.*]] = sext i8 [[LOAD_B1]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[MUL_1:%.*]] = mul nsw i32 [[A_EXT1]], [[B_EXT1]] +; CHECK-INTERLEAVE1-NEXT: [[ADD_1]] = add i32 [[MUL_1]], [[ADD]] +; CHECK-INTERLEAVE1-NEXT: [[SCEVGEP]] = getelementptr i8, ptr [[PTR]], i64 16 +; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT:%.*]], label [[FOR_BODY]] +; +; CHECK-INTERLEAVED-LABEL: define void @not_dotp_not_phi2( +; CHECK-INTERLEAVED-SAME: ptr [[MATRIX:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVED-NEXT: entry: +; CHECK-INTERLEAVED-NEXT: [[CMP:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-INTERLEAVED-NEXT: br i1 [[CMP]], label [[FOR_PREHEADER:%.*]], label [[EXIT:%.*]] +; CHECK-INTERLEAVED: for.preheader: +; CHECK-INTERLEAVED-NEXT: [[LOAD_A:%.*]] = load i8, ptr null, align 1 +; CHECK-INTERLEAVED-NEXT: [[LOAD_A1:%.*]] = load i8, ptr inttoptr (i64 1 to ptr), align 1 +; CHECK-INTERLEAVED-NEXT: [[A_EXT:%.*]] = sext i8 [[LOAD_A]] to i32 +; CHECK-INTERLEAVED-NEXT: [[A_EXT1:%.*]] = sext i8 [[LOAD_A1]] to i32 +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 +; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 2 +; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED: vector.ph: +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 2 +; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; CHECK-INTERLEAVED-NEXT: [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i32 +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul i64 [[N_VEC]], 16 +; CHECK-INTERLEAVED-NEXT: [[IND_END1:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP1]] +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED: vector.body: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 16 +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = add i64 
[[OFFSET_IDX]], 16 +; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP2]] +; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 1 +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[NEXT_GEP3]], i64 1 +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 2 +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[NEXT_GEP3]], i64 2 +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = load i8, ptr [[TMP4]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = load i8, ptr [[TMP5]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = sext i8 [[TMP8]] to i32 +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = sext i8 [[TMP9]] to i32 +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = mul nsw i32 [[A_EXT]], [[TMP10]] +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = mul nsw i32 [[A_EXT]], [[TMP11]] +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = add i32 [[TMP12]], [[VEC_PHI]] +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = add i32 [[TMP13]], [[VEC_PHI2]] +; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = load i8, ptr [[TMP6]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = load i8, ptr [[TMP7]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = sext i8 [[TMP16]] to i32 +; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = sext i8 [[TMP17]] to i32 +; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = mul nsw i32 [[A_EXT1]], [[TMP18]] +; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = mul nsw i32 [[A_EXT1]], [[TMP19]] +; CHECK-INTERLEAVED-NEXT: [[TMP22]] = add i32 [[TMP20]], [[TMP14]] +; CHECK-INTERLEAVED-NEXT: [[TMP23]] = add i32 [[TMP21]], [[TMP15]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] +; +; CHECK-MAXBW-LABEL: define void @not_dotp_not_phi2( +; CHECK-MAXBW-SAME: ptr [[MATRIX:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-MAXBW-NEXT: entry: +; CHECK-MAXBW-NEXT: [[CMP:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-MAXBW-NEXT: br i1 [[CMP]], label [[FOR_PREHEADER:%.*]], label [[EXIT:%.*]] +; CHECK-MAXBW: for.preheader: +; CHECK-MAXBW-NEXT: [[LOAD_A:%.*]] = load i8, ptr null, align 1 +; CHECK-MAXBW-NEXT: [[LOAD_A1:%.*]] = load i8, ptr inttoptr (i64 1 to ptr), align 1 +; CHECK-MAXBW-NEXT: [[A_EXT:%.*]] = sext i8 [[LOAD_A]] to i32 +; CHECK-MAXBW-NEXT: [[A_EXT1:%.*]] = sext i8 [[LOAD_A1]] to i32 +; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-MAXBW: for.body: +; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_PREHEADER]] ] +; CHECK-MAXBW-NEXT: [[PTR:%.*]] = phi ptr [ [[SCEVGEP:%.*]], [[FOR_BODY]] ], [ [[MATRIX]], [[FOR_PREHEADER]] ] +; CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi i32 [ [[ADD_1:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_PREHEADER]] ] +; CHECK-MAXBW-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[PTR]], i64 1 +; CHECK-MAXBW-NEXT: [[GEP_B1:%.*]] = getelementptr i8, ptr [[PTR]], i64 2 +; CHECK-MAXBW-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-MAXBW-NEXT: [[B_EXT:%.*]] = sext i8 [[LOAD_B]] to i32 +; CHECK-MAXBW-NEXT: [[MUL:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]] +; CHECK-MAXBW-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[ACCUM]] +; CHECK-MAXBW-NEXT: [[LOAD_B1:%.*]] = load i8, ptr [[GEP_B1]], align 1 +; CHECK-MAXBW-NEXT: [[B_EXT1:%.*]] = sext i8 [[LOAD_B1]] to i32 +; CHECK-MAXBW-NEXT: [[MUL_1:%.*]] = 
mul nsw i32 [[A_EXT1]], [[B_EXT1]] +; CHECK-MAXBW-NEXT: [[ADD_1]] = add i32 [[MUL_1]], [[ADD]] +; CHECK-MAXBW-NEXT: [[SCEVGEP]] = getelementptr i8, ptr [[PTR]], i64 16 +; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]] +; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT:%.*]], label [[FOR_BODY]] +; +entry: + %cmp = icmp sgt i32 %n, 0 + br i1 %cmp, label %for.preheader, label %exit + +for.preheader: ; preds = %entry + %load.a = load i8, ptr inttoptr (i64 0 to ptr), align 1 + %load.a1 = load i8, ptr inttoptr (i64 1 to ptr), align 1 + %a.ext = sext i8 %load.a to i32 + %a.ext1 = sext i8 %load.a1 to i32 + br label %for.body + +for.body: ; preds = %for.preheader, %for.body + %iv = phi i32 [ %iv.next, %for.body ], [ 0, %for.preheader ] + %ptr = phi ptr [ %scevgep, %for.body ], [ %matrix, %for.preheader ] + %accum = phi i32 [ %add.1, %for.body ], [ 0, %for.preheader ] + %gep.b = getelementptr i8, ptr %ptr, i64 1 + %gep.b1 = getelementptr i8, ptr %ptr, i64 2 + %load.b = load i8, ptr %gep.b, align 1 + %b.ext = sext i8 %load.b to i32 + %mul = mul nsw i32 %a.ext, %b.ext + %add = add i32 %mul, %accum + %load.b1 = load i8, ptr %gep.b1, align 1 + %b.ext1 = sext i8 %load.b1 to i32 + %mul.1 = mul nsw i32 %a.ext1, %b.ext1 + %add.1 = add i32 %mul.1, %add + %scevgep = getelementptr i8, ptr %ptr, i64 16 + %iv.next = add nuw nsw i32 %iv, 1 + %exitcond.not = icmp eq i32 %iv.next, %n + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + %add.1.lcssa = phi i32 [ %add.1, %for.body ] + %add.float = sitofp i32 %add.1.lcssa to float + br label %exit + +exit: ; preds = %for.exit, %entry + %result = phi float [ 0.000000e+00, %entry ], [ %add.float, %for.exit ] + store float %result, ptr %matrix, align 4 + ret void +} + +define i64 @not_dotp_ext_outside_plan(ptr %a, i16 %b, i64 %n) #0 { +; CHECK-INTERLEAVE1-LABEL: define i64 @not_dotp_ext_outside_plan( +; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], i16 [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-INTERLEAVE1-NEXT: entry: +; CHECK-INTERLEAVE1-NEXT: [[CMP:%.*]] = icmp eq i64 [[N]], 0 +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[FOR_PH:%.*]] +; CHECK-INTERLEAVE1: for.ph: +; CHECK-INTERLEAVE1-NEXT: [[EXT_B:%.*]] = zext i16 [[B]] to i64 +; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8 +; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1: vector.ph: +; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 +; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[EXT_B]], i64 0 +; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer +; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVE1: vector.body: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i16, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP1]], i32 0 +; 
CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP2]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64> +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = mul nuw nsw <8 x i64> [[TMP3]], [[BROADCAST_SPLAT]] +; CHECK-INTERLEAVE1-NEXT: [[TMP5]] = add <8 x i64> [[TMP4]], [[VEC_PHI]] +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-INTERLEAVE1: middle.block: +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]]) +; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; +; CHECK-INTERLEAVED-LABEL: define i64 @not_dotp_ext_outside_plan( +; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], i16 [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-INTERLEAVED-NEXT: entry: +; CHECK-INTERLEAVED-NEXT: [[CMP:%.*]] = icmp eq i64 [[N]], 0 +; CHECK-INTERLEAVED-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[FOR_PH:%.*]] +; CHECK-INTERLEAVED: for.ph: +; CHECK-INTERLEAVED-NEXT: [[EXT_B:%.*]] = zext i16 [[B]] to i64 +; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16 +; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED: vector.ph: +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16 +; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[EXT_B]], i64 0 +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED: vector.body: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i16, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP1]], i32 8 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP2]], align 2 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i16>, ptr [[TMP3]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <8 x i16> [[WIDE_LOAD2]] to <8 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = mul nuw nsw <8 x i64> [[TMP4]], [[BROADCAST_SPLAT]] +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul nuw nsw <8 x i64> [[TMP5]], [[BROADCAST_SPLAT]] +; CHECK-INTERLEAVED-NEXT: [[TMP8]] = add <8 x i64> [[TMP6]], [[VEC_PHI]] +; CHECK-INTERLEAVED-NEXT: [[TMP9]] = add <8 x i64> [[TMP7]], [[VEC_PHI1]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = 
icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-INTERLEAVED: middle.block: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <8 x i64> [[TMP9]], [[TMP8]] +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; +; CHECK-MAXBW-LABEL: define i64 @not_dotp_ext_outside_plan( +; CHECK-MAXBW-SAME: ptr [[A:%.*]], i16 [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-MAXBW-NEXT: entry: +; CHECK-MAXBW-NEXT: [[CMP:%.*]] = icmp eq i64 [[N]], 0 +; CHECK-MAXBW-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[FOR_PH:%.*]] +; CHECK-MAXBW: for.ph: +; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = zext i16 [[B]] to i64 +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW: vector.ph: +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[EXT_B]], i64 0 +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-MAXBW: vector.body: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i16, ptr [[A]], i64 [[TMP6]] +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP7]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 2 +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = zext [[WIDE_LOAD]] to +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = mul nuw nsw [[TMP9]], [[BROADCAST_SPLAT]] +; CHECK-MAXBW-NEXT: [[TMP11]] = add [[TMP10]], [[VEC_PHI]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-MAXBW: middle.block: +; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64( [[TMP11]]) +; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; +entry: + %cmp = icmp eq i64 %n, 0 + br i1 %cmp, label %exit, label %for.ph + +for.ph: ; preds = %entry + %ext.b = zext i16 %b to i64 + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %iv = phi i64 [ 0, %for.ph ], [ %iv.next, %for.body ] + %accum = phi i64 [ 0, %for.ph ], [ %add, %for.body ] + %gep.a = getelementptr inbounds nuw i16, ptr %a, i64 
%iv + %load.a = load i16, ptr %gep.a, align 2 + %ext.a = zext i16 %load.a to i64 + %mul = mul nuw nsw i64 %ext.a, %ext.b + %add = add i64 %mul, %accum + %iv.next = add nuw nsw i64 %iv, 1 + %cmp.1 = icmp eq i64 %iv.next, %n + br i1 %cmp.1, label %exit, label %for.body + +exit: ; preds = %for.cond.cleanup.loopexit, %entry + %result = phi i64 [ 0, %entry ], [ %add, %for.body ] + ret i64 %result +} + +define i64 @not_dotp_ext_outside_plan2(ptr %a, i16 %b, i64 %n) #0 { +; CHECK-INTERLEAVE1-LABEL: define i64 @not_dotp_ext_outside_plan2( +; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], i16 [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-INTERLEAVE1-NEXT: entry: +; CHECK-INTERLEAVE1-NEXT: [[CMP:%.*]] = icmp eq i64 [[N]], 0 +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[FOR_PH:%.*]] +; CHECK-INTERLEAVE1: for.ph: +; CHECK-INTERLEAVE1-NEXT: [[EXT_B:%.*]] = zext i16 [[B]] to i64 +; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8 +; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1: vector.ph: +; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 +; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[EXT_B]], i64 0 +; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer +; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVE1: vector.body: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i16, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP2]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64> +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = mul nuw nsw <8 x i64> [[BROADCAST_SPLAT]], [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[TMP5]] = add <8 x i64> [[TMP4]], [[VEC_PHI]] +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-INTERLEAVE1: middle.block: +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]]) +; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; +; CHECK-INTERLEAVED-LABEL: define i64 @not_dotp_ext_outside_plan2( +; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], i16 [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-INTERLEAVED-NEXT: entry: +; CHECK-INTERLEAVED-NEXT: [[CMP:%.*]] = icmp eq i64 [[N]], 0 +; CHECK-INTERLEAVED-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[FOR_PH:%.*]] +; CHECK-INTERLEAVED: for.ph: +; CHECK-INTERLEAVED-NEXT: [[EXT_B:%.*]] = zext i16 [[B]] to i64 +; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16 +; CHECK-INTERLEAVED-NEXT: br i1 
[[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED: vector.ph: +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16 +; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[EXT_B]], i64 0 +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED: vector.body: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i16, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP1]], i32 8 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP2]], align 2 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i16>, ptr [[TMP3]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <8 x i16> [[WIDE_LOAD2]] to <8 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = mul nuw nsw <8 x i64> [[BROADCAST_SPLAT]], [[TMP4]] +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul nuw nsw <8 x i64> [[BROADCAST_SPLAT]], [[TMP5]] +; CHECK-INTERLEAVED-NEXT: [[TMP8]] = add <8 x i64> [[TMP6]], [[VEC_PHI]] +; CHECK-INTERLEAVED-NEXT: [[TMP9]] = add <8 x i64> [[TMP7]], [[VEC_PHI1]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-INTERLEAVED: middle.block: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <8 x i64> [[TMP9]], [[TMP8]] +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; +; CHECK-MAXBW-LABEL: define i64 @not_dotp_ext_outside_plan2( +; CHECK-MAXBW-SAME: ptr [[A:%.*]], i16 [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-MAXBW-NEXT: entry: +; CHECK-MAXBW-NEXT: [[CMP:%.*]] = icmp eq i64 [[N]], 0 +; CHECK-MAXBW-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[FOR_PH:%.*]] +; CHECK-MAXBW: for.ph: +; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = zext i16 [[B]] to i64 +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW: vector.ph: +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; 
CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[EXT_B]], i64 0 +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-MAXBW: vector.body: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i16, ptr [[A]], i64 [[TMP6]] +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP7]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 2 +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = zext [[WIDE_LOAD]] to +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = mul nuw nsw [[BROADCAST_SPLAT]], [[TMP9]] +; CHECK-MAXBW-NEXT: [[TMP11]] = add [[TMP10]], [[VEC_PHI]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-MAXBW: middle.block: +; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64( [[TMP11]]) +; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; +entry: + %cmp = icmp eq i64 %n, 0 + br i1 %cmp, label %exit, label %for.ph + +for.ph: ; preds = %entry + %ext.b = zext i16 %b to i64 + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %iv = phi i64 [ 0, %for.ph ], [ %iv.next, %for.body ] + %accum = phi i64 [ 0, %for.ph ], [ %add, %for.body ] + %gep.a = getelementptr inbounds nuw i16, ptr %a, i64 %iv + %load.a = load i16, ptr %gep.a, align 2 + %ext.a = zext i16 %load.a to i64 + %mul = mul nuw nsw i64 %ext.b, %ext.a + %add = add i64 %mul, %accum + %iv.next = add nuw nsw i64 %iv, 1 + %cmp.1 = icmp eq i64 %iv.next, %n + br i1 %cmp.1, label %exit, label %for.body + +exit: ; preds = %for.cond.cleanup.loopexit, %entry + %result = phi i64 [ 0, %entry ], [ %add, %for.body ] + ret i64 %result +} + +!7 = distinct !{!7, !8, !9, !10} +!8 = !{!"llvm.loop.mustprogress"} +!9 = !{!"llvm.loop.vectorize.predicate.enable", i1 true} +!10 = !{!"llvm.loop.vectorize.enable", i1 true} +attributes #0 = { vscale_range(1,16) "target-features"="+sve" } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-no-dotprod.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-no-dotprod.ll new file mode 100644 index 0000000000000..f24b115ab9f99 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-no-dotprod.ll @@ -0,0 +1,61 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -mattr=+neon -S < %s | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-none-unknown-elf" + +define i32 @not_dotp(ptr %a, ptr %b) { +; CHECK-LABEL: define i32 @not_dotp( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: 
[[ENTRY:.*]]: +; CHECK-NEXT: br i1 true, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP13:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 +; CHECK-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP6]], i32 16 +; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 +; CHECK-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = mul <16 x i32> [[TMP9]], [[TMP4]] +; CHECK-NEXT: [[TMP12:%.*]] = mul <16 x i32> [[TMP10]], [[TMP5]] +; CHECK-NEXT: [[TMP13]] = add <16 x i32> [[TMP11]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP14]] = add <16 x i32> [[TMP12]], [[VEC_PHI1]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = zext i8 %load.a to i32 + %gep.b = getelementptr i8, ptr %b, i64 %iv + %load.b = load i8, ptr %gep.b, align 1 + %ext.b = zext i8 %load.b to i32 + %mul = mul i32 %ext.b, %ext.a + %add = add i32 %mul, %accum + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 0 + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + ret i32 %add +} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll new file mode 100644 index 0000000000000..5dd9f8ff97cca --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll @@ -0,0 +1,94 @@ +; REQUIRES: asserts +; RUN: opt -mattr=+neon,+dotprod -passes=loop-vectorize -debug-only=loop-vectorize -force-vector-interleave=1 -disable-output %s 2>&1 | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-none-unknown-elf" + +; Tests for printing VPlans that are enabled under AArch64 + +define i32 @print_partial_reduction(ptr %a, ptr %b) { +; CHECK: VPlan 'Initial VPlan for VF={8,16},UF>=1' { +; CHECK-NEXT: Live-in vp<[[VFxUF:%.]]> = VF * UF 
+; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count +; CHECK-NEXT: Live-in ir<0> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: Successor(s): vector.ph +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]> +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[ACC:%.+]]> = phi ir<0>, ir<[[REDUCE:%.+]]> (VF scaled by 1/4) +; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> +; CHECK-NEXT: CLONE ir<%gep.a> = getelementptr ir<%a>, vp<[[STEPS]]> +; CHECK-NEXT: vp<[[PTR_A:%.+]]> = vector-pointer ir<%gep.a> +; CHECK-NEXT: WIDEN ir<%load.a> = load vp<[[PTR_A]]> +; CHECK-NEXT: WIDEN-CAST ir<%ext.a> = zext ir<%load.a> to i32 +; CHECK-NEXT: CLONE ir<%gep.b> = getelementptr ir<%b>, vp<[[STEPS]]> +; CHECK-NEXT: vp<[[PTR_B:%.+]]> = vector-pointer ir<%gep.b> +; CHECK-NEXT: WIDEN ir<%load.b> = load vp<[[PTR_B]]> +; CHECK-NEXT: WIDEN-CAST ir<%ext.b> = zext ir<%load.b> to i32 +; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%ext.b>, ir<%ext.a> +; CHECK-NEXT: PARTIAL-REDUCE ir<[[REDUCE]]> = add ir<%mul>, ir<[[ACC]]> +; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): middle.block +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; CHECK-NEXT: EMIT vp<[[RED_RESULT:%.+]]> = compute-reduction-result ir<[[ACC]]>, ir<[[REDUCE]]> +; CHECK-NEXT: EMIT vp<[[EXTRACT:%.+]]> = extract-from-end vp<[[RED_RESULT]]>, ir<1> +; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<0>, vp<%1> +; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]> +; CHECK-NEXT: Successor(s): ir-bb, scalar.ph +; CHECK-EMPTY: +; CHECK-NEXT: scalar.ph: +; CHECK-NEXT: EMIT vp<%bc.resume.val> = resume-phi vp<[[VEC_TC]]>, ir<0> +; CHECK-NEXT: EMIT vp<%bc.merge.rdx> = resume-phi vp<[[RED_RESULT]]>, ir<0> +; CHECK-NEXT: Successor(s): ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] +; CHECK-NEXT: IR %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] (extra operand: vp<%bc.merge.rdx> from scalar.ph) +; CHECK-NEXT: IR %gep.a = getelementptr i8, ptr %a, i64 %iv +; CHECK-NEXT: IR %load.a = load i8, ptr %gep.a, align 1 +; CHECK-NEXT: IR %ext.a = zext i8 %load.a to i32 +; CHECK-NEXT: IR %gep.b = getelementptr i8, ptr %b, i64 %iv +; CHECK-NEXT: IR %load.b = load i8, ptr %gep.b, align 1 +; CHECK-NEXT: IR %ext.b = zext i8 %load.b to i32 +; CHECK-NEXT: IR %mul = mul i32 %ext.b, %ext.a +; CHECK-NEXT: IR %add = add i32 %mul, %accum +; CHECK-NEXT: IR %iv.next = add i64 %iv, 1 +; CHECK-NEXT: IR %exitcond.not = icmp eq i64 %iv.next, 0 +; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %add.lcssa = phi i32 [ %add, %for.body ] (extra operand: vp<[[EXTRACT]]> from middle.block) +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = zext i8 %load.a to i32 + %gep.b = getelementptr i8, ptr %b, i64 %iv + %load.b = load i8, ptr %gep.b, align 1 + %ext.b = zext i8 %load.b to i32 + %mul = mul i32 %ext.b, %ext.a + %add = add i32 %mul, 
%accum + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 0 + br i1 %exitcond.not, label %exit, label %for.body + +exit: + ret i32 %add +} From fe97054dcb12d3bdf0b4348f8af8f76f8aea68ff Mon Sep 17 00:00:00 2001 From: Momchil Velikov Date: Mon, 13 Jan 2025 11:24:05 +0000 Subject: [PATCH 033/102] Handle leading underscores in update_cc_test_checks.py (#121800) For some ABIs `update_cc_test_checks.py` is unable to generate tests because of the mismatch between the mangled function names reported by clang's `-ast-dump` and the function names in LLVM IR. This patch fixes it by stripping the leading underscore from the mangled name for global functions if the data layout string says they have one. --- .../Inputs/c-symbol-mangling.c | 1 - .../Inputs/c-symbol-mangling.c.expected | 16 +++++++++- llvm/utils/UpdateTestChecks/common.py | 16 ++++++++++ llvm/utils/update_cc_test_checks.py | 30 +++++++++++++++---- 4 files changed, 55 insertions(+), 8 deletions(-) diff --git a/clang/test/utils/update_cc_test_checks/Inputs/c-symbol-mangling.c b/clang/test/utils/update_cc_test_checks/Inputs/c-symbol-mangling.c index 018f992640065..58feddeb6bea0 100644 --- a/clang/test/utils/update_cc_test_checks/Inputs/c-symbol-mangling.c +++ b/clang/test/utils/update_cc_test_checks/Inputs/c-symbol-mangling.c @@ -18,7 +18,6 @@ // UTC_ARGS: --enable #ifdef __arm__ -/// FIXME: UTC does not find this function, but can find all others. typedef __attribute__((neon_vector_type(8))) __INT8_TYPE__ int8x8_t; int8x8_t test_vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c) { return a + b + c; } diff --git a/clang/test/utils/update_cc_test_checks/Inputs/c-symbol-mangling.c.expected b/clang/test/utils/update_cc_test_checks/Inputs/c-symbol-mangling.c.expected index 5d514f9d64c02..e17ce61db9c2b 100644 --- a/clang/test/utils/update_cc_test_checks/Inputs/c-symbol-mangling.c.expected +++ b/clang/test/utils/update_cc_test_checks/Inputs/c-symbol-mangling.c.expected @@ -18,8 +18,22 @@ // UTC_ARGS: --enable #ifdef __arm__ -/// FIXME: UTC does not find this function, but can find all others. 
typedef __attribute__((neon_vector_type(8))) __INT8_TYPE__ int8x8_t; +// THUMB-DARWIN-LABEL: @test_vaba_s8( +// THUMB-DARWIN-NEXT: entry: +// THUMB-DARWIN-NEXT: [[A_ADDR:%.*]] = alloca <8 x i8>, align 8 +// THUMB-DARWIN-NEXT: [[B_ADDR:%.*]] = alloca <8 x i8>, align 8 +// THUMB-DARWIN-NEXT: [[C_ADDR:%.*]] = alloca <8 x i8>, align 8 +// THUMB-DARWIN-NEXT: store <8 x i8> [[A:%.*]], ptr [[A_ADDR]], align 8 +// THUMB-DARWIN-NEXT: store <8 x i8> [[B:%.*]], ptr [[B_ADDR]], align 8 +// THUMB-DARWIN-NEXT: store <8 x i8> [[C:%.*]], ptr [[C_ADDR]], align 8 +// THUMB-DARWIN-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[A_ADDR]], align 8 +// THUMB-DARWIN-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[B_ADDR]], align 8 +// THUMB-DARWIN-NEXT: [[ADD:%.*]] = add <8 x i8> [[TMP0]], [[TMP1]] +// THUMB-DARWIN-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[C_ADDR]], align 8 +// THUMB-DARWIN-NEXT: [[ADD1:%.*]] = add <8 x i8> [[ADD]], [[TMP2]] +// THUMB-DARWIN-NEXT: ret <8 x i8> [[ADD1]] +// int8x8_t test_vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c) { return a + b + c; } diff --git a/llvm/utils/UpdateTestChecks/common.py b/llvm/utils/UpdateTestChecks/common.py index e1cc02e1a608c..1a875c2b523e4 100644 --- a/llvm/utils/UpdateTestChecks/common.py +++ b/llvm/utils/UpdateTestChecks/common.py @@ -557,6 +557,10 @@ def invoke_tool(exe, cmd_args, ir, preprocess_cmd=None, verbose=False): UTC_AVOID = "NOTE: Do not autogenerate" UNUSED_NOTE = "NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:" +DATA_LAYOUT_RE = re.compile( + r"target\s+datalayout\s+=\s+\"(?P<layout>.+)\"$", flags=(re.M | re.S) +) + OPT_FUNCTION_RE = re.compile( r"^(\s*;\s*Function\sAttrs:\s(?P<attrs>[\w\s():,]+?))?\s*define\s+(?P<funcdef_attrs_and_ret>[^@]*)@(?P<func>[\w.$-]+?)\s*" r"(?P<args_and_sig>\((\)|(.*?[\w.-]+?)\))[^{]*\{)\n(?P<body>.*?)^\}$", @@ -651,6 +655,18 @@ def get_triple_from_march(march): return "x86" + +def get_globals_name_prefix(raw_tool_output): + m = DATA_LAYOUT_RE.search(raw_tool_output) + if not m: + return None + data_layout = m.group("layout") + idx = data_layout.find("m:") + if idx < 0: + return None + ch = data_layout[idx + 2] + return "_" if ch == "o" or ch == "x" else None + + def apply_filters(line, filters): has_filter = False for f in filters: diff --git a/llvm/utils/update_cc_test_checks.py b/llvm/utils/update_cc_test_checks.py index 3ffb07ddf6ad8..7a4796eaabb3b 100755 --- a/llvm/utils/update_cc_test_checks.py +++ b/llvm/utils/update_cc_test_checks.py @@ -34,7 +34,7 @@ } -def get_line2func_list(args, clang_args): +def get_line2func_list(args, clang_args, globals_name_prefix): ret = collections.defaultdict(list) # Use clang's JSON AST dump to get the mangled name json_dump_args = [args.clang] + clang_args + ["-fsyntax-only", "-o", "-"] @@ -122,6 +122,14 @@ def parse_clang_ast_json(node, loc, search): if search is None: search = spell mangled = node.get("mangledName", spell) + # Clang's AST dump includes the globals prefix, but when Clang emits + # LLVM IR this is not included and instead added as part of the asm + # output. Strip it from the mangled name of globals when needed + # (see DataLayout::getGlobalPrefix()). 
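+ # For illustration (generic names, not taken from the test suite): with + # a Mach-O data layout ("m:o") the prefix is "_", so the AST dump + # reports "_foo" while the IR defines "@foo"; ELF layouts ("m:e") have + # no global prefix and the mangled name is used as-is.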
+ if globals_name_prefix: + storage = node.get("storageClass", None) + if storage != "static" and mangled[0] == globals_name_prefix: + mangled = mangled[1:] ret[int(line) - 1].append((spell, mangled, search)) ast = json.loads(stdout) @@ -249,10 +257,10 @@ def config(): return args, parser -def get_function_body(builder, args, filename, clang_args, extra_commands, prefixes): +def get_function_body( + builder, args, filename, clang_args, extra_commands, prefixes, raw_tool_output +): # TODO Clean up duplication of asm/common build_function_body_dictionary - # Invoke external tool and extract function bodies. - raw_tool_output = common.invoke_tool(args.clang, clang_args, filename) for extra_command in extra_commands: extra_args = shlex.split(extra_command) with tempfile.NamedTemporaryFile() as f: @@ -383,13 +391,23 @@ def main(): common.debug("Extracted clang cmd: clang {}".format(clang_args)) common.debug("Extracted FileCheck prefixes: {}".format(prefixes)) + # Invoke external tool and extract function bodies. + raw_tool_output = common.invoke_tool(ti.args.clang, clang_args, ti.path) get_function_body( - builder, ti.args, ti.path, clang_args, extra_commands, prefixes + builder, + ti.args, + ti.path, + clang_args, + extra_commands, + prefixes, + raw_tool_output, ) # Invoke clang -Xclang -ast-dump=json to get mapping from start lines to # mangled names. Forward all clang args for now. - for k, v in get_line2func_list(ti.args, clang_args).items(): + for k, v in get_line2func_list( + ti.args, clang_args, common.get_globals_name_prefix(raw_tool_output) + ).items(): line2func_list[k].extend(v) func_dict = builder.finish_and_get_func_dict() From 1bd66437b117eb2cd8d7fd45cf9c0bee823f203c Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 10 Jan 2025 17:52:06 +0000 Subject: [PATCH 034/102] [X86] subvectorwise-store-of-vector-splat.ll - regenerate VPTERNLOG comments --- .../subvectorwise-store-of-vector-splat.ll | 38 +++++++++---------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll index f1fd05565c47e..df8a85fd07258 100644 --- a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll +++ b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll @@ -389,7 +389,7 @@ define void @vec128_v2i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; AVX512-LABEL: vec128_v2i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa %xmm0, (%rdx) @@ -452,7 +452,7 @@ define void @vec128_v2f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; AVX512-LABEL: vec128_v2f32: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa %xmm0, (%rdx) @@ -599,7 +599,7 @@ define void @vec128_v4i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. 
; AVX512-LABEL: vec128_v4i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa %xmm0, (%rdx) @@ -694,7 +694,7 @@ define void @vec128_v8i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p ; AVX512-LABEL: vec128_v8i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa %xmm0, (%rdx) @@ -1003,7 +1003,7 @@ define void @vec256_v2i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; AVX512-LABEL: vec256_v2i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 ; AVX512-NEXT: vmovdqa %ymm0, (%rdx) @@ -1079,7 +1079,7 @@ define void @vec256_v2f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; AVX512-LABEL: vec256_v2f32: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 ; AVX512-NEXT: vmovdqa %ymm0, (%rdx) @@ -1355,7 +1355,7 @@ define void @vec256_v4i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; AVX512-LABEL: vec256_v4i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 ; AVX512-NEXT: vmovdqa %ymm0, (%rdx) @@ -1550,7 +1550,7 @@ define void @vec256_v8i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p ; AVX512-LABEL: vec256_v8i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 ; AVX512-NEXT: vmovdqa %ymm0, (%rdx) @@ -2170,7 +2170,7 @@ define void @vec384_v2i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; AVX512-LABEL: vec384_v2i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 ; AVX512-NEXT: vmovdqa %ymm0, (%rdx) @@ -2258,7 +2258,7 @@ define void @vec384_v2f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. 
; AVX512-LABEL: vec384_v2f32: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 ; AVX512-NEXT: vmovdqa %ymm0, (%rdx) @@ -2722,7 +2722,7 @@ define void @vec384_v3i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p ; AVX512-LABEL: vec384_v3i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vpextrb $2, %xmm0, 2(%rsi) ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: movw %ax, (%rsi) @@ -3006,7 +3006,7 @@ define void @vec384_v3i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; AVX512-LABEL: vec384_v3i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vpextrw $2, %xmm0, 4(%rsi) ; AVX512-NEXT: vmovd %xmm0, (%rsi) ; AVX512-NEXT: vpextrw $2, %xmm0, 4(%rdx) @@ -3664,7 +3664,7 @@ define void @vec384_v4i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; AVX512-LABEL: vec384_v4i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 ; AVX512-NEXT: vmovdqa %ymm0, (%rdx) @@ -3983,7 +3983,7 @@ define void @vec384_v6i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p ; AVX512-LABEL: vec384_v6i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vpextrw $2, %xmm0, 4(%rsi) ; AVX512-NEXT: vmovd %xmm0, (%rsi) ; AVX512-NEXT: vpextrw $2, %xmm0, 4(%rdx) @@ -4420,7 +4420,7 @@ define void @vec384_v8i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p ; AVX512-LABEL: vec384_v8i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 ; AVX512-NEXT: vmovdqa %ymm0, (%rdx) @@ -5444,7 +5444,7 @@ define void @vec512_v2i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; AVX512-LABEL: vec512_v2i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -5540,7 +5540,7 @@ define void @vec512_v2f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; AVX512-LABEL: vec512_v2f32: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -5965,7 +5965,7 @@ define void @vec512_v4i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. 
; AVX512-LABEL: vec512_v4i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -6363,7 +6363,7 @@ define void @vec512_v8i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p ; AVX512-LABEL: vec512_v8i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) From 3a22d74f3d5d8132dd3c92d2445707e466481e89 Mon Sep 17 00:00:00 2001 From: Eisuke Kawashima Date: Mon, 13 Jan 2025 21:00:35 +0900 Subject: [PATCH 035/102] [Polly] Use "is" instead of "==" to check for None (#94021) From PEP8 (https://peps.python.org/pep-0008/#programming-recommendations): > Comparisons to singletons like None should always be done with is or is not, never the equality operators. --- polly/lib/External/isl/interface/python.cc | 2 +- polly/lib/External/isl/libisl-gdb.py | 4 ++-- polly/lib/External/isl/python/isl.py.top | 4 ++-- polly/test/lit.site.cfg.in | 2 +- polly/utils/pyscop/isl.py | 8 ++++---- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/polly/lib/External/isl/interface/python.cc b/polly/lib/External/isl/interface/python.cc index e4a8288631297..b60bf315ca703 100644 --- a/polly/lib/External/isl/interface/python.cc +++ b/polly/lib/External/isl/interface/python.cc @@ -347,7 +347,7 @@ static void print_persistent_callback_failure_check(int indent, printf(fmt, 0); printf(", '%s') and ", callback_name.c_str()); printf(fmt, 0); - printf(".%s['exc_info'] != None:\n", callback_name.c_str()); + printf(".%s['exc_info'] is not None:\n", callback_name.c_str()); print_indent(indent, " exc_info = "); printf(fmt, 0); printf(".%s['exc_info'][0]\n", callback_name.c_str()); diff --git a/polly/lib/External/isl/libisl-gdb.py b/polly/lib/External/isl/libisl-gdb.py index bf01bc583d15d..bdd3949cf89c0 100644 --- a/polly/lib/External/isl/libisl-gdb.py +++ b/polly/lib/External/isl/libisl-gdb.py @@ -70,7 +70,7 @@ def invoke(self, arg, from_tty): arg = gdb.parse_and_eval(arg) printer = str_lookup_function(arg) - if printer == None: + if printer is None: print("No isl printer for this type") return @@ -90,7 +90,7 @@ def str_lookup_function(val): lookup_tag = val.type.target() regex = re.compile("^isl_(.*)$") - if lookup_tag == None: + if lookup_tag is None: return None m = regex.match(str(lookup_tag)) diff --git a/polly/lib/External/isl/python/isl.py.top b/polly/lib/External/isl/python/isl.py.top index d041315d4e11d..9dc47a1a83251 100644 --- a/polly/lib/External/isl/python/isl.py.top +++ b/polly/lib/External/isl/python/isl.py.top @@ -3,7 +3,7 @@ from ctypes import * from ctypes.util import find_library isl_dyld_library_path = os.environ.get('ISL_DYLD_LIBRARY_PATH') -if isl_dyld_library_path != None: +if isl_dyld_library_path is not None: os.environ['DYLD_LIBRARY_PATH'] = isl_dyld_library_path try: isl = cdll.LoadLibrary(isl_dlname) @@ -29,7 +29,7 @@ class Context: @staticmethod def getDefaultInstance(): - if Context.defaultInstance == None: + if Context.defaultInstance is None: Context.defaultInstance = Context() return Context.defaultInstance diff --git a/polly/test/lit.site.cfg.in b/polly/test/lit.site.cfg.in index d8a0b6ae3a3b2..f22063e796def 100644 --- 
a/polly/test/lit.site.cfg.in +++ b/polly/test/lit.site.cfg.in @@ -14,7 +14,7 @@ config.extra_paths = "@POLLY_TEST_EXTRA_PATHS@".split(";") ## Check the current platform with regex import re EAT_ERR_ON_X86 = ' ' -if (re.match(r'^x86_64*', '@LLVM_TARGET_TRIPLE@') == None) : +if (re.match(r'^x86_64*', '@LLVM_TARGET_TRIPLE@') is None) : EAT_ERR_ON_X86 = '|| echo \"error is eaten\"' for arch in config.targets_to_build.split(): diff --git a/polly/utils/pyscop/isl.py b/polly/utils/pyscop/isl.py index 5eaf7798e20b9..c06b7bca28042 100644 --- a/polly/utils/pyscop/isl.py +++ b/polly/utils/pyscop/isl.py @@ -24,7 +24,7 @@ def from_ptr(ptr): @staticmethod def getDefaultInstance(): - if Context.defaultInstance == None: + if Context.defaultInstance is None: Context.defaultInstance = Context() return Context.defaultInstance @@ -33,12 +33,12 @@ def getDefaultInstance(): class IslObject: def __init__(self, string="", ctx=None, ptr=None): self.initialize_isl_methods() - if ptr != None: + if ptr is not None: self.ptr = ptr self.ctx = self.get_isl_method("get_ctx")(self) return - if ctx == None: + if ctx is None: ctx = Context.getDefaultInstance() self.ctx = ctx @@ -236,7 +236,7 @@ class Printer: FORMAT_EXT_POLYLIB = 6 def __init__(self, ctx=None): - if ctx == None: + if ctx is None: ctx = Context.getDefaultInstance() self.ctx = ctx From 0dc75c6e947efdb654212e6039a11a60b10f2a0b Mon Sep 17 00:00:00 2001 From: Eisuke Kawashima Date: Mon, 13 Jan 2025 21:03:04 +0900 Subject: [PATCH 036/102] [cross-project-tests] Use "is" instead of "==" to check for None (#94016) From PEP8 (https://peps.python.org/pep-0008/#programming-recommendations): > Comparisons to singletons like None should always be done with is or is not, never the equality operators. --- .../debuginfo-tests/dexter/dex/command/ParseCommand.py | 2 +- .../dex/debugger/DebuggerControllers/ConditionalController.py | 4 ++-- .../dex/debugger/DebuggerControllers/ControllerHelpers.py | 2 +- .../debuginfo-tests/dexter/dex/debugger/Debuggers.py | 2 +- .../dexter/dex/debugger/visualstudio/VisualStudio.py | 2 +- .../debuginfo-tests/dexter/dex/tools/test/Tool.py | 2 +- cross-project-tests/lit.cfg.py | 2 +- 7 files changed, 8 insertions(+), 8 deletions(-) diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/command/ParseCommand.py b/cross-project-tests/debuginfo-tests/dexter/dex/command/ParseCommand.py index 29d7867e80867..4b086e14d4050 100644 --- a/cross-project-tests/debuginfo-tests/dexter/dex/command/ParseCommand.py +++ b/cross-project-tests/debuginfo-tests/dexter/dex/command/ParseCommand.py @@ -98,7 +98,7 @@ def _build_command( def label_to_line(label_name: str) -> int: line = labels.get(label_name, None) - if line != None: + if line is not None: return line raise format_unresolved_label_err(label_name, raw_text, path.base, lineno) diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/DebuggerControllers/ConditionalController.py b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/DebuggerControllers/ConditionalController.py index a7d6b570b55e8..ac3054c3a0edf 100644 --- a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/DebuggerControllers/ConditionalController.py +++ b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/DebuggerControllers/ConditionalController.py @@ -62,7 +62,7 @@ def __init__( self.finish_on_remove = finish_on_remove def has_conditions(self): - return self.expression != None + return self.expression is not None def get_conditional_expression_list(self): conditional_list = [] @@ -76,7 +76,7 @@ def 
add_hit(self): self.current_hit_count += 1 def should_be_removed(self): - if self.max_hit_count == None: + if self.max_hit_count is None: return False return self.current_hit_count >= self.max_hit_count diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/DebuggerControllers/ControllerHelpers.py b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/DebuggerControllers/ControllerHelpers.py index 3e5a7b919d703..a4ca5ae0158e9 100644 --- a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/DebuggerControllers/ControllerHelpers.py +++ b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/DebuggerControllers/ControllerHelpers.py @@ -39,7 +39,7 @@ def update_step_watches(step_info, watches, commands): for watch in towatch: loc = step_info.current_location if ( - loc.path != None + loc.path is not None and os.path.exists(loc.path) and os.path.samefile(watch.path, loc.path) and have_hit_line(watch, loc) diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/Debuggers.py b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/Debuggers.py index 1b0d4d5871cbe..67b715af78698 100644 --- a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/Debuggers.py +++ b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/Debuggers.py @@ -183,7 +183,7 @@ def handle_debugger_tool_options(context, defaults): # noqa if options.debugger == "lldb": _warn_meaningless_option(context, "--show-debugger") - if options.source_root_dir != None: + if options.source_root_dir is not None: if not os.path.isabs(options.source_root_dir): raise ToolArgumentError( f'--source-root-dir: expected absolute path, got "{options.source_root_dir}"' diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/visualstudio/VisualStudio.py b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/visualstudio/VisualStudio.py index a6752274efac2..a7f12cde1f047 100644 --- a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/visualstudio/VisualStudio.py +++ b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/visualstudio/VisualStudio.py @@ -256,7 +256,7 @@ def delete_breakpoints(self, ids): for bp in self._debugger.Breakpoints: # We're looking at the user-set breakpoints so there should be no # Parent. - assert bp.Parent == None + assert bp.Parent is None this_vsbp = VSBreakpoint( PurePath(bp.File), bp.FileLine, bp.FileColumn, bp.Condition ) diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/tools/test/Tool.py b/cross-project-tests/debuginfo-tests/dexter/dex/tools/test/Tool.py index f07641041254b..c366062cec7a9 100644 --- a/cross-project-tests/debuginfo-tests/dexter/dex/tools/test/Tool.py +++ b/cross-project-tests/debuginfo-tests/dexter/dex/tools/test/Tool.py @@ -150,7 +150,7 @@ def _get_results_path(self, test_name): """Returns the path to the test results directory for the test denoted by test_name. """ - assert self.context.options.results_directory != None + assert self.context.options.results_directory is not None return os.path.join( self.context.options.results_directory, self._get_results_basename(test_name), diff --git a/cross-project-tests/lit.cfg.py b/cross-project-tests/lit.cfg.py index 9935fe6a199da..c2a8bcef26cbf 100644 --- a/cross-project-tests/lit.cfg.py +++ b/cross-project-tests/lit.cfg.py @@ -51,7 +51,7 @@ def get_required_attr(config, attr_name): attr_value = getattr(config, attr_name, None) - if attr_value == None: + if attr_value is None: lit_config.fatal( "No attribute %r in test configuration! 
You may need to run " "tests from your build directory or add this attribute " From 9baef02cb4c77bf861dd6a6e9f8a2f0f6cab44f2 Mon Sep 17 00:00:00 2001 From: Eisuke Kawashima Date: Mon, 13 Jan 2025 21:05:10 +0900 Subject: [PATCH 037/102] [Polly] Fix invalid escape sequences (#94037) These generate a SyntaxWarning since Python 3.12. --- polly/test/update_check.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/polly/test/update_check.py b/polly/test/update_check.py index 88d95c247c063..a973c72ff4e78 100644 --- a/polly/test/update_check.py +++ b/polly/test/update_check.py @@ -222,7 +222,12 @@ def classyfier2(lines): line = i.__next__() -replrepl = {"{{": "{{[{][{]}}", "}}": "{{[}][}]}}", "[[": "{{\[\[}}", "]]": "{{\]\]}}"} +replrepl = { + "{{": "{{[{][{]}}", + "}}": "{{[}][}]}}", + "[[": r"{{\[\[}}", + "]]": r"{{\]\]}}", +} replre = re.compile("|".join(re.escape(k) for k in replrepl.keys())) @@ -452,7 +457,7 @@ def main(): checkre = re.compile( r"^\s*\;\s*(" + "|".join([re.escape(s) for s in checkprefixes]) - + ")(\-NEXT|\-DAG|\-NOT|\-LABEL|\-SAME)?\s*\:" + + r")(\-NEXT|\-DAG|\-NOT|\-LABEL|\-SAME)?\s*\:" ) firstcheckline = None firstnoncommentline = None From 75e2396f28e430f35d7da16ead9fe523e30e8c19 Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Mon, 13 Jan 2025 09:20:50 +0000 Subject: [PATCH 038/102] [AArch64] Fix chain for calls from agnostic-ZA functions. The lowering code was using the wrong chain value, which meant that the 'smstart' after the call from streaming agnostic-ZA functions -> non-streaming private-ZA functions was incorrectly removed from the DAG. --- .../Target/AArch64/AArch64ISelLowering.cpp | 2 +- llvm/test/CodeGen/AArch64/sme-agnostic-za.ll | 118 ++++++++++++++++++ 2 files changed, 119 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index d9877fef1437c..278dd95cd969d 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -9664,7 +9664,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, DAG.getConstant(0, DL, MVT::i64)); TPIDR2.Uses++; } else if (RequiresSaveAllZA) { - Result = emitSMEStateSaveRestore(*this, DAG, FuncInfo, DL, Chain, + Result = emitSMEStateSaveRestore(*this, DAG, FuncInfo, DL, Result, /*IsSave=*/false); } diff --git a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll index 97522b9a319c0..1f68815411097 100644 --- a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll +++ b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll @@ -82,3 +82,121 @@ define i64 @shared_caller_agnostic_callee(i64 %v) nounwind "aarch64_inout_za" "a %res = call i64 @agnostic_decl(i64 %v) ret i64 %res } + +; agnostic-ZA + streaming -> private-ZA + non-streaming +define i64 @streaming_agnostic_caller_nonstreaming_private_za_callee(i64 %v) nounwind "aarch64_za_state_agnostic" "aarch64_pstate_sm_enabled" { +; CHECK-LABEL: streaming_agnostic_caller_nonstreaming_private_za_callee: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-112]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: mov x9, x0 +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: bl __arm_get_current_vg +; CHECK-NEXT: str x0, [sp, #80] // 8-byte Folded Spill +; CHECK-NEXT: mov x0, x9 +; CHECK-NEXT: add x29, sp, #64 +; CHECK-NEXT: stp x20, x19, [sp, #96] // 16-byte Folded Spill +; CHECK-NEXT: mov x8, x0 +; CHECK-NEXT: bl __arm_sme_state_size +; CHECK-NEXT: sub sp, sp, x0 +; CHECK-NEXT: mov x20, sp +; CHECK-NEXT: mov x0, x20 +; CHECK-NEXT: bl __arm_sme_save +; CHECK-NEXT: smstop sm +; CHECK-NEXT: mov x0, x8 +; CHECK-NEXT: bl private_za_decl +; CHECK-NEXT: mov x1, x0 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: mov x0, x20 +; CHECK-NEXT: bl __arm_sme_restore +; CHECK-NEXT: mov x0, x20 +; CHECK-NEXT: bl __arm_sme_save +; CHECK-NEXT: smstop sm +; CHECK-NEXT: mov x0, x1 +; CHECK-NEXT: bl private_za_decl +; CHECK-NEXT: mov x1, x0 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: mov x0, x20 +; CHECK-NEXT: bl __arm_sme_restore +; CHECK-NEXT: mov x0, x1 +; CHECK-NEXT: sub sp, x29, #64 +; CHECK-NEXT: ldp x20, x19, [sp, #96] // 16-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #112 // 16-byte Folded Reload +; CHECK-NEXT: ret + %res = call i64 @private_za_decl(i64 %v) + %res2 = call i64 @private_za_decl(i64 %res) + ret i64 %res2 +} + +; agnostic-ZA + streaming-compatible -> private-ZA + non-streaming +define i64 @streaming_compatible_agnostic_caller_nonstreaming_private_za_callee(i64 %v) nounwind "aarch64_za_state_agnostic" "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: streaming_compatible_agnostic_caller_nonstreaming_private_za_callee: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-112]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: mov x9, x0 +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: bl __arm_get_current_vg +; CHECK-NEXT: str x0, [sp, #80] // 8-byte Folded Spill +; CHECK-NEXT: mov x0, x9 +; CHECK-NEXT: add x29, sp, #64 +; CHECK-NEXT: stp x20, x19, [sp, #96] // 16-byte Folded Spill +; CHECK-NEXT: mov x8, x0 +; CHECK-NEXT: bl __arm_sme_state_size +; CHECK-NEXT: sub sp, sp, x0 +; CHECK-NEXT: mov x19, sp +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: bl __arm_sme_save +; CHECK-NEXT: bl __arm_sme_state +; CHECK-NEXT: and x20, x0, #0x1 +; CHECK-NEXT: tbz w20, #0, .LBB5_2 +; CHECK-NEXT: // %bb.1: +; CHECK-NEXT: smstop sm +; CHECK-NEXT: .LBB5_2: +; CHECK-NEXT: mov x0, x8 +; CHECK-NEXT: bl private_za_decl +; CHECK-NEXT: mov x2, x0 +; CHECK-NEXT: tbz w20, #0, .LBB5_4 +; CHECK-NEXT: // %bb.3: +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .LBB5_4: +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: bl __arm_sme_restore +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: bl __arm_sme_save +; CHECK-NEXT: bl __arm_sme_state +; CHECK-NEXT: and x20, x0, #0x1 +; CHECK-NEXT: tbz w20, #0, .LBB5_6 +; CHECK-NEXT: // %bb.5: +; CHECK-NEXT: smstop sm +; CHECK-NEXT: .LBB5_6: +; CHECK-NEXT: mov x0, x2 +; CHECK-NEXT: bl private_za_decl +; CHECK-NEXT: mov x1, x0 +; CHECK-NEXT: tbz w20, #0, .LBB5_8 +; CHECK-NEXT: // %bb.7: +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .LBB5_8: +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: bl __arm_sme_restore +; CHECK-NEXT: mov x0, x1 +; CHECK-NEXT: sub sp, x29, #64 +; CHECK-NEXT: ldp x20, x19, [sp, #96] // 16-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #112 // 16-byte Folded Reload +; CHECK-NEXT: ret + %res = call i64 @private_za_decl(i64 %v) + %res2 = call i64 @private_za_decl(i64 %res) + ret i64 %res2 +} From cea8dc37078baff8276c8ea722e6dbfa16c250dc Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Mon, 13 Jan 2025 13:11:39 +0100 Subject: [PATCH 039/102] [Polly] Revert changes to isl Python code This partially reverts b605dab7a8352158ee0d399b8c3433f9a8b495a3, dropping the changes to isl. This is an external library, so we shouldn't modify it unless strictly necessary. 
--- polly/lib/External/isl/interface/python.cc | 2 +- polly/lib/External/isl/libisl-gdb.py | 4 ++-- polly/lib/External/isl/python/isl.py.top | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/polly/lib/External/isl/interface/python.cc b/polly/lib/External/isl/interface/python.cc index b60bf315ca703..e4a8288631297 100644 --- a/polly/lib/External/isl/interface/python.cc +++ b/polly/lib/External/isl/interface/python.cc @@ -347,7 +347,7 @@ static void print_persistent_callback_failure_check(int indent, printf(fmt, 0); printf(", '%s') and ", callback_name.c_str()); printf(fmt, 0); - printf(".%s['exc_info'] is not None:\n", callback_name.c_str()); + printf(".%s['exc_info'] != None:\n", callback_name.c_str()); print_indent(indent, " exc_info = "); printf(fmt, 0); printf(".%s['exc_info'][0]\n", callback_name.c_str()); diff --git a/polly/lib/External/isl/libisl-gdb.py b/polly/lib/External/isl/libisl-gdb.py index bdd3949cf89c0..bf01bc583d15d 100644 --- a/polly/lib/External/isl/libisl-gdb.py +++ b/polly/lib/External/isl/libisl-gdb.py @@ -70,7 +70,7 @@ def invoke(self, arg, from_tty): arg = gdb.parse_and_eval(arg) printer = str_lookup_function(arg) - if printer is None: + if printer == None: print("No isl printer for this type") return @@ -90,7 +90,7 @@ def str_lookup_function(val): lookup_tag = val.type.target() regex = re.compile("^isl_(.*)$") - if lookup_tag is None: + if lookup_tag == None: return None m = regex.match(str(lookup_tag)) diff --git a/polly/lib/External/isl/python/isl.py.top b/polly/lib/External/isl/python/isl.py.top index 9dc47a1a83251..d041315d4e11d 100644 --- a/polly/lib/External/isl/python/isl.py.top +++ b/polly/lib/External/isl/python/isl.py.top @@ -3,7 +3,7 @@ from ctypes import * from ctypes.util import find_library isl_dyld_library_path = os.environ.get('ISL_DYLD_LIBRARY_PATH') -if isl_dyld_library_path is not None: +if isl_dyld_library_path != None: os.environ['DYLD_LIBRARY_PATH'] = isl_dyld_library_path try: isl = cdll.LoadLibrary(isl_dlname) @@ -29,7 +29,7 @@ class Context: @staticmethod def getDefaultInstance(): - if Context.defaultInstance is None: + if Context.defaultInstance == None: Context.defaultInstance = Context() return Context.defaultInstance From 12a759133c539f6cf0752982f62ad136dc3bcf84 Mon Sep 17 00:00:00 2001 From: Eisuke Kawashima Date: Mon, 13 Jan 2025 21:15:22 +0900 Subject: [PATCH 040/102] [cross-project-tests] Fix invalid escape sequences (#94031) These generate a SyntaxWarning with Python 3.12. --- .../debuginfo-tests/dexter/dex/command/ParseCommand.py | 4 ++-- cross-project-tests/lit.cfg.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/command/ParseCommand.py b/cross-project-tests/debuginfo-tests/dexter/dex/command/ParseCommand.py index 4b086e14d4050..4496fdf3cb0e8 100644 --- a/cross-project-tests/debuginfo-tests/dexter/dex/command/ParseCommand.py +++ b/cross-project-tests/debuginfo-tests/dexter/dex/command/ParseCommand.py @@ -128,7 +128,7 @@ def get_address_object(address_name: str, offset: int = 0): def _search_line_for_cmd_start(line: str, start: int, valid_commands: dict) -> int: - """Scan `line` for a string matching any key in `valid_commands`. + r"""Scan `line` for a string matching any key in `valid_commands`. Start searching from `start`. Commands escaped with `\` (E.g. `\DexLabel('a')`) are ignored. 
@@ -543,7 +543,7 @@ def test_parse_share_line(self): def test_parse_escaped(self): """Escaped commands are ignored.""" - lines = ['words \MockCmd("IGNORED") words words words\n'] + lines = ['words \\MockCmd("IGNORED") words words words\n'] values = self._find_all_mock_values_in_lines(lines) diff --git a/cross-project-tests/lit.cfg.py b/cross-project-tests/lit.cfg.py index c2a8bcef26cbf..66fdd63632885 100644 --- a/cross-project-tests/lit.cfg.py +++ b/cross-project-tests/lit.cfg.py @@ -223,7 +223,7 @@ def can_target_host(): xcode_lldb_vers = subprocess.check_output(["xcrun", "lldb", "--version"]).decode( "utf-8" ) - match = re.search("lldb-(\d+)", xcode_lldb_vers) + match = re.search(r"lldb-(\d+)", xcode_lldb_vers) if match: apple_lldb_vers = int(match.group(1)) if apple_lldb_vers < 1000: @@ -247,7 +247,7 @@ def get_gdb_version_string(): if len(gdb_vers_lines) < 1: print("Unkown GDB version format (too few lines)", file=sys.stderr) return None - match = re.search("GNU gdb \(.*?\) ((\d|\.)+)", gdb_vers_lines[0].strip()) + match = re.search(r"GNU gdb \(.*?\) ((\d|\.)+)", gdb_vers_lines[0].strip()) if match is None: print(f"Unkown GDB version format: {gdb_vers_lines[0]}", file=sys.stderr) return None @@ -261,7 +261,7 @@ def get_clang_default_dwarf_version_string(triple): # Get the flags passed by the driver and look for -dwarf-version. cmd = f'{llvm_config.use_llvm_tool("clang")} -g -xc -c - -v -### --target={triple}' stderr = subprocess.run(cmd.split(), stderr=subprocess.PIPE).stderr.decode() - match = re.search("-dwarf-version=(\d+)", stderr) + match = re.search(r"-dwarf-version=(\d+)", stderr) if match is None: print("Cannot determine default dwarf version", file=sys.stderr) return None From 7fcfbd87c42b1695b60f4d3613a7811c412f1260 Mon Sep 17 00:00:00 2001 From: Akshat Oke Date: Mon, 13 Jan 2025 17:52:30 +0530 Subject: [PATCH 041/102] [AMDGPU][NewPM] Port AMDGPUOpenCLEnqueuedBlockLowering to NPM (#122434) --- llvm/lib/Target/AMDGPU/AMDGPU.h | 6 +-- .../AMDGPUOpenCLEnqueuedBlockLowering.cpp | 37 ++++++++++++++----- .../AMDGPUOpenCLEnqueuedBlockLowering.h | 23 ++++++++++++ llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 1 + .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 7 ++-- llvm/test/CodeGen/AMDGPU/enqueue-kernel.ll | 1 + 6 files changed, 60 insertions(+), 15 deletions(-) create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.h diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 78667e628ec1e..400c5f219cc70 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -444,9 +444,9 @@ void initializeAMDGPUExternalAAWrapperPass(PassRegistry&); void initializeAMDGPUArgumentUsageInfoPass(PassRegistry &); -ModulePass *createAMDGPUOpenCLEnqueuedBlockLoweringPass(); -void initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(PassRegistry &); -extern char &AMDGPUOpenCLEnqueuedBlockLoweringID; +ModulePass *createAMDGPUOpenCLEnqueuedBlockLoweringLegacyPass(); +void initializeAMDGPUOpenCLEnqueuedBlockLoweringLegacyPass(PassRegistry &); +extern char &AMDGPUOpenCLEnqueuedBlockLoweringLegacyID; void initializeGCNNSAReassignPass(PassRegistry &); extern char &GCNNSAReassignID; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp index 4f5ca08b46c13..fbd15ad176e3b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp @@ -31,6 +31,7 @@ // 
//===----------------------------------------------------------------------===// +#include "AMDGPUOpenCLEnqueuedBlockLowering.h" #include "AMDGPU.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/SmallString.h" @@ -48,11 +49,16 @@ using namespace llvm; namespace { /// Lower enqueued blocks. -class AMDGPUOpenCLEnqueuedBlockLowering : public ModulePass { +class AMDGPUOpenCLEnqueuedBlockLowering { +public: + bool run(Module &M); +}; + +class AMDGPUOpenCLEnqueuedBlockLoweringLegacy : public ModulePass { public: static char ID; - explicit AMDGPUOpenCLEnqueuedBlockLowering() : ModulePass(ID) {} + explicit AMDGPUOpenCLEnqueuedBlockLoweringLegacy() : ModulePass(ID) {} private: bool runOnModule(Module &M) override; @@ -60,19 +66,32 @@ class AMDGPUOpenCLEnqueuedBlockLowering : public ModulePass { } // end anonymous namespace -char AMDGPUOpenCLEnqueuedBlockLowering::ID = 0; +char AMDGPUOpenCLEnqueuedBlockLoweringLegacy::ID = 0; -char &llvm::AMDGPUOpenCLEnqueuedBlockLoweringID = - AMDGPUOpenCLEnqueuedBlockLowering::ID; +char &llvm::AMDGPUOpenCLEnqueuedBlockLoweringLegacyID = + AMDGPUOpenCLEnqueuedBlockLoweringLegacy::ID; -INITIALIZE_PASS(AMDGPUOpenCLEnqueuedBlockLowering, DEBUG_TYPE, +INITIALIZE_PASS(AMDGPUOpenCLEnqueuedBlockLoweringLegacy, DEBUG_TYPE, "Lower OpenCL enqueued blocks", false, false) -ModulePass* llvm::createAMDGPUOpenCLEnqueuedBlockLoweringPass() { - return new AMDGPUOpenCLEnqueuedBlockLowering(); +ModulePass *llvm::createAMDGPUOpenCLEnqueuedBlockLoweringLegacyPass() { + return new AMDGPUOpenCLEnqueuedBlockLoweringLegacy(); +} + +bool AMDGPUOpenCLEnqueuedBlockLoweringLegacy::runOnModule(Module &M) { + AMDGPUOpenCLEnqueuedBlockLowering Impl; + return Impl.run(M); +} + +PreservedAnalyses +AMDGPUOpenCLEnqueuedBlockLoweringPass::run(Module &M, ModuleAnalysisManager &) { + AMDGPUOpenCLEnqueuedBlockLowering Impl; + if (Impl.run(M)) + return PreservedAnalyses::none(); + return PreservedAnalyses::all(); } -bool AMDGPUOpenCLEnqueuedBlockLowering::runOnModule(Module &M) { +bool AMDGPUOpenCLEnqueuedBlockLowering::run(Module &M) { DenseSet Callers; auto &C = M.getContext(); bool Changed = false; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.h new file mode 100644 index 0000000000000..16ed7c18d8523 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.h @@ -0,0 +1,23 @@ +//===- AMDGPUOpenCLEnqueuedBlockLowering.h -----------------------*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_OPENCLENQUEUEDBLOCKLOWERING_H +#define LLVM_LIB_TARGET_AMDGPU_OPENCLENQUEUEDBLOCKLOWERING_H + +#include "llvm/IR/PassManager.h" + +namespace llvm { +class AMDGPUOpenCLEnqueuedBlockLoweringPass + : public PassInfoMixin { +public: + AMDGPUOpenCLEnqueuedBlockLoweringPass() = default; + PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM); +}; +} // namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_OPENCLENQUEUEDBLOCKLOWERING_H diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index da594be992cb4..6f322074ba74c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -21,6 +21,7 @@ MODULE_PASS("amdgpu-lower-buffer-fat-pointers", AMDGPULowerBufferFatPointersPass(*this)) MODULE_PASS("amdgpu-lower-ctor-dtor", AMDGPUCtorDtorLoweringPass()) MODULE_PASS("amdgpu-sw-lower-lds", AMDGPUSwLowerLDSPass(*this)) +MODULE_PASS("amdgpu-lower-enqueued-block", AMDGPUOpenCLEnqueuedBlockLoweringPass()) MODULE_PASS("amdgpu-lower-module-lds", AMDGPULowerModuleLDSPass(*this)) MODULE_PASS("amdgpu-perf-hint", AMDGPUPerfHintAnalysisPass( diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index f8b60630bb7f6..6d4547dbc82c3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -22,6 +22,7 @@ #include "AMDGPUIGroupLP.h" #include "AMDGPUISelDAGToDAG.h" #include "AMDGPUMacroFusion.h" +#include "AMDGPUOpenCLEnqueuedBlockLowering.h" #include "AMDGPUPerfHintAnalysis.h" #include "AMDGPURemoveIncompatibleFunctions.h" #include "AMDGPUSplitModule.h" @@ -501,7 +502,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeAMDGPULowerKernelArgumentsPass(*PR); initializeAMDGPUPromoteKernelArgumentsPass(*PR); initializeAMDGPULowerKernelAttributesPass(*PR); - initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR); + initializeAMDGPUOpenCLEnqueuedBlockLoweringLegacyPass(*PR); initializeAMDGPUPostLegalizerCombinerPass(*PR); initializeAMDGPUPreLegalizerCombinerPass(*PR); initializeAMDGPURegBankCombinerPass(*PR); @@ -1175,7 +1176,7 @@ void AMDGPUPassConfig::addIRPasses() { addPass(createR600OpenCLImageTypeLoweringPass()); // Replace OpenCL enqueued block function pointers with global variables. - addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass()); + addPass(createAMDGPUOpenCLEnqueuedBlockLoweringLegacyPass()); // Lower LDS accesses to global memory pass if address sanitizer is enabled. 
if (EnableSwLowerLDS) @@ -1944,7 +1945,7 @@ void AMDGPUCodeGenPassBuilder::addIRPasses(AddIRPass &addPass) const { addPass(AMDGPUAlwaysInlinePass()); addPass(AlwaysInlinerPass()); - // TODO: Missing OpenCLEnqueuedBlockLowering + addPass(AMDGPUOpenCLEnqueuedBlockLoweringPass()); // Runs before PromoteAlloca so the latter can account for function uses if (EnableLowerModuleLDS) diff --git a/llvm/test/CodeGen/AMDGPU/enqueue-kernel.ll b/llvm/test/CodeGen/AMDGPU/enqueue-kernel.ll index 9391b50c04a5f..d7c8e47f98883 100644 --- a/llvm/test/CodeGen/AMDGPU/enqueue-kernel.ll +++ b/llvm/test/CodeGen/AMDGPU/enqueue-kernel.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-attributes --check-globals --include-generated-funcs ; RUN: opt -data-layout=A5 -amdgpu-lower-enqueued-block -S < %s | FileCheck %s +; RUN: opt -data-layout=A5 -mtriple=amdgcn -passes=amdgpu-lower-enqueued-block -S < %s | FileCheck %s %struct.ndrange_t = type { i32 } %opencl.queue_t = type opaque From 7cc3a34951c788e15e6583a7b6e926cbcede0c73 Mon Sep 17 00:00:00 2001 From: Sergio Afonso Date: Mon, 13 Jan 2025 12:31:29 +0000 Subject: [PATCH 042/102] [Flang][OpenMP] Support teams reductions lowering (#122683) This patch adds PFT to MLIR lowering of teams reductions. Since there is still no MLIR to LLVM IR translation implemented, compilation of programs including these constructs will still trigger not-yet-implemented errors. --- flang/lib/Lower/OpenMP/OpenMP.cpp | 33 ++++++++++++++----- .../Lower/OpenMP/Todo/reduction-teams.f90 | 12 ------- flang/test/Lower/OpenMP/reduction-teams.f90 | 18 ++++++++++ 3 files changed, 42 insertions(+), 21 deletions(-) delete mode 100644 flang/test/Lower/OpenMP/Todo/reduction-teams.f90 create mode 100644 flang/test/Lower/OpenMP/reduction-teams.f90 diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index c71fd598d5c8a..8a1029426d30c 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -1336,19 +1336,18 @@ static void genWorkshareClauses(lower::AbstractConverter &converter, cp.processNowait(clauseOps); } -static void genTeamsClauses(lower::AbstractConverter &converter, - semantics::SemanticsContext &semaCtx, - lower::StatementContext &stmtCtx, - const List &clauses, mlir::Location loc, - mlir::omp::TeamsOperands &clauseOps) { +static void genTeamsClauses( + lower::AbstractConverter &converter, semantics::SemanticsContext &semaCtx, + lower::StatementContext &stmtCtx, const List &clauses, + mlir::Location loc, mlir::omp::TeamsOperands &clauseOps, + llvm::SmallVectorImpl &reductionSyms) { ClauseProcessor cp(converter, semaCtx, clauses); cp.processAllocate(clauseOps); cp.processIf(llvm::omp::Directive::OMPD_teams, clauseOps); cp.processNumTeams(stmtCtx, clauseOps); cp.processThreadLimit(stmtCtx, clauseOps); + cp.processReduction(loc, clauseOps, reductionSyms); // TODO Support delayed privatization. 
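For readers skimming the diff, the shape of the change is that genTeamsClauses
now also collects the reduction symbols, and genTeamsOp installs an entry-block
callback so the region body refers to the reduction block arguments rather than
the host values. A condensed sketch of that callback (names taken from the
patch below; simplified, and privatization is still a TODO):

  // One entry-block argument is created per reduction variable; the Fortran
  // symbols are then re-bound to those arguments so that code generated for
  // the region body uses them instead of the host SSA values.
  auto genRegionEntryCB = [&](mlir::Operation *op) {
    genEntryBlock(converter.getFirOpBuilder(), args, op->getRegion(0));
    bindEntryBlockArgs(converter, llvm::cast<mlir::omp::TeamsOp>(op), args);
    return llvm::to_vector(args.getSyms());
  };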
- - cp.processTODO(loc, llvm::omp::Directive::OMPD_teams); } static void genWsloopClauses( @@ -2015,13 +2014,29 @@ genTeamsOp(lower::AbstractConverter &converter, lower::SymMap &symTable, mlir::Location loc, const ConstructQueue &queue, ConstructQueue::const_iterator item) { lower::StatementContext stmtCtx; + mlir::omp::TeamsOperands clauseOps; - genTeamsClauses(converter, semaCtx, stmtCtx, item->clauses, loc, clauseOps); + llvm::SmallVector reductionSyms; + genTeamsClauses(converter, semaCtx, stmtCtx, item->clauses, loc, clauseOps, + reductionSyms); + + EntryBlockArgs args; + // TODO: Add private syms and vars. + args.reduction.syms = reductionSyms; + args.reduction.vars = clauseOps.reductionVars; + + auto genRegionEntryCB = [&](mlir::Operation *op) { + genEntryBlock(converter.getFirOpBuilder(), args, op->getRegion(0)); + bindEntryBlockArgs( + converter, llvm::cast(op), args); + return llvm::to_vector(args.getSyms()); + }; return genOpWithBody( OpWithBodyGenInfo(converter, symTable, semaCtx, loc, eval, llvm::omp::Directive::OMPD_teams) - .setClauses(&item->clauses), + .setClauses(&item->clauses) + .setGenRegionEntryCb(genRegionEntryCB), queue, item, clauseOps); } diff --git a/flang/test/Lower/OpenMP/Todo/reduction-teams.f90 b/flang/test/Lower/OpenMP/Todo/reduction-teams.f90 deleted file mode 100644 index db4839593c7e7..0000000000000 --- a/flang/test/Lower/OpenMP/Todo/reduction-teams.f90 +++ /dev/null @@ -1,12 +0,0 @@ -! RUN: %not_todo_cmd bbc -emit-fir -fopenmp -o - %s 2>&1 | FileCheck %s -! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -o - %s 2>&1 | FileCheck %s - -! CHECK: not yet implemented: Unhandled clause REDUCTION in TEAMS construct -subroutine reduction_teams() - integer :: i - i = 0 - - !$omp teams reduction(+:i) - i = i + 1 - !$omp end teams -end subroutine reduction_teams diff --git a/flang/test/Lower/OpenMP/reduction-teams.f90 b/flang/test/Lower/OpenMP/reduction-teams.f90 new file mode 100644 index 0000000000000..6997e774c2d42 --- /dev/null +++ b/flang/test/Lower/OpenMP/reduction-teams.f90 @@ -0,0 +1,18 @@ +! RUN: bbc -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s +! RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s + +! CHECK: omp.declare_reduction @[[RED:.*]] : i32 init { + +! CHECK: func.func @_QPreduction_teams() { +subroutine reduction_teams() + integer :: i + i = 0 + + ! CHECK: omp.teams reduction(@[[RED]] %{{.*}}#0 -> %[[PRIV_I:.*]] : !fir.ref) { + !$omp teams reduction(+:i) + ! CHECK: %[[DECL_I:.*]]:2 = hlfir.declare %[[PRIV_I]] + ! CHECK: %{{.*}} = fir.load %[[DECL_I]]#0 : !fir.ref + ! 
CHECK: hlfir.assign %{{.*}} to %[[DECL_I]]#0 : i32, !fir.ref + i = i + 1 + !$omp end teams +end subroutine reduction_teams From 67cdf2a138889c6e7837822aff995659b1751bbb Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 13 Jan 2025 19:35:56 +0700 Subject: [PATCH 043/102] AMDGPU: Add gfx9 run line to scalar_to_vector test (#122659) --- llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll | 104 +++++++++++++++++-- 1 file changed, 93 insertions(+), 11 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll index e8f86a6ce63ff..949e6f38e9b42 100644 --- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll +++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=SI -; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=VI +; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefixes=GFX89,VI +; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs -| FileCheck %s --check-prefixes=GFX89,GFX9 ; XXX - Why the packing? define amdgpu_kernel void @scalar_to_vector_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { @@ -43,6 +44,27 @@ define amdgpu_kernel void @scalar_to_vector_v2i32(ptr addrspace(1) %out, ptr add ; VI-NEXT: v_mov_b32_e32 v1, v0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm +; +; GFX9-LABEL: scalar_to_vector_v2i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff0000 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX9-NEXT: s_endpgm %tmp1 = load i32, ptr addrspace(1) %in, align 4 %bc = bitcast i32 %tmp1 to <2 x i16> %tmp2 = shufflevector <2 x i16> %bc, <2 x i16> undef, <4 x i32> @@ -90,6 +112,27 @@ define amdgpu_kernel void @scalar_to_vector_v2f32(ptr addrspace(1) %out, ptr add ; VI-NEXT: v_mov_b32_e32 v1, v0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm +; +; GFX9-LABEL: scalar_to_vector_v2f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff0000 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: 
buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX9-NEXT: s_endpgm %tmp1 = load float, ptr addrspace(1) %in, align 4 %bc = bitcast float %tmp1 to <2 x i16> %tmp2 = shufflevector <2 x i16> %bc, <2 x i16> undef, <4 x i32> @@ -130,6 +173,23 @@ define amdgpu_kernel void @scalar_to_vector_v4i16() { ; VI-NEXT: v_mov_b32_e32 v1, s0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX9-LABEL: scalar_to_vector_v4i16: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: s_lshl_b32 s1, s0, 8 +; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: s_and_b32 s1, s0, 0xffff +; GFX9-NEXT: s_lshl_b32 s0, s0, 16 +; GFX9-NEXT: s_or_b32 s0, s1, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: s_endpgm bb: %tmp = load <2 x i8>, ptr addrspace(1) undef, align 1 %tmp1 = shufflevector <2 x i8> %tmp, <2 x i8> zeroinitializer, <8 x i32> @@ -176,6 +236,28 @@ define amdgpu_kernel void @scalar_to_vector_v4f16() { ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX9-LABEL: scalar_to_vector_v4f16: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: s_lshl_b32 s1, s0, 8 +; GFX9-NEXT: s_or_b32 s0, s1, s0 +; GFX9-NEXT: s_and_b32 s1, s0, 0xff00 +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x80008 +; GFX9-NEXT: s_or_b32 s1, s4, s1 +; GFX9-NEXT: s_and_b32 s0, s0, 0xffff +; GFX9-NEXT: s_and_b32 s4, s1, 0xffff +; GFX9-NEXT: s_lshl_b32 s1, s1, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s1 +; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: s_endpgm bb: %load = load half, ptr addrspace(1) undef, align 1 %tmp = bitcast half %load to <2 x i8> @@ -235,16 +317,16 @@ define amdgpu_kernel void @scalar_to_vector_test6(ptr addrspace(1) %out, i8 zero ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; -; VI-LABEL: scalar_to_vector_test6: -; VI: ; %bb.0: -; VI-NEXT: s_load_dword s6, s[4:5], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; VI-NEXT: s_endpgm +; GFX89-LABEL: scalar_to_vector_test6: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX89-NEXT: s_mov_b32 s3, 0xf000 +; GFX89-NEXT: s_mov_b32 s2, -1 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: v_mov_b32_e32 v0, s6 +; GFX89-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX89-NEXT: s_endpgm %newvec0 = insertelement <4 x i8> undef, i8 %val, i32 0 %bc = bitcast <4 x i8> %newvec0 to <2 x half> store <2 x half> %bc, ptr addrspace(1) %out From 9d5264b2e616c628e0bb752d7223b6dce83ee23a Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 13 Jan 2025 19:38:58 +0700 Subject: [PATCH 044/102] DAG: Fold bitcast of scalar_to_vector to anyext (#122660) scalar_to_vector is difficult to make appear and test, but I found one case where this makes an observable difference. 
It fires more often than this in the test suite, but most of them have no net result in the final code. This helps reduce regressions in a future commit. --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 8 +++++ llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll | 30 +++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index da3c834417d6b..02b79c67af3ee 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -16012,6 +16012,14 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) { if (SDValue CombineLD = CombineConsecutiveLoads(N0.getNode(), VT)) return CombineLD; + // int_vt (bitcast (vec_vt (scalar_to_vector elt_vt:x))) + // => int_vt (any_extend elt_vt:x) + if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR && VT.isScalarInteger()) { + SDValue SrcScalar = N0.getOperand(0); + if (SrcScalar.getValueType().isScalarInteger()) + return DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), VT, SrcScalar); + } + // Remove double bitcasts from shuffles - this is often a legacy of // XformToShuffleWithZero being used to combine bitmaskings (of // float vectors bitcast to integer vectors) into shuffles. diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll index 949e6f38e9b42..e14666cdac5c2 100644 --- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll +++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll @@ -332,3 +332,33 @@ define amdgpu_kernel void @scalar_to_vector_test6(ptr addrspace(1) %out, i8 zero store <2 x half> %bc, ptr addrspace(1) %out ret void } + +; bitcast (scalar_to_vector x) -> any_extend x +define i64 @bitcast_combine_scalar_to_vector_v4i16(i16 %arg) { +; SI-LABEL: bitcast_combine_scalar_to_vector_v4i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xff00, v0 +; SI-NEXT: v_bfe_u32 v0, v0, 8, 8 +; SI-NEXT: v_or_b32_e32 v2, v0, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v3 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: bitcast_combine_scalar_to_vector_v4i16: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: v_and_b32_e32 v1, 0xffffff00, v0 +; GFX89-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX89-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX89-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX89-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX89-NEXT: s_setpc_b64 s[30:31] + %arg.cast = bitcast i16 %arg to <2 x i8> + %tmp1 = shufflevector <2 x i8> %arg.cast, <2 x i8> poison, <8 x i32> + %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> poison, <8 x i32> + %cast = bitcast <8 x i8> %tmp2 to i64 + ret i64 %cast +} From 9a9b71c5967186216e8e0460dce755dcbe3870a7 Mon Sep 17 00:00:00 2001 From: Maksim Ivanov Date: Mon, 13 Jan 2025 13:42:22 +0100 Subject: [PATCH 045/102] [clang] Refactor attr diagnostics to use %select (#122473) A cleanup follow-up to #118501 and #118567. 
--- clang/examples/Attribute/Attribute.cpp | 9 +++++---- .../CallSuperAttribute/CallSuperAttrInfo.cpp | 5 +++-- clang/include/clang/Basic/DiagnosticSemaKinds.td | 9 ++++++++- clang/include/clang/Sema/ParsedAttr.h | 7 +++++++ clang/lib/Parse/ParseDecl.cpp | 5 +++-- clang/lib/Sema/SemaDeclAttr.cpp | 12 ++++++------ clang/lib/Sema/SemaSwift.cpp | 4 ++-- clang/lib/Sema/SemaType.cpp | 9 +++++---- 8 files changed, 39 insertions(+), 21 deletions(-) diff --git a/clang/examples/Attribute/Attribute.cpp b/clang/examples/Attribute/Attribute.cpp index 3b90724ad2220..625f1645afbff 100644 --- a/clang/examples/Attribute/Attribute.cpp +++ b/clang/examples/Attribute/Attribute.cpp @@ -42,8 +42,8 @@ struct ExampleAttrInfo : public ParsedAttrInfo { const Decl *D) const override { // This attribute appertains to functions only. if (!isa(D)) { - S.Diag(Attr.getLoc(), diag::warn_attribute_wrong_decl_type_str) - << Attr << Attr.isRegularKeywordAttribute() << "functions"; + S.Diag(Attr.getLoc(), diag::warn_attribute_wrong_decl_type) + << Attr << Attr.isRegularKeywordAttribute() << ExpectedFunction; return false; } return true; @@ -99,8 +99,9 @@ struct ExampleAttrInfo : public ParsedAttrInfo { const Stmt *St) const override { // This attribute appertains to for loop statements only. if (!isa(St)) { - S.Diag(Attr.getLoc(), diag::warn_attribute_wrong_decl_type_str) - << Attr << Attr.isRegularKeywordAttribute() << "for loop statements"; + S.Diag(Attr.getLoc(), diag::warn_attribute_wrong_decl_type) + << Attr << Attr.isRegularKeywordAttribute() + << ExpectedForLoopStatement; return false; } return true; diff --git a/clang/examples/CallSuperAttribute/CallSuperAttrInfo.cpp b/clang/examples/CallSuperAttribute/CallSuperAttrInfo.cpp index 12d4c311586e6..f206a84ab1311 100644 --- a/clang/examples/CallSuperAttribute/CallSuperAttrInfo.cpp +++ b/clang/examples/CallSuperAttribute/CallSuperAttrInfo.cpp @@ -168,8 +168,9 @@ struct CallSuperAttrInfo : public ParsedAttrInfo { const Decl *D) const override { const auto *TheMethod = dyn_cast_or_null(D); if (!TheMethod || !TheMethod->isVirtual()) { - S.Diag(Attr.getLoc(), diag::warn_attribute_wrong_decl_type_str) - << Attr << Attr.isRegularKeywordAttribute() << "virtual functions"; + S.Diag(Attr.getLoc(), diag::warn_attribute_wrong_decl_type) + << Attr << Attr.isRegularKeywordAttribute() + << ExpectedVirtualFunction; return false; } MarkedMethods.insert(TheMethod); diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index f04381a32a415..8be4f946dce1c 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -3799,7 +3799,14 @@ def warn_attribute_wrong_decl_type : Warning< "|types and namespaces" "|variables, functions and classes" "|kernel functions" - "|non-K&R-style functions}2">, + "|non-K&R-style functions" + "|for loop statements" + "|virtual functions" + "|parameters and implicit object parameters" + "|non-member functions" + "|functions, classes, or enumerations" + "|classes" + "|typedefs}2">, InGroup; def err_attribute_wrong_decl_type : Error; def warn_type_attribute_wrong_type : Warning< diff --git a/clang/include/clang/Sema/ParsedAttr.h b/clang/include/clang/Sema/ParsedAttr.h index 4fa5fbdb5a7f6..e1faab205f647 100644 --- a/clang/include/clang/Sema/ParsedAttr.h +++ b/clang/include/clang/Sema/ParsedAttr.h @@ -1099,6 +1099,13 @@ enum AttributeDeclKind { ExpectedFunctionVariableOrClass, ExpectedKernelFunction, ExpectedFunctionWithProtoType, + 
ExpectedForLoopStatement, + ExpectedVirtualFunction, + ExpectedParameterOrImplicitObjectParameter, + ExpectedNonMemberFunction, + ExpectedFunctionOrClassOrEnum, + ExpectedClass, + ExpectedTypedef, }; inline const StreamingDiagnostic &operator<<(const StreamingDiagnostic &DB, diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp index 7f3f6d568e28c..f136d5007e8a5 100644 --- a/clang/lib/Parse/ParseDecl.cpp +++ b/clang/lib/Parse/ParseDecl.cpp @@ -24,6 +24,7 @@ #include "clang/Parse/RAIIObjectsForParser.h" #include "clang/Sema/EnterExpressionEvaluationContext.h" #include "clang/Sema/Lookup.h" +#include "clang/Sema/ParsedAttr.h" #include "clang/Sema/ParsedTemplate.h" #include "clang/Sema/Scope.h" #include "clang/Sema/SemaCUDA.h" @@ -3708,9 +3709,9 @@ void Parser::ParseDeclarationSpecifiers( continue; if (PA.getKind() == ParsedAttr::AT_LifetimeBound) - Diag(PA.getLoc(), diag::err_attribute_wrong_decl_type_str) + Diag(PA.getLoc(), diag::err_attribute_wrong_decl_type) << PA << PA.isRegularKeywordAttribute() - << "parameters and implicit object parameters"; + << ExpectedParameterOrImplicitObjectParameter; else Diag(PA.getLoc(), diag::err_attribute_not_type_attr) << PA << PA.isRegularKeywordAttribute(); diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index bb4d33560b93b..c1663f2d15c88 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -1868,8 +1868,8 @@ static void handleNakedAttr(Sema &S, Decl *D, const ParsedAttr &AL) { // This form is not allowed to be written on a member function (static or // nonstatic) when in Microsoft compatibility mode. if (S.getLangOpts().MSVCCompat && isa(D)) { - S.Diag(AL.getLoc(), diag::err_attribute_wrong_decl_type_str) - << AL << AL.isRegularKeywordAttribute() << "non-member functions"; + S.Diag(AL.getLoc(), diag::err_attribute_wrong_decl_type) + << AL << AL.isRegularKeywordAttribute() << ExpectedNonMemberFunction; return; } } @@ -2761,9 +2761,9 @@ static void handleWarnUnusedResult(Sema &S, Decl *D, const ParsedAttr &AL) { // The standard attribute cannot be applied to variable declarations such // as a function pointer. if (isa(D)) - S.Diag(AL.getLoc(), diag::warn_attribute_wrong_decl_type_str) + S.Diag(AL.getLoc(), diag::warn_attribute_wrong_decl_type) << AL << AL.isRegularKeywordAttribute() - << "functions, classes, or enumerations"; + << ExpectedFunctionOrClassOrEnum; // If this is spelled as the standard C++17 attribute, but not in C++17, // warn about using it as an extension. 
If there are attribute arguments, @@ -5555,8 +5555,8 @@ static void handleNullableTypeAttr(Sema &S, Decl *D, const ParsedAttr &AL) { if (auto *CRD = dyn_cast(D); !CRD || !(CRD->isClass() || CRD->isStruct())) { - S.Diag(AL.getRange().getBegin(), diag::err_attribute_wrong_decl_type_str) - << AL << AL.isRegularKeywordAttribute() << "classes"; + S.Diag(AL.getRange().getBegin(), diag::err_attribute_wrong_decl_type) + << AL << AL.isRegularKeywordAttribute() << ExpectedClass; return; } diff --git a/clang/lib/Sema/SemaSwift.cpp b/clang/lib/Sema/SemaSwift.cpp index 24fdfb8e57dc3..fe72d6c85c37a 100644 --- a/clang/lib/Sema/SemaSwift.cpp +++ b/clang/lib/Sema/SemaSwift.cpp @@ -650,8 +650,8 @@ void SemaSwift::handleNewType(Decl *D, const ParsedAttr &AL) { } if (!isa(D)) { - Diag(AL.getLoc(), diag::warn_attribute_wrong_decl_type_str) - << AL << AL.isRegularKeywordAttribute() << "typedefs"; + Diag(AL.getLoc(), diag::warn_attribute_wrong_decl_type) + << AL << AL.isRegularKeywordAttribute() << ExpectedTypedef; return; } diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp index e3ec327c1b364..2ccf5a8e1d6f3 100644 --- a/clang/lib/Sema/SemaType.cpp +++ b/clang/lib/Sema/SemaType.cpp @@ -7983,8 +7983,9 @@ static bool handleFunctionTypeAttr(TypeProcessingState &state, ParsedAttr &attr, if (!FnTy) { // SME ACLE attributes are not supported on K&R-style unprototyped C // functions. - S.Diag(attr.getLoc(), diag::warn_attribute_wrong_decl_type) << - attr << attr.isRegularKeywordAttribute() << ExpectedFunctionWithProtoType; + S.Diag(attr.getLoc(), diag::warn_attribute_wrong_decl_type) + << attr << attr.isRegularKeywordAttribute() + << ExpectedFunctionWithProtoType; attr.setInvalid(); return false; } @@ -8676,9 +8677,9 @@ static void HandleLifetimeBoundAttr(TypeProcessingState &State, CurType, CurType); return; } - State.getSema().Diag(Attr.getLoc(), diag::err_attribute_wrong_decl_type_str) + State.getSema().Diag(Attr.getLoc(), diag::err_attribute_wrong_decl_type) << Attr << Attr.isRegularKeywordAttribute() - << "parameters and implicit object parameters"; + << ExpectedParameterOrImplicitObjectParameter; } static void HandleLifetimeCaptureByAttr(TypeProcessingState &State, From 462d407198cce8e97eb1cb5c7b546cb868bf4dc6 Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Mon, 13 Jan 2025 20:58:38 +0800 Subject: [PATCH 046/102] [LV] Fix FindLastIV reduction for epilogue vectorization. (#120395) Following 0e528ac404e13ed2d952a2d83aaf8383293c851e, this patch adjusts the resume value of VPReductionPHIRecipe for FindLastIV reductions. Replacing the resume value with: ResumeValue = ResumeValue == StartValue ? SentinelValue : ResumeValue; This addressed the correctness issue when the start value might not be less than the minimum value of a monotonically increasing induction variable. Thanks Florian Hahn for the help. 
--------- Co-authored-by: Florian Hahn --- .../Transforms/Vectorize/LoopVectorize.cpp | 27 +++++++++++++++++++ .../LoopVectorize/epilog-iv-select-cmp.ll | 8 ++++-- 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 0a13ce902795e..ee352c0b12302 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7691,6 +7691,20 @@ static void fixReductionScalarResumeWhenVectorizingEpilog( "AnyOf expected to start by comparing main resume value to original " "start value"); MainResumeValue = Cmp->getOperand(0); + } else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind( + RdxDesc.getRecurrenceKind())) { + using namespace llvm::PatternMatch; + Value *Cmp, *OrigResumeV; + bool IsExpectedPattern = + match(MainResumeValue, m_Select(m_OneUse(m_Value(Cmp)), + m_Specific(RdxDesc.getSentinelValue()), + m_Value(OrigResumeV))) && + match(Cmp, + m_SpecificICmp(ICmpInst::ICMP_EQ, m_Specific(OrigResumeV), + m_Specific(RdxDesc.getRecurrenceStartValue()))); + assert(IsExpectedPattern && "Unexpected reduction resume pattern"); + (void)IsExpectedPattern; + MainResumeValue = OrigResumeV; } PHINode *MainResumePhi = cast(MainResumeValue); @@ -10413,6 +10427,19 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L, cast(ResumeV)->getParent()->getFirstNonPHI()); ResumeV = Builder.CreateICmpNE(ResumeV, RdxDesc.getRecurrenceStartValue()); + } else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) { + // VPReductionPHIRecipe for FindLastIV reductions requires an adjustment + // to the resume value. The resume value is adjusted to the sentinel + // value when the final value from the main vector loop equals the start + // value. This ensures correctness when the start value might not be + // less than the minimum value of a monotonically increasing induction + // variable. 
+ IRBuilder<> Builder( + cast(ResumeV)->getParent()->getFirstNonPHI()); + Value *Cmp = + Builder.CreateICmpEQ(ResumeV, RdxDesc.getRecurrenceStartValue()); + ResumeV = + Builder.CreateSelect(Cmp, RdxDesc.getSentinelValue(), ResumeV); } } else { // Retrieve the induction resume values for wide inductions from diff --git a/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll index 052b4a10e9c8d..06f0f05889116 100644 --- a/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll +++ b/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll @@ -40,7 +40,9 @@ define i64 @select_icmp_const(ptr %a, i64 %n) { ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]] ; CHECK: [[VEC_EPILOG_PH]]: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 3, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX1:%.*]] = phi i64 [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 3, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[BC_MERGE_RDX1]], 3 +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = select i1 [[TMP14]], i64 -9223372036854775808, i64 [[BC_MERGE_RDX1]] ; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i64 [[N]], 4 ; CHECK-NEXT: [[N_VEC3:%.*]] = sub i64 [[N]], [[N_MOD_VF2]] ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[BC_RESUME_VAL]], i64 0 @@ -144,7 +146,9 @@ define i64 @select_fcmp_const_fast(ptr %a, i64 %n) { ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]] ; CHECK: [[VEC_EPILOG_PH]]: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 2, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX1:%.*]] = phi i64 [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 2, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[BC_MERGE_RDX1]], 2 +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = select i1 [[TMP14]], i64 -9223372036854775808, i64 [[BC_MERGE_RDX1]] ; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i64 [[N]], 4 ; CHECK-NEXT: [[N_VEC3:%.*]] = sub i64 [[N]], [[N_MOD_VF2]] ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[BC_RESUME_VAL]], i64 0 From 47139172061f8bd564eb0613dfc8d1eb7b116742 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Don=C3=A1t=20Nagy?= Date: Mon, 13 Jan 2025 14:04:28 +0100 Subject: [PATCH 047/102] [NFC][analyzer][docs] Migrate 'annotations.html' to RST (#122246) This commit migrates the contents of 'annotations.html' in the old HTML-based documentation of the Clang static analyzer to the new RST-based documentation. During this conversion I reordered the sections of this documentation file by placing the section "Custom Assertion Handlers" as a subsection of "Annotations to Enhance Generic Checks". (The primary motivation was that Sphinx complained about inconsistent section levels; with this change I preserved that sections describing individual annotations are all on the same level.) 
Apart from this change and the format conversion, I didn't review, validate or edit the contents of this documentation file because I think it would be better to place any additional changes in separate commits. --- clang/docs/LanguageExtensions.rst | 8 +- clang/docs/UsersManual.rst | 6 +- .../images/example_attribute_nonnull.png | Bin .../images/example_cf_returns_retained.png | Bin .../images/example_ns_returns_retained.png | Bin clang/docs/analyzer/user-docs.rst | 1 + clang/docs/analyzer/user-docs/Annotations.rst | 689 ++++++++++++++++ clang/docs/analyzer/user-docs/FAQ.rst | 6 +- clang/include/clang/Basic/AttrDocs.td | 2 +- clang/www/analyzer/annotations.html | 766 +----------------- 10 files changed, 708 insertions(+), 770 deletions(-) rename clang/{www => docs}/analyzer/images/example_attribute_nonnull.png (100%) rename clang/{www => docs}/analyzer/images/example_cf_returns_retained.png (100%) rename clang/{www => docs}/analyzer/images/example_ns_returns_retained.png (100%) create mode 100644 clang/docs/analyzer/user-docs/Annotations.rst diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst index e020710c7aa4f..2eb0777dbdc6c 100644 --- a/clang/docs/LanguageExtensions.rst +++ b/clang/docs/LanguageExtensions.rst @@ -2137,8 +2137,8 @@ method; it specifies that the method expects its ``self`` parameter to have a - (void) bar __attribute__((ns_consumes_self)); - (void) baz:(id) __attribute__((ns_consumed)) x; -Further examples of these attributes are available in the static analyzer's `list of annotations for analysis -`_. +Further examples of these attributes are available in the static analyzer's +`list of annotations for analysis `__. Query for these features with ``__has_attribute(ns_consumed)``, ``__has_attribute(ns_returns_retained)``, etc. @@ -4792,8 +4792,8 @@ Extensions for Static Analysis Clang supports additional attributes that are useful for documenting program invariants and rules for static analysis tools, such as the `Clang Static Analyzer `_. These attributes are documented -in the analyzer's `list of source-level annotations -`_. +in the analyzer's `list of annotations for analysis +`__. Extensions for Dynamic Analysis diff --git a/clang/docs/UsersManual.rst b/clang/docs/UsersManual.rst index 4de288250f3ad..260e84910c6f7 100644 --- a/clang/docs/UsersManual.rst +++ b/clang/docs/UsersManual.rst @@ -1364,10 +1364,8 @@ Controlling Static Analyzer Diagnostics While not strictly part of the compiler, the diagnostics from Clang's `static analyzer `_ can also be influenced by the user via changes to the source code. See the available -`annotations `_ and the -analyzer's `FAQ -page `_ for more -information. +`annotations `_ and the analyzer's +`FAQ page `_ for more information. .. 
_usersmanual-precompiled-headers:

diff --git a/clang/www/analyzer/images/example_attribute_nonnull.png b/clang/docs/analyzer/images/example_attribute_nonnull.png
similarity index 100%
rename from clang/www/analyzer/images/example_attribute_nonnull.png
rename to clang/docs/analyzer/images/example_attribute_nonnull.png
diff --git a/clang/www/analyzer/images/example_cf_returns_retained.png b/clang/docs/analyzer/images/example_cf_returns_retained.png
similarity index 100%
rename from clang/www/analyzer/images/example_cf_returns_retained.png
rename to clang/docs/analyzer/images/example_cf_returns_retained.png
diff --git a/clang/www/analyzer/images/example_ns_returns_retained.png b/clang/docs/analyzer/images/example_ns_returns_retained.png
similarity index 100%
rename from clang/www/analyzer/images/example_ns_returns_retained.png
rename to clang/docs/analyzer/images/example_ns_returns_retained.png
diff --git a/clang/docs/analyzer/user-docs.rst b/clang/docs/analyzer/user-docs.rst
index dd53ae143148c..e265f033a2c54 100644
--- a/clang/docs/analyzer/user-docs.rst
+++ b/clang/docs/analyzer/user-docs.rst
@@ -12,4 +12,5 @@ Contents:
    user-docs/FilingBugs
    user-docs/CrossTranslationUnit
    user-docs/TaintAnalysisConfiguration
+   user-docs/Annotations
    user-docs/FAQ
diff --git a/clang/docs/analyzer/user-docs/Annotations.rst b/clang/docs/analyzer/user-docs/Annotations.rst
new file mode 100644
index 0000000000000..d87e8f4df99c3
--- /dev/null
+++ b/clang/docs/analyzer/user-docs/Annotations.rst
@@ -0,0 +1,689 @@
+==================
+Source Annotations
+==================
+
+The Clang frontend supports several source-level annotations in the form of
+`GCC-style attributes `_
+and pragmas that can help make using the Clang Static Analyzer more useful.
+These annotations can help suppress false positives as well as enhance the
+analyzer's ability to find bugs.
+
+This page gives a practical overview of such annotations. For more technical
+specifics regarding Clang-specific annotations please see Clang's list of
+`language extensions `_.
+Details of "standard" GCC attributes (that Clang also supports) can
+be found in the `GCC manual `_, with the
+majority of the relevant attributes being in the section on
+`function attributes `_.
+
+Note that attributes that are labeled **Clang-specific** are not
+recognized by GCC. Their use can be conditioned using preprocessor macros
+(examples included on this page).
+
+.. contents::
+   :local:
+
+Annotations to Enhance Generic Checks
+_____________________________________
+
+Null Pointer Checking
+#####################
+
+Attribute 'nonnull'
+-------------------
+
+The analyzer recognizes the GCC attribute 'nonnull', which indicates that a
+function expects that a given function parameter is not a null pointer.
+Specific details of the syntax of using the 'nonnull' attribute can be found in
+`GCC's documentation `_.
+
+Both the Clang compiler and GCC will flag warnings for simple cases where a
+null pointer is directly being passed to a function with a 'nonnull' parameter
+(e.g., as a constant). The analyzer extends this checking by using its deeper
+symbolic analysis to track what pointer values are potentially null and then
+flag warnings when they are passed in a function call via a 'nonnull'
+parameter.
+
+**Example**
+
+.. code-block:: c
+
+  int bar(int*p, int q, int *r) __attribute__((nonnull(1,3)));
+
+  int foo(int *p, int *q) {
+     return !p ? bar(q, 2, p)
+               : bar(p, 2, q);
+  }
+
+Running ``scan-build`` over this source produces the following output:
+
+.. image:: ../images/example_attribute_nonnull.png
+
+.. _custom_assertion_handlers:
+
+Custom Assertion Handlers
+#########################
+
+The analyzer exploits code assertions by pruning off paths where the
+assertion condition is false. The idea is to capture any program invariants
+specified in the assertion that the developer may know but is not immediately
+apparent in the code itself. In this way assertions make implicit assumptions
+explicit in the code, which not only makes the analyzer more accurate when
+finding bugs, but can also help others better understand your code.
+It can also help remove certain kinds of analyzer false positives by pruning
+off false paths.
+
+In order to exploit assertions, however, the analyzer must understand when it
+encounters an "assertion handler". Typically assertions are
+implemented with a macro, with the macro performing a check for the assertion
+condition and, when the check fails, calling an assertion handler. For
+example, consider the following code fragment:
+
+.. code-block:: c
+
+  void foo(int *p) {
+    assert(p != NULL);
+  }
+
+When this code is preprocessed on Mac OS X it expands to the following:
+
+.. code-block:: c
+
+  void foo(int *p) {
+    (__builtin_expect(!(p != NULL), 0) ? __assert_rtn(__func__, "t.c", 4, "p != NULL") : (void)0);
+  }
+
+In this example, the assertion handler is ``__assert_rtn``. When called,
+most assertion handlers typically print an error and terminate the program. The
+analyzer can exploit such semantics by ending the analysis of a path once it
+hits a call to an assertion handler.
+
+The trick, however, is that the analyzer needs to know that a called function
+is an assertion handler; otherwise the analyzer might assume the function call
+returns and it will continue analyzing the path where the assertion condition
+failed. This can lead to false positives, as the assertion condition usually
+implies a safety condition (e.g., a pointer is not null) prior to performing
+some action that depends on that condition (e.g., dereferencing a pointer).
+
+The analyzer knows about several well-known assertion handlers, but can
+automatically infer if a function should be treated as an assertion handler if
+it is annotated with the 'noreturn' attribute or the (Clang-specific)
+'analyzer_noreturn' attribute. Note that, currently, clang does not support
+these attributes on Objective-C methods and C++ methods.
+
+Attribute 'noreturn'
+--------------------
+
+The 'noreturn' attribute is a GCC attribute that can be placed on the
+declarations of functions. It means exactly what its name implies: a function
+with a 'noreturn' attribute should never return.
+
+Specific details of the syntax of using the 'noreturn' attribute can be found
+in `GCC's documentation `__.
+
+Not only does the analyzer exploit this information when pruning false paths,
+but the compiler also takes it seriously and will generate different code (and
+possibly better optimized) under the assumption that the function does not
+return.
+
+**Example**
+
+On Mac OS X, the function prototype for ``__assert_rtn`` (declared in
+``assert.h``) is specifically annotated with the 'noreturn' attribute:
+
+.. code-block:: c
+
+  void __assert_rtn(const char *, const char *, int, const char *) __attribute__((__noreturn__));
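+
+For illustration, a hypothetical custom handler of this form lets the analyzer
+prune the failure path (the names below are invented for this sketch, not part
+of any real API):
+
+.. code-block:: cpp
+
+  // Because of 'noreturn', the analyzer stops exploring any path that
+  // reaches my_fatal_error, so the dereference below is only analyzed
+  // on the path where p is non-null.
+  __attribute__((noreturn)) void my_fatal_error(const char *msg);
+
+  void use(int *p) {
+    if (!p)
+      my_fatal_error("unexpected null pointer");
+    *p = 42; // no null-dereference warning is emitted on this path
+  }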
+
+Attribute 'analyzer_noreturn' (Clang-specific)
+----------------------------------------------
+
+The Clang-specific 'analyzer_noreturn' attribute is almost identical to
+'noreturn' except that it is ignored by the compiler for the purposes of code
+generation.
+
+This attribute is useful for annotating assertion handlers that actually
+*can* return, but for the purpose of using the analyzer we want to
+pretend that such functions do not return.
+
+Because this attribute is Clang-specific, its use should be guarded with
+preprocessor macros.
+
+**Example**
+
+.. code-block:: c
+
+  #ifndef CLANG_ANALYZER_NORETURN
+  #if __has_feature(attribute_analyzer_noreturn)
+  #define CLANG_ANALYZER_NORETURN __attribute__((analyzer_noreturn))
+  #else
+  #define CLANG_ANALYZER_NORETURN
+  #endif
+  #endif
+
+  void my_assert_rtn(const char *, const char *, int, const char *) CLANG_ANALYZER_NORETURN;
+
+Mac OS X API Annotations
+________________________
+
+.. _cocoa_mem:
+
+Cocoa & Core Foundation Memory Management Annotations
+#####################################################
+
+The analyzer supports the proper management of retain counts for
+both Cocoa and Core Foundation objects. This checking is largely based on
+enforcing Cocoa and Core Foundation naming conventions for Objective-C methods
+(Cocoa) and C functions (Core Foundation). Not strictly following these
+conventions can cause the analyzer to miss bugs or flag false positives.
+
+One can educate the analyzer (and others who read your code) about methods or
+functions that deviate from the Cocoa and Core Foundation conventions using the
+attributes described here. However, you should consider using proper naming
+conventions or the `objc_method_family `_
+attribute, if applicable.
+
+.. _ns_returns_retained:
+
+Attribute 'ns_returns_retained' (Clang-specific)
+------------------------------------------------
+
+The GCC-style (Clang-specific) attribute 'ns_returns_retained' allows one to
+annotate an Objective-C method or C function as returning a retained Cocoa
+object that the caller is responsible for releasing (via sending a
+``release`` message to the object). The Foundation framework defines a
+macro ``NS_RETURNS_RETAINED`` that is functionally equivalent to the
+one shown below.
+
+**Placing on Objective-C methods**: For Objective-C methods, this
+annotation essentially tells the analyzer to treat the method as if its name
+begins with "alloc" or "new" or contains the word
+"copy".
+
+**Placing on C functions**: For C functions returning Cocoa objects, the
+analyzer typically does not make any assumptions about whether or not the object
+is returned retained. Explicitly adding the 'ns_returns_retained' attribute to C
+functions allows the analyzer to perform extra checking.
+
+**Example**
+
+.. code-block:: objc
+
+  #import <Foundation/Foundation.h>
+
+  #ifndef __has_feature      // Optional.
+  #define __has_feature(x) 0 // Compatibility with non-clang compilers.
+  #endif
+
+  #ifndef NS_RETURNS_RETAINED
+  #if __has_feature(attribute_ns_returns_retained)
+  #define NS_RETURNS_RETAINED __attribute__((ns_returns_retained))
+  #else
+  #define NS_RETURNS_RETAINED
+  #endif
+  #endif
+
+  @interface MyClass : NSObject {}
+  - (NSString*) returnsRetained NS_RETURNS_RETAINED;
+  - (NSString*) alsoReturnsRetained;
+  @end
+
+  @implementation MyClass
+  - (NSString*) returnsRetained {
+    return [[NSString alloc] initWithCString:"no leak here"];
+  }
+  - (NSString*) alsoReturnsRetained {
+    return [[NSString alloc] initWithCString:"flag a leak"];
+  }
+  @end
+
+Running ``scan-build`` on this source file produces the following output:
+
+.. image:: ../images/example_ns_returns_retained.png
+
+.. _ns_returns_not_retained:
+
+Attribute 'ns_returns_not_retained' (Clang-specific)
+----------------------------------------------------
+
+The 'ns_returns_not_retained' attribute is the complement of
+'`ns_returns_retained`_'. Where a function or method may appear to obey the
+Cocoa conventions and return a retained Cocoa object, this attribute can be
+used to indicate that the object reference returned should not be considered as
+an "owning" reference being returned to the caller. The Foundation
+framework defines a macro ``NS_RETURNS_NOT_RETAINED`` that is functionally
+equivalent to the one shown below.
+
+Usage is identical to `ns_returns_retained`_. When using the
+attribute, be sure to declare it within the proper macro that checks for
+its availability, as it is not available in earlier versions of the analyzer:
+
+.. code-block:: objc
+
+  #ifndef __has_feature      // Optional.
+  #define __has_feature(x) 0 // Compatibility with non-clang compilers.
+  #endif
+
+  #ifndef NS_RETURNS_NOT_RETAINED
+  #if __has_feature(attribute_ns_returns_not_retained)
+  #define NS_RETURNS_NOT_RETAINED __attribute__((ns_returns_not_retained))
+  #else
+  #define NS_RETURNS_NOT_RETAINED
+  #endif
+  #endif
+
+.. _cf_returns_retained:
+
+Attribute 'cf_returns_retained' (Clang-specific)
+------------------------------------------------
+
+The GCC-style (Clang-specific) attribute 'cf_returns_retained' allows one to
+annotate an Objective-C method or C function as returning a retained Core
+Foundation object that the caller is responsible for releasing. The
+CoreFoundation framework defines a macro ``CF_RETURNS_RETAINED`` that is
+functionally equivalent to the one shown below.
+
+**Placing on Objective-C methods**: With respect to Objective-C methods,
+this attribute is identical in its behavior and usage to 'ns_returns_retained'
+except for the distinction of returning a Core Foundation object instead of a
+Cocoa object.
+
+This distinction is important for the following reason: as Core Foundation is a
+C API, the analyzer cannot always tell that a pointer return value refers to a
+Core Foundation object. In contrast, it is trivial for the analyzer to
+recognize if a pointer refers to a Cocoa object (given the Objective-C type
+system).
+
+**Placing on C functions**: When placing the attribute
+'cf_returns_retained' on the declarations of C functions, the analyzer
+interprets the function as:
+
+1. Returning a Core Foundation Object
+2. Treating the function as if its name contained the keywords
+   "create" or "copy". This means the returned object has a
+   +1 retain count that must be released by the caller, either by sending a
+   ``release`` message (via toll-free bridging to an Objective-C object
+   pointer), or calling ``CFRelease`` or a similar function.
+
+**Example**
+
+.. code-block:: objc
+
+  #import <Cocoa/Cocoa.h>
+
+  #ifndef __has_feature      // Optional.
+  #define __has_feature(x) 0 // Compatibility with non-clang compilers.
+  #endif
+
+  #ifndef CF_RETURNS_RETAINED
+  #if __has_feature(attribute_cf_returns_retained)
+  #define CF_RETURNS_RETAINED __attribute__((cf_returns_retained))
+  #else
+  #define CF_RETURNS_RETAINED
+  #endif
+  #endif
+
+  @interface MyClass : NSObject {}
+  - (NSDate*) returnsCFRetained CF_RETURNS_RETAINED;
+  - (NSDate*) alsoReturnsRetained;
+  - (NSDate*) returnsNSRetained NS_RETURNS_RETAINED;
+  @end
+
+  CF_RETURNS_RETAINED
+  CFDateRef returnsRetainedCFDate()  {
+    return CFDateCreate(0, CFAbsoluteTimeGetCurrent());
+  }
+
+  @implementation MyClass
+  - (NSDate*) returnsCFRetained {
+    return (NSDate*) returnsRetainedCFDate(); // No leak.
+  }
+
+  - (NSDate*) alsoReturnsRetained {
+    return (NSDate*) returnsRetainedCFDate(); // Always report a leak.
+  }
+
+  - (NSDate*) returnsNSRetained {
+    return (NSDate*) returnsRetainedCFDate(); // Report a leak when using GC.
+  }
+  @end
+
+Running ``scan-build`` on this example produces the following output:
+
+.. image:: ../images/example_cf_returns_retained.png
+
+Attribute 'cf_returns_not_retained' (Clang-specific)
+----------------------------------------------------
+
+The 'cf_returns_not_retained' attribute is the complement of
+'`cf_returns_retained`_'. Where a function or method may appear to obey the
+Core Foundation or Cocoa conventions and return a retained Core Foundation
+object, this attribute can be used to indicate that the object reference
+returned should not be considered as an "owning" reference being
+returned to the caller. The CoreFoundation framework defines a macro
+``CF_RETURNS_NOT_RETAINED`` that is functionally equivalent to the one
+shown below.
+
+Usage is identical to cf_returns_retained_. When using the attribute, be sure
+to declare it within the proper macro that checks for its availability, as it
+is not available in earlier versions of the analyzer:
+
+.. code-block:: objc
+
+  #ifndef __has_feature      // Optional.
+  #define __has_feature(x) 0 // Compatibility with non-clang compilers.
+  #endif
+
+  #ifndef CF_RETURNS_NOT_RETAINED
+  #if __has_feature(attribute_cf_returns_not_retained)
+  #define CF_RETURNS_NOT_RETAINED __attribute__((cf_returns_not_retained))
+  #else
+  #define CF_RETURNS_NOT_RETAINED
+  #endif
+  #endif
+
+.. _ns_consumed:
+
+Attribute 'ns_consumed' (Clang-specific)
+----------------------------------------
+
+The 'ns_consumed' attribute can be placed on a specific parameter in either
+the declaration of a function or an Objective-C method. It indicates to the
+static analyzer that a ``release`` message is implicitly sent to the
+parameter upon completion of the call to the given function or method. The
+Foundation framework defines a macro ``NS_RELEASES_ARGUMENT`` that
+is functionally equivalent to the ``NS_CONSUMED`` macro shown below.
+
+**Example**
+
+.. code-block:: objc
+
+  #ifndef __has_feature      // Optional.
+  #define __has_feature(x) 0 // Compatibility with non-clang compilers.
+  #endif
+
+  #ifndef NS_CONSUMED
+  #if __has_feature(attribute_ns_consumed)
+  #define NS_CONSUMED __attribute__((ns_consumed))
+  #else
+  #define NS_CONSUMED
+  #endif
+  #endif
+
+  void consume_ns(id NS_CONSUMED x);
+
+  void test() {
+    id x = [[NSObject alloc] init];
+    consume_ns(x); // No leak!
+  }
+
+  @interface Foo : NSObject
+  + (void) releaseArg:(id) NS_CONSUMED x;
+  + (void) releaseSecondArg:(id)x second:(id) NS_CONSUMED y;
+  @end
+
+  void test_method() {
+    id x = [[NSObject alloc] init];
+    [Foo releaseArg:x]; // No leak!
+  }
+
+  void test_method2() {
+    id a = [[NSObject alloc] init];
+    id b = [[NSObject alloc] init];
+    [Foo releaseSecondArg:a second:b]; // 'a' is leaked, but 'b' is released.
+  }
+
+Attribute 'cf_consumed' (Clang-specific)
+----------------------------------------
+
+The 'cf_consumed' attribute is practically identical to ns_consumed_. The
+attribute can be placed on a specific parameter in either the declaration of a
+function or an Objective-C method. It indicates to the static analyzer that the
+object reference is implicitly passed to a call to ``CFRelease`` upon
+completion of the call to the given function or method. The CoreFoundation
+framework defines a macro ``CF_RELEASES_ARGUMENT`` that is functionally
+equivalent to the ``CF_CONSUMED`` macro shown below.
+
+Operationally this attribute is nearly identical to 'ns_consumed'.
+
+**Example**
+
+.. code-block:: objc
+
+  #ifndef __has_feature      // Optional.
+  #define __has_feature(x) 0 // Compatibility with non-clang compilers.
+  #endif
+
+  #ifndef CF_CONSUMED
+  #if __has_feature(attribute_cf_consumed)
+  #define CF_CONSUMED __attribute__((cf_consumed))
+  #else
+  #define CF_CONSUMED
+  #endif
+  #endif
+
+  void consume_cf(id CF_CONSUMED x);
+  void consume_CFDate(CFDateRef CF_CONSUMED x);
+
+  void test() {
+    id x = [[NSObject alloc] init];
+    consume_cf(x); // No leak!
+  }
+
+  void test2() {
+    CFDateRef date = CFDateCreate(0, CFAbsoluteTimeGetCurrent());
+    consume_CFDate(date); // No leak, including under GC!
+  }
+
+  @interface Foo : NSObject
+  + (void) releaseArg:(CFDateRef) CF_CONSUMED x;
+  @end
+
+  void test_method() {
+    CFDateRef date = CFDateCreate(0, CFAbsoluteTimeGetCurrent());
+    [Foo releaseArg:date]; // No leak!
+  }
+
+.. _ns_consumes_self:
+
+Attribute 'ns_consumes_self' (Clang-specific)
+---------------------------------------------
+
+The 'ns_consumes_self' attribute can be placed only on an Objective-C method
+declaration. It indicates that the receiver of the message is
+"consumed" (a single reference count decremented) after the message
+is sent. This matches the semantics of all "init" methods.
+
+One use of this attribute is to declare your own init-like methods that do not
+follow the standard Cocoa naming conventions.
+
+**Example**
+
+.. code-block:: objc
+
+  #ifndef __has_feature
+  #define __has_feature(x) 0 // Compatibility with non-clang compilers.
+  #endif
+
+  #ifndef NS_CONSUMES_SELF
+  #if __has_feature(attribute_ns_consumes_self)
+  #define NS_CONSUMES_SELF __attribute__((ns_consumes_self))
+  #else
+  #define NS_CONSUMES_SELF
+  #endif
+  #endif
+
+  @interface MyClass : NSObject
+  - initWith:(MyClass *)x;
+  - nonstandardInitWith:(MyClass *)x NS_CONSUMES_SELF NS_RETURNS_RETAINED;
+  @end
+
+In this example, ``-nonstandardInitWith:`` has the same ownership
+semantics as the init method ``-initWith:``. The static analyzer will
+observe that the method consumes the receiver, and then returns an object with
+a +1 retain count.
+
+The Foundation framework defines a macro ``NS_REPLACES_RECEIVER`` which is
+functionally equivalent to the combination of ``NS_CONSUMES_SELF`` and
+``NS_RETURNS_RETAINED`` shown above.
+
+Libkern Memory Management Annotations
+#####################################
+
+`Libkern `_
+requires developers to inherit all heap allocated objects from ``OSObject`` and
+to perform manual reference counting. The reference counting model is very
+similar to MRR (manual retain-release) mode in
+`Objective-C `_
+or to CoreFoundation reference counting.
+Freshly-allocated objects start with a reference count of 1, and calls to
+``retain`` increment it, while calls to ``release`` decrement it. The object is
+deallocated whenever its reference count reaches zero.
+
+Manually incrementing and decrementing reference counts is error-prone:
+over-retains lead to leaks, and over-releases lead to uses-after-free.
+The analyzer can help the programmer to check for unbalanced
+retain/release calls.
+
+The reference count checking is based on the principle of *locality*: it should
+be possible to establish correctness (lack of leaks/uses after free) by looking
+at each function body, and the declarations (not the definitions) of all the
+functions it interacts with.
+
+In order to support such reasoning, it should be possible to *summarize* the
+behavior of each function, with respect to reference count of its returned
+values and attributes.
+
+By default, the following summaries are assumed:
+
+- All functions starting with ``get`` or ``Get``, unless they are returning
+  subclasses of ``OSIterator``, are assumed to be returning at +0. That is, the
+  caller has no reference count *obligations* with respect to the reference
+  count of the returned object and should leave it untouched.
+
+- All other functions are assumed to return at +1. That is, the caller has an
+  *obligation* to release such objects.
+
+- Functions are assumed not to change the reference count of their parameters,
+  including the implicit ``this`` parameter.
+
+These summaries can be overridden with the following
+`attributes `_:
+
+Attribute 'os_returns_retained'
+-------------------------------
+
+The ``os_returns_retained`` attribute (accessed through the macro
+``LIBKERN_RETURNS_RETAINED``) plays a role identical to `ns_returns_retained`_
+for functions returning ``OSObject`` subclasses. The attribute indicates that
+it is a caller's responsibility to release the returned object.
+
+Attribute 'os_returns_not_retained'
+-----------------------------------
+
+The ``os_returns_not_retained`` attribute (accessed through the macro
+``LIBKERN_RETURNS_NOT_RETAINED``) plays a role identical to
+`ns_returns_not_retained`_ for functions returning ``OSObject`` subclasses. The
+attribute indicates that the caller should not change the retain count of the
+returned object.
+
+**Example**
+
+.. code-block:: cpp
+
+  class MyClass {
+    OSObject *f;
+    LIBKERN_RETURNS_NOT_RETAINED OSObject *myFieldGetter();
+  };
+
+  // Note that the annotation only has to be applied to the function declaration.
+  OSObject * MyClass::myFieldGetter() {
+    return f;
+  }
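+
+For contrast, a minimal sketch of the retained case (the factory function
+below is hypothetical, not a real Libkern API):
+
+.. code-block:: cpp
+
+  // The caller owns the returned +1 reference and must release it.
+  LIBKERN_RETURNS_RETAINED OSObject *createDefaultSettings();
+
+  void useSettings() {
+    OSObject *settings = createDefaultSettings();
+    // ... use settings ...
+    settings->release(); // balances the +1 returned by the factory
+  }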
+
+Attribute 'os_consumed'
+-----------------------
+
+Similarly to the `ns_consumed`_ attribute, the ``os_consumed`` attribute
+(accessed through ``LIBKERN_CONSUMED``), applied to a parameter, indicates that
+the call to the function *consumes* the parameter: the callee should either
+release it or store it and release it in the destructor, while the caller
+should assume one is subtracted from the reference count after the call.
+
+.. code-block:: cpp
+
+  IOReturn addToList(LIBKERN_CONSUMED IOPMinformee *newInformee);
+
+Attribute 'os_consumes_this'
+----------------------------
+
+Similarly to `ns_consumes_self`_, the ``os_consumes_this`` attribute indicates
+that the method call *consumes* the implicit ``this`` argument: the caller
+should assume one was subtracted from the reference count of the object after
+the call, and the callee has an obligation to either release the argument, or
+store it and eventually release it in the destructor.
+
+.. code-block:: cpp
+
+  void addThisToList(OSArray *givenList) LIBKERN_CONSUMES_THIS;
+
+Out Parameters
+--------------
+
+A function can also return an object to a caller by means of an out parameter
+(a pointer-to-OSObject-pointer is passed, and a callee writes a pointer to an
+object into an argument). Currently the analyzer does not track unannotated out
+parameters by default, but with annotations we distinguish four separate cases:
+
+**1. Non-retained out parameters**, identified using
+``LIBKERN_RETURNS_NOT_RETAINED`` applied to parameters, e.g.:
+
+.. code-block:: cpp
+
+  void getterViaOutParam(LIBKERN_RETURNS_NOT_RETAINED OSObject **obj)
+
+Such functions write a non-retained object into an out parameter, and the
+caller has no further obligations.
+
+**2. Retained out parameters**, identified using ``LIBKERN_RETURNS_RETAINED``:
+
+.. code-block:: cpp
+
+  void getterViaOutParam(LIBKERN_RETURNS_RETAINED OSObject **obj)
+
+In such cases a retained object is written into an out parameter, which the
+caller then has to release in order to avoid a leak.
+
+These two cases are simple, but in practice functions returning an
+out parameter usually also return a return code, and then an out parameter may
+or may not be written, depending on the return code, e.g.:
+
+.. code-block:: cpp
+
+  bool maybeCreateObject(LIBKERN_RETURNS_RETAINED OSObject **obj);
+
+For such functions, the usual semantics is that an object is written into on
+"success", and not written into on "failure".
+
+For ``LIBKERN_RETURNS_RETAINED`` we assume the following definition of
+success:
+
+- For functions returning ``OSReturn`` or ``IOReturn`` (any typedef to
+  ``kern_return_t``) success is defined as having an output of zero
+  (``kIOReturnSuccess`` is zero).
+
+- For all others, success is non-zero (e.g. non-nullptr for pointers).
+
+**3. Retained out parameters on zero return** The annotation
+``LIBKERN_RETURNS_RETAINED_ON_ZERO`` states that a retained object is written
+into if and only if the function returns a zero value:
+
+.. code-block:: cpp
+
+  bool OSUnserializeXML(void *data, LIBKERN_RETURNS_RETAINED_ON_ZERO OSString **errString);
+
+Then the caller has to release an object if the function has returned zero.
+
+**4. Retained out parameters on non-zero return** Similarly,
+``LIBKERN_RETURNS_RETAINED_ON_NONZERO`` specifies that a retained object is
+written into the parameter if and only if the function has returned a non-zero
+value.
+
+Note that for non-retained out parameters conditionals do not matter, as the
+caller has no obligations regardless of whether an object is written into or
+not.
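+
+For illustration, a hypothetical caller of the ``OSUnserializeXML``
+declaration above (a sketch only; the caller's name is invented):
+
+.. code-block:: cpp
+
+  void parseData(void *data) {
+    OSString *errString = nullptr;
+    if (OSUnserializeXML(data, &errString) == 0) {
+      // Zero return: per LIBKERN_RETURNS_RETAINED_ON_ZERO, a retained
+      // object was written into errString and must be released here.
+      errString->release();
+    }
+    // Non-zero return: errString was not written into, so there is
+    // nothing to release.
+  }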
diff --git a/clang/docs/analyzer/user-docs/FAQ.rst b/clang/docs/analyzer/user-docs/FAQ.rst index af52e99c91d68..e1147916a767c 100644 --- a/clang/docs/analyzer/user-docs/FAQ.rst +++ b/clang/docs/analyzer/user-docs/FAQ.rst @@ -9,7 +9,7 @@ Custom Assertions Q: How do I tell the analyzer that I do not want the bug being reported here since my custom error handler will safely end the execution before the bug is reached? -You can tell the analyzer that this path is unreachable by teaching it about your `custom assertion handlers `_. For example, you can modify the code segment as following: +You can tell the analyzer that this path is unreachable by teaching it about your `custom assertion handlers `__. For example, you can modify the code segment as following: .. code-block:: c @@ -162,7 +162,7 @@ Suppressing Specific Warnings Q: How can I suppress a specific analyzer warning? -When you encounter an analyzer bug/false positive, check if it's one of the issues discussed above or if the analyzer `annotations `_ can resolve the issue by helping the static analyzer understand the code better. Second, please `report it `_ to help us improve user experience. +When you encounter an analyzer bug/false positive, check if it's one of the issues discussed above or if the analyzer `annotations `__ can resolve the issue by helping the static analyzer understand the code better. Second, please `report it `_ to help us improve user experience. Sometimes there's really no "good" way to eliminate the issue. In such cases you can "silence" it directly by annotating the problematic line of code with the help of Clang attribute 'suppress': @@ -192,6 +192,8 @@ Sometimes there's really no "good" way to eliminate the issue. In such cases you return *result; // as well as this leak path } +.. _exclude_code: + Excluding Code from Analysis ---------------------------- diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td index 953ff9a700e51..e10f24e239ece 100644 --- a/clang/include/clang/Basic/AttrDocs.td +++ b/clang/include/clang/Basic/AttrDocs.td @@ -1461,7 +1461,7 @@ Mind that many more checkers are affected by dynamic memory modeling changes to some extent. Further reading for other annotations: -`Source Annotations in the Clang Static Analyzer `_. +`Source Annotations in the Clang Static Analyzer `_. }]; } diff --git a/clang/www/analyzer/annotations.html b/clang/www/analyzer/annotations.html index bf0076e514278..b19d47bce2662 100644 --- a/clang/www/analyzer/annotations.html +++ b/clang/www/analyzer/annotations.html @@ -3,6 +3,8 @@ Source Annotations + + @@ -15,765 +17,11 @@

Source Annotations

+

This page is deprecated and will be removed in release 21.0

+

Its content was migrated to the regular LLVM documentation.

+ -

The Clang frontend supports several source-level annotations in the form of -GCC-style -attributes and pragmas that can help make using the Clang Static Analyzer -more useful. These annotations can both help suppress false positives as well as -enhance the analyzer's ability to find bugs.

- -

This page gives a practical overview of such annotations. For more technical -specifics regarding Clang-specific annotations please see the Clang's list of language -extensions. Details of "standard" GCC attributes (that Clang also -supports) can be found in the GCC -manual, with the majority of the relevant attributes being in the section on -function -attributes.

- -

Note that attributes that are labeled Clang-specific are not -recognized by GCC. Their use can be conditioned using preprocessor macros -(examples included on this page).

- -

Specific Topics

- - - - -

Annotations to Enhance Generic Checks

- - -

Null Pointer Checking

- -

Attribute 'nonnull'

- -

The analyzer recognizes the GCC attribute 'nonnull', which indicates that a -function expects that a given function parameter is not a null pointer. Specific -details of the syntax of using the 'nonnull' attribute can be found in GCC's -documentation.

- -

Both the Clang compiler and GCC will flag warnings for simple cases where a -null pointer is directly being passed to a function with a 'nonnull' parameter -(e.g., as a constant). The analyzer extends this checking by using its deeper -symbolic analysis to track what pointer values are potentially null and then -flag warnings when they are passed in a function call via a 'nonnull' -parameter.

- -

Example

- -
-$ cat test.m
-int bar(int*p, int q, int *r) __attribute__((nonnull(1,3)));
-
-int foo(int *p, int *q) {
-   return !p ? bar(q, 2, p)
-             : bar(p, 2, q);
-}
-
- -

Running scan-build over this source produces the following -output:

- -example attribute nonnull - - -

Mac OS X API Annotations

- - -

Cocoa & Core Foundation Memory Management -Annotations

- - -

The analyzer supports the proper management of retain counts for -both Cocoa and Core Foundation objects. This checking is largely based on -enforcing Cocoa and Core Foundation naming conventions for Objective-C methods -(Cocoa) and C functions (Core Foundation). Not strictly following these -conventions can cause the analyzer to miss bugs or flag false positives.

- -

One can educate the analyzer (and others who read your code) about methods or -functions that deviate from the Cocoa and Core Foundation conventions using the -attributes described here. However, you should consider using proper naming -conventions or the objc_method_family -attribute, if applicable.

- -

Attribute 'ns_returns_retained' -(Clang-specific)

- -

The GCC-style (Clang-specific) attribute 'ns_returns_retained' allows one to -annotate an Objective-C method or C function as returning a retained Cocoa -object that the caller is responsible for releasing (via sending a -release message to the object). The Foundation framework defines a -macro NS_RETURNS_RETAINED that is functionally equivalent to the -one shown below.

- -

Placing on Objective-C methods: For Objective-C methods, this -annotation essentially tells the analyzer to treat the method as if its name -begins with "alloc" or "new" or contains the word -"copy".

- -

Placing on C functions: For C functions returning Cocoa objects, the -analyzer typically does not make any assumptions about whether or not the object -is returned retained. Explicitly adding the 'ns_returns_retained' attribute to C -functions allows the analyzer to perform extra checking.

- -

Example

- -
-$ cat test.m
-#import <Foundation/Foundation.h>
-
-#ifndef __has_feature      // Optional.
-#define __has_feature(x) 0 // Compatibility with non-clang compilers.
-#endif
-
-#ifndef NS_RETURNS_RETAINED
-#if __has_feature(attribute_ns_returns_retained)
-#define NS_RETURNS_RETAINED __attribute__((ns_returns_retained))
-#else
-#define NS_RETURNS_RETAINED
-#endif
-#endif
-
-@interface MyClass : NSObject {}
-- (NSString*) returnsRetained NS_RETURNS_RETAINED;
-- (NSString*) alsoReturnsRetained;
-@end
-
-@implementation MyClass
-- (NSString*) returnsRetained {
-  return [[NSString alloc] initWithCString:"no leak here"];
-}
-- (NSString*) alsoReturnsRetained {
-  return [[NSString alloc] initWithCString:"flag a leak"];
-}
-@end
-
- -

Running scan-build on this source file produces the following output:

- -example returns retained - -

Attribute 'ns_returns_not_retained' -(Clang-specific)

- -

The 'ns_returns_not_retained' attribute is the complement of 'ns_returns_retained'. Where a function or -method may appear to obey the Cocoa conventions and return a retained Cocoa -object, this attribute can be used to indicate that the object reference -returned should not be considered as an "owning" reference being -returned to the caller. The Foundation framework defines a -macro NS_RETURNS_NOT_RETAINED that is functionally equivalent to -the one shown below.

- -

Usage is identical to ns_returns_retained. When using the -attribute, be sure to declare it within the proper macro that checks for -its availability, as it is not available in earlier versions of the analyzer:

- -
-$ cat test.m
-#ifndef __has_feature      // Optional.
-#define __has_feature(x) 0 // Compatibility with non-clang compilers.
-#endif
-
-#ifndef NS_RETURNS_NOT_RETAINED
-#if __has_feature(attribute_ns_returns_not_retained)
-#define NS_RETURNS_NOT_RETAINED __attribute__((ns_returns_not_retained))
-#else
-#define NS_RETURNS_NOT_RETAINED
-#endif
-#endif
-
- -

Attribute 'cf_returns_retained' -(Clang-specific)

- -

The GCC-style (Clang-specific) attribute 'cf_returns_retained' allows one to -annotate an Objective-C method or C function as returning a retained Core -Foundation object that the caller is responsible for releasing. The -CoreFoundation framework defines a macro CF_RETURNS_RETAINED -that is functionally equivalent to the one shown below.

- -

Placing on Objective-C methods: With respect to Objective-C methods., -this attribute is identical in its behavior and usage to 'ns_returns_retained' -except for the distinction of returning a Core Foundation object instead of a -Cocoa object. - -This distinction is important for the following reason: -as Core Foundation is a C API, -the analyzer cannot always tell that a pointer return value refers to a -Core Foundation object. -In contrast, it is -trivial for the analyzer to recognize if a pointer refers to a Cocoa object -(given the Objective-C type system). - -

Placing on C functions: When placing the attribute -'cf_returns_retained' on the declarations of C functions, the analyzer -interprets the function as:

- -
    -
  1. Returning a Core Foundation Object
  2. -
  3. Treating the function as if it its name -contained the keywords "create" or "copy". This means the -returned object as a +1 retain count that must be released by the caller, either -by sending a release message (via toll-free bridging to an Objective-C -object pointer), or calling CFRelease or a similar function.
  4. -
- -

Example

- -
-$ cat test.m
-$ cat test.m
-#import <Cocoa/Cocoa.h>
-
-#ifndef __has_feature      // Optional.
-#define __has_feature(x) 0 // Compatibility with non-clang compilers.
-#endif
-
-#ifndef CF_RETURNS_RETAINED
-#if __has_feature(attribute_cf_returns_retained)
-#define CF_RETURNS_RETAINED __attribute__((cf_returns_retained))
-#else
-#define CF_RETURNS_RETAINED
-#endif
-#endif
-
-@interface MyClass : NSObject {}
-- (NSDate*) returnsCFRetained CF_RETURNS_RETAINED;
-- (NSDate*) alsoReturnsRetained;
-- (NSDate*) returnsNSRetained NS_RETURNS_RETAINED;
-@end
-
-CF_RETURNS_RETAINED
-CFDateRef returnsRetainedCFDate()  {
-  return CFDateCreate(0, CFAbsoluteTimeGetCurrent());
-}
-
-@implementation MyClass
-- (NSDate*) returnsCFRetained {
-  return (NSDate*) returnsRetainedCFDate(); // No leak.
-}
-
-- (NSDate*) alsoReturnsRetained {
-  return (NSDate*) returnsRetainedCFDate(); // Always report a leak.
-}
-
-- (NSDate*) returnsNSRetained {
-  return (NSDate*) returnsRetainedCFDate(); // Report a leak when using GC.
-}
-@end
-
- -

Running scan-build on this example produces the following output:

- -example returns retained - -

Attribute 'cf_returns_not_retained' -(Clang-specific)

- -

The 'cf_returns_not_retained' attribute is the complement of 'cf_returns_retained'. Where a function or -method may appear to obey the Core Foundation or Cocoa conventions and return -a retained Core Foundation object, this attribute can be used to indicate that -the object reference returned should not be considered as an -"owning" reference being returned to the caller. The -CoreFoundation framework defines a macro CF_RETURNS_NOT_RETAINED -that is functionally equivalent to the one shown below.

- -

Usage is identical to cf_returns_retained. When using the -attribute, be sure to declare it within the proper macro that checks for -its availability, as it is not available in earlier versions of the analyzer:

- -
-$ cat test.m
-#ifndef __has_feature      // Optional.
-#define __has_feature(x) 0 // Compatibility with non-clang compilers.
-#endif
-
-#ifndef CF_RETURNS_NOT_RETAINED
-#if __has_feature(attribute_cf_returns_not_retained)
-#define CF_RETURNS_NOT_RETAINED __attribute__((cf_returns_not_retained))
-#else
-#define CF_RETURNS_NOT_RETAINED
-#endif
-#endif
-
- -

Attribute 'ns_consumed' -(Clang-specific)

- -

The 'ns_consumed' attribute can be placed on a specific parameter in either -the declaration of a function or an Objective-C method. It indicates to the -static analyzer that a release message is implicitly sent to the -parameter upon completion of the call to the given function or method. The -Foundation framework defines a macro NS_RELEASES_ARGUMENT that -is functionally equivalent to the NS_CONSUMED macro shown below.

- -

Example

- -
-$ cat test.m
-#ifndef __has_feature      // Optional.
-#define __has_feature(x) 0 // Compatibility with non-clang compilers.
-#endif
-
-#ifndef NS_CONSUMED
-#if __has_feature(attribute_ns_consumed)
-#define NS_CONSUMED __attribute__((ns_consumed))
-#else
-#define NS_CONSUMED
-#endif
-#endif
-
-void consume_ns(id NS_CONSUMED x);
-
-void test() {
-  id x = [[NSObject alloc] init];
-  consume_ns(x); // No leak!
-}
-
-@interface Foo : NSObject
-+ (void) releaseArg:(id) NS_CONSUMED x;
-+ (void) releaseSecondArg:(id)x second:(id) NS_CONSUMED y;
-@end
-
-void test_method() {
-  id x = [[NSObject alloc] init];
-  [Foo releaseArg:x]; // No leak!
-}
-
-void test_method2() {
-  id a = [[NSObject alloc] init];
-  id b = [[NSObject alloc] init];
-  [Foo releaseSecondArg:a second:b]; // 'a' is leaked, but 'b' is released.
-}
-
- -

Attribute 'cf_consumed' -(Clang-specific)

- -

The 'cf_consumed' attribute is practically identical to ns_consumed. The attribute can be placed on a -specific parameter in either the declaration of a function or an Objective-C -method. It indicates to the static analyzer that the object reference is -implicitly passed to a call to CFRelease upon completion of the call -to the given function or method. The CoreFoundation framework defines a macro -CF_RELEASES_ARGUMENT that is functionally equivalent to the -CF_CONSUMED macro shown below.

- -

Operationally this attribute is nearly identical to 'ns_consumed'.

- -

Example

- -
-$ cat test.m
-#ifndef __has_feature      // Optional.
-#define __has_feature(x) 0 // Compatibility with non-clang compilers.
-#endif
-
-#ifndef CF_CONSUMED
-#if __has_feature(attribute_cf_consumed)
-#define CF_CONSUMED __attribute__((cf_consumed))
-#else
-#define CF_CONSUMED
-#endif
-#endif
-
-void consume_cf(id CF_CONSUMED x);
-void consume_CFDate(CFDateRef CF_CONSUMED x);
-
-void test() {
-  id x = [[NSObject alloc] init];
-  consume_cf(x); // No leak!
-}
-
-void test2() {
-  CFDateRef date = CFDateCreate(0, CFAbsoluteTimeGetCurrent());
-  consume_CFDate(date); // No leak, including under GC!
-
-}
-
-@interface Foo : NSObject
-+ (void) releaseArg:(CFDateRef) CF_CONSUMED x;
-@end
-
-void test_method() {
-  CFDateRef date = CFDateCreate(0, CFAbsoluteTimeGetCurrent());
-  [Foo releaseArg:date]; // No leak!
-}
-
- -

Attribute 'ns_consumes_self' -(Clang-specific)

- -

The 'ns_consumes_self' attribute can be placed only on an Objective-C method -declaration. It indicates that the receiver of the message is -"consumed" (a single reference count decremented) after the message -is sent. This matches the semantics of all "init" methods.

- -

One use of this attribute is declare your own init-like methods that do not -follow the standard Cocoa naming conventions.

- -

Example

- -
-#ifndef __has_feature
-#define __has_feature(x) 0 // Compatibility with non-clang compilers.
-#endif
-
-#ifndef NS_CONSUMES_SELF
-#if __has_feature((attribute_ns_consumes_self))
-#define NS_CONSUMES_SELF __attribute__((ns_consumes_self))
-#else
-#define NS_CONSUMES_SELF
-#endif
-#endif
-
-@interface MyClass : NSObject
-- initWith:(MyClass *)x;
-- nonstandardInitWith:(MyClass *)x NS_CONSUMES_SELF NS_RETURNS_RETAINED;
-@end
-
- -

In this example, -nonstandardInitWith: has the same ownership -semantics as the init method -initWith:. The static analyzer will -observe that the method consumes the receiver, and then returns an object with -a +1 retain count.

- -

The Foundation framework defines a macro NS_REPLACES_RECEIVER -which is functionally equivalent to the combination of NS_CONSUMES_SELF -and NS_RETURNS_RETAINED shown above.

- -

Libkern Memory Management Annotations

- -

Libkern -requires developers to inherit all heap allocated objects from OSObject -and to perform manual reference counting. -The reference counting model is very similar to MRR (manual retain-release) mode in -Objective-C -or to CoreFoundation reference counting. -Freshly-allocated objects start with a reference count of 1, -and calls to retain increment it, -while calls to release decrement it. -The object is deallocated whenever its reference count reaches zero.

- -

Manually incrementing and decrementing reference counts is error-prone: -over-retains lead to leaks, and over-releases lead to uses-after-free. -The analyzer can help the programmer to check for unbalanced -retain/release calls.

- -

The reference count checking is based on the principle of -locality: it should be possible to establish correctness -(lack of leaks/uses after free) by looking at each function body, -and the declarations (not the definitions) of all the functions it interacts -with.

- -

In order to support such reasoning, it should be possible to summarize -the behavior of each function, with respect to reference count -of its returned values and attributes.

- -

By default, the following summaries are assumed:

-
    -
  • All functions starting with get or Get, - unless they are returning subclasses of OSIterator, - are assumed to be returning at +0. - That is, the caller has no reference - count obligations with respect to the reference count of the returned object - and should leave it untouched. -
  • - -
  • - All other functions are assumed to return at +1. - That is, the caller has an obligation to release such objects. -
  • - -
  • - Functions are assumed not to change the reference count of their parameters, - including the implicit this parameter. -
  • -
- -

These summaries can be overriden with the following -attributes:

- -

Attribute 'os_returns_retained'

- -

The os_returns_retained attribute (accessed through the macro -LIBKERN_RETURNS_RETAINED) plays a role identical to ns_returns_retained for functions -returning OSObject subclasses. -The attribute indicates that it is a callers responsibility to release the -returned object. -

- - -

Attribute 'os_returns_not_retained'

- -

The os_returns_not_retained attribute (accessed through the macro -LIBKERN_RETURNS_NOT_RETAINED) plays a role identical to ns_returns_not_retained for functions -returning OSObject subclasses. -The attribute indicates that the caller should not change the retain -count of the returned object. -

- -
Example
- -
-class MyClass {
-  OSObject *f;
-  LIBKERN_RETURNS_NOT_RETAINED OSObject *myFieldGetter();
-}
-
-
-// Note that the annotation only has to be applied to the function declaration.
-OSObject * MyClass::myFieldGetter() {
-  return f;
-}
-
- -

Attribute 'os_consumed'

- -

Similarly to ns_consumed attribute, -os_consumed (accessed through LIBKERN_CONSUMED) attribute, -applied to a parameter, -indicates that the call to the function consumes the parameter: -the callee should either release it or store it and release it in the destructor, -while the caller should assume one is subtracted from the reference count -after the call.

- -
-IOReturn addToList(LIBKERN_CONSUMED IOPMinformee *newInformee);
-
- -

Attribute 'os_consumes_this'

- -

Similarly to ns_consumes_self, -the os_consumes_self attribute indicates that the method call -consumes the implicit this argument: the caller -should assume one was subtracted from the reference count of the object -after the call, and the callee has on obligation to either -release the argument, or store it and eventually release it in the -destructor.

- -
-void addThisToList(OSArray *givenList) LIBKERN_CONSUMES_THIS;
-
- -

Out Parameters

- -A function can also return an object to a caller by a means of an out parameter -(a pointer-to-OSObject-pointer is passed, and a callee writes a pointer to an -object into an argument). -Currently the analyzer does not track unannotated out -parameters by default, but with annotations we distinguish four separate cases: - -

1. Non-retained out parameters, identified using - LIBKERN_RETURNS_NOT_RETAINED applied to parameters, e.g.:

- -
-void getterViaOutParam(LIBKERN_RETURNS_NOT_RETAINED OSObject **obj)
-
- -

Such functions write a non-retained object into an out parameter, and the -caller has no further obligations.

- -

2. Retained out parameters, -identified using LIBKERN_RETURNS_RETAINED:

-
-void getterViaOutParam(LIBKERN_RETURNS_NOT_RETAINED OSObject **obj)
-
-

-In such cases a retained object is written into an out parameter, which the caller has then to release in order to avoid a leak. -

- -

These two cases are simple - but in practice a functions returning an out-parameter usually also return a return code, and then an out parameter may or may not be written, which conditionally depends on the exit code, e.g.:

- -
-bool maybeCreateObject(LIBKERN_RETURNS_RETAINED OSObject **obj);
-
- -

For such functions, the usual semantics is that an object is written into on "success", and not written into on "failure".

- -

For LIBKERN_RETURNS_RETAINED we assume the following definition of -success:

- -

For functions returning OSReturn or IOReturn -(any typedef to kern_return_t) success is defined as having an output of zero (kIOReturnSuccess is zero). -For all others, success is non-zero (e.g. non-nullptr for pointers)

- -

3. Retained out parameters on zero return -The annotation LIBKERN_RETURNS_RETAINED_ON_ZERO states -that a retained object is written into if and only if the function returns a zero value:

- -
-bool OSUnserializeXML(void *data, LIBKERN_RETURNS_RETAINED_ON_ZERO OSString **errString);
-
- -

Then the caller has to release an object if the function has returned zero.

- -

4. Retained out parameters on non-zero return -Similarly, LIBKERN_RETURNS_RETAINED_ON_NONZERO specifies that a -retained object is written into the parameter if and only if the function has -returned a non-zero value.

- -
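By analogy with the previous annotation, a hypothetical declaration would be:

bool tryCreateObject(LIBKERN_RETURNS_RETAINED_ON_NONZERO OSObject **obj);

and the caller would have to release *obj only if the function returned true (non-zero).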

Note that for non-retained out parameters such conditions do not matter, as the caller has no obligations regardless of whether an object is written or not.

- - -

Custom Assertion Handlers

- - -

The analyzer exploits code assertions by pruning off paths where the assertion condition is false. The idea is to capture any program invariants specified in the assertion that the developer may know but that are not immediately apparent in the code itself. In this way assertions make implicit assumptions explicit in the code, which not only makes the analyzer more accurate when finding bugs, but can also help others better understand your code. It can also help remove certain kinds of analyzer false positives by pruning off false paths.

- -

In order to exploit assertions, however, the analyzer must understand when it encounters an "assertion handler." Typically assertions are implemented with a macro that checks the assertion condition and, when the check fails, calls an assertion handler. For example, consider the following code fragment:

- -
-void foo(int *p) {
-  assert(p != NULL);
-}
-
- -

When this code is preprocessed on Mac OS X it expands to the following:

- -
-void foo(int *p) {
-  (__builtin_expect(!(p != NULL), 0) ? __assert_rtn(__func__, "t.c", 4, "p != NULL") : (void)0);
-}
-
- -

In this example, the assertion handler is __assert_rtn. When called, an assertion handler typically prints an error and terminates the program. The analyzer can exploit such semantics by ending the analysis of a path once it hits a call to an assertion handler.

- -

The trick, however, is that the analyzer needs to know that a called function is an assertion handler; otherwise it might assume the call returns and continue analyzing the path where the assertion condition failed. This can lead to false positives, as the assertion condition usually establishes a safety condition (e.g., a pointer is not null) prior to performing some action that depends on that condition (e.g., dereferencing a pointer).

- -

The analyzer knows about several well-known assertion handlers, and it can automatically infer that a function should be treated as an assertion handler if it is annotated with the 'noreturn' attribute or the (Clang-specific) 'analyzer_noreturn' attribute. Note that, currently, Clang does not support these attributes on Objective-C and C++ methods.

- -

Attribute 'noreturn'

- -

The 'noreturn' attribute is a GCC attribute that can be placed on the declarations of functions. It means exactly what its name implies: a function with a 'noreturn' attribute should never return.

- -

Specific details of the syntax of using the 'noreturn' attribute can be found in GCC's documentation.

- -

Not only does the analyzer exploit this information when pruning false paths, but the compiler also takes it seriously and will generate different (and possibly better optimized) code under the assumption that the function does not return.

- -

Example

- -

On Mac OS X, the function prototype for __assert_rtn (declared in assert.h) is specifically annotated with the 'noreturn' attribute:

- -
-void __assert_rtn(const char *, const char *, int, const char *) __attribute__((__noreturn__));
-
- -
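A project-specific handler can be annotated in the same way; a hypothetical example:

extern void my_fatal_error(const char *msg) __attribute__((__noreturn__));

void checkedDeref(int *p) {
  if (p == NULL)
    my_fatal_error("p must not be null");
  *p = 0;  // the analyzer prunes the p == NULL path at the call above
}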

Attribute 'analyzer_noreturn' (Clang-specific)

- -

The Clang-specific 'analyzer_noreturn' attribute is almost identical to 'noreturn' except that it is ignored by the compiler for the purposes of code generation.

- -

This attribute is useful for annotating assertion handlers that actually can return, but for the purpose of using the analyzer we want to pretend that such functions do not return.

- -

Because this attribute is Clang-specific, its use should be guarded by preprocessor macros.

- -

Example

-#ifndef CLANG_ANALYZER_NORETURN
-#if __has_feature(attribute_analyzer_noreturn)
-#define CLANG_ANALYZER_NORETURN __attribute__((analyzer_noreturn))
-#else
-#define CLANG_ANALYZER_NORETURN
-#endif
-#endif
-
-void my_assert_rtn(const char *, const char *, int, const char *) CLANG_ANALYZER_NORETURN;
-
- -
- + + From 8f35291be92ef4ac97c93058a209b8f3672f1182 Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Mon, 13 Jan 2025 13:07:19 +0000 Subject: [PATCH 048/102] LAA: add missed swap when inverting src, sink (#122254) When inverting source and sink on a negative induction step, the types of the source and sink should also be swapped. This fixes a bug in the code that follows, that computes properties based on these types. With 234cc40 ([LAA] Limit no-overlap check to at least one loop-invariant accesses.), that code is guarded by a loop-invariant condition: however, the commit did not add any new tests exercising the guarded code, and hence the bugfix in this patch requires additional tests to exercise that guarded codepath. --- llvm/lib/Analysis/LoopAccessAnalysis.cpp | 1 + .../LoopAccessAnalysis/depend_diff_types.ll | 76 +++++++++++++++++++ 2 files changed, 77 insertions(+) diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index 38e9145826c08..2a68979add666 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -1921,6 +1921,7 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize( if (StrideAPtr && *StrideAPtr < 0) { std::swap(Src, Sink); std::swap(AInst, BInst); + std::swap(ATy, BTy); std::swap(StrideAPtr, StrideBPtr); } diff --git a/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll b/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll index 0bdcc35790148..e855578e794fa 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll @@ -194,3 +194,79 @@ loop: exit: ret void } + +; In the following test, the sink is loop-invariant. + +define void @type_size_equivalence_sink_loopinv(ptr nocapture %vec, i64 %n) { +; CHECK-LABEL: 'type_size_equivalence_sink_loopinv' +; CHECK-NEXT: loop: +; CHECK-NEXT: Memory dependences are safe +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + %gep.n = getelementptr inbounds i64, ptr %vec, i64 %n + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + + %gep.iv = getelementptr i64, ptr %vec, i64 %iv + %ld.i64 = load i64, ptr %gep.iv, align 8 + + %ld.i64.i32 = trunc i64 %ld.i64 to i32 + store i32 %ld.i64.i32, ptr %gep.n, align 8 + + %iv.next = add nuw nsw i64 %iv, 1 + %cond = icmp eq i64 %iv.next, %n + br i1 %cond, label %exit, label %loop + +exit: + ret void +} + +; Variant of the above, with a negative induction step and a gep exposing +; type-mismtach. + +define void @type_size_equivalence_sink_loopinv_negind(ptr nocapture %vec, i64 %n) { +; CHECK-LABEL: 'type_size_equivalence_sink_loopinv_negind' +; CHECK-NEXT: loop: +; CHECK-NEXT: Memory dependences are safe +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. 
+; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + %minus.n = sub nsw i64 0, %n + %gep.minus.n = getelementptr inbounds i64, ptr %vec, i64 %minus.n + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + + %minus.iv = sub nsw i64 0, %iv + %gep.minus.iv = getelementptr i64, ptr %vec, i64 %minus.iv + %gep.minus.iv.4 = getelementptr i8, ptr %gep.minus.iv, i64 -4 + %ld.i64 = load i64, ptr %gep.minus.iv.4, align 8 + + %ld.i64.i32 = trunc i64 %ld.i64 to i32 + store i32 %ld.i64.i32, ptr %gep.minus.n, align 8 + + %iv.next = add nuw nsw i64 %iv, 1 + %cond = icmp eq i64 %iv.next, %n + br i1 %cond, label %exit, label %loop + +exit: + ret void +} From ef85aa98fbabc410ebd60ee5f818edfb3a6797c2 Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Mon, 13 Jan 2025 21:15:03 +0800 Subject: [PATCH 049/102] [LV][EVL] Pre-commit test case for fixed-order recurrence with EVL tail folding. (NFC) (#122456) This test case is from SingleSource/UnitTests/Vectorizer/recurrences.test. Pre-commit for #122458 --- ...ce-tail-with-evl-fixed-order-recurrence.ll | 549 ++++++++++++++++++ 1 file changed, 549 insertions(+) create mode 100644 llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll new file mode 100644 index 0000000000000..9f8cf169c0593 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll @@ -0,0 +1,549 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes=loop-vectorize \ +; RUN: -prefer-inloop-reductions \ +; RUN: -force-tail-folding-style=data-with-evl \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mtriple=riscv64 -mattr=+v,+f -S < %s| FileCheck %s --check-prefix=IF-EVL + +; RUN: opt -passes=loop-vectorize \ +; RUN: -prefer-inloop-reductions \ +; RUN: -force-tail-folding-style=none \ +; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \ +; RUN: -mtriple=riscv64 -mattr=+v,+f -S < %s| FileCheck %s --check-prefix=NO-VP + +define void @first_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) { +; IF-EVL-LABEL: define void @first_order_recurrence( +; IF-EVL-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[TC:%.*]]) #[[ATTR0:[0-9]+]] { +; IF-EVL-NEXT: [[ENTRY:.*]]: +; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[TC]] +; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; IF-EVL-NEXT: br i1 [[TMP3]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; IF-EVL: [[VECTOR_PH]]: +; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; IF-EVL-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[TC]], [[TMP6]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 +; IF-EVL-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32() +; IF-EVL-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], 4 +; IF-EVL-NEXT: [[TMP11:%.*]] = sub i32 [[TMP10]], 1 +; 
IF-EVL-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement poison, i32 33, i32 [[TMP11]] +; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]] +; IF-EVL: [[VECTOR_BODY]]: +; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VECTOR_RECUR:%.*]] = phi [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[VP_OP_LOAD:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[TC]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) +; IF-EVL-NEXT: [[TMP13:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP13]] +; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP14]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP15]], splat (i1 true), i32 [[TMP12]]) +; IF-EVL-NEXT: [[TMP16:%.*]] = call @llvm.vector.splice.nxv4i32( [[VECTOR_RECUR]], [[VP_OP_LOAD]], i32 -1) +; IF-EVL-NEXT: [[VP_OP:%.*]] = call @llvm.vp.add.nxv4i32( [[TMP16]], [[VP_OP_LOAD]], splat (i1 true), i32 [[TMP12]]) +; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP13]] +; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP17]], i32 0 +; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_OP]], ptr align 4 [[TMP18]], splat (i1 true), i32 [[TMP12]]) +; IF-EVL-NEXT: [[TMP19:%.*]] = zext i32 [[TMP12]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP19]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] +; IF-EVL-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; IF-EVL: [[MIDDLE_BLOCK]]: +; IF-EVL-NEXT: [[TMP21:%.*]] = call i32 @llvm.vscale.i32() +; IF-EVL-NEXT: [[TMP22:%.*]] = mul i32 [[TMP21]], 4 +; IF-EVL-NEXT: [[TMP23:%.*]] = sub i32 [[TMP22]], 1 +; IF-EVL-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement [[VP_OP_LOAD]], i32 [[TMP23]] +; IF-EVL-NEXT: br i1 true, label %[[FOR_END:.*]], label %[[SCALAR_PH]] +; IF-EVL: [[SCALAR_PH]]: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; IF-EVL-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ] +; IF-EVL-NEXT: br label %[[FOR_BODY:.*]] +; IF-EVL: [[FOR_BODY]]: +; IF-EVL-NEXT: [[INDVARS:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_NEXT:%.*]], %[[FOR_BODY]] ] +; IF-EVL-NEXT: [[FOR1:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[TMP24:%.*]], %[[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[INDVARS]] +; IF-EVL-NEXT: [[TMP24]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[FOR1]], [[TMP24]] +; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[INDVARS]] +; IF-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX2]], align 4 +; IF-EVL-NEXT: [[INDVARS_NEXT]] = add nuw nsw i64 [[INDVARS]], 1 +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_NEXT]], [[TC]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; IF-EVL: [[FOR_END]]: +; IF-EVL-NEXT: ret void +; +; NO-VP-LABEL: define void 
@first_order_recurrence( +; NO-VP-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[TC:%.*]]) #[[ATTR0:[0-9]+]] { +; NO-VP-NEXT: [[ENTRY:.*]]: +; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TC]], [[TMP1]] +; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; NO-VP: [[VECTOR_PH]]: +; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TC]], [[TMP3]] +; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[TC]], [[N_MOD_VF]] +; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; NO-VP-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32() +; NO-VP-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 4 +; NO-VP-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1 +; NO-VP-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement poison, i32 33, i32 [[TMP8]] +; NO-VP-NEXT: br label %[[VECTOR_BODY:.*]] +; NO-VP: [[VECTOR_BODY]]: +; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; NO-VP-NEXT: [[VECTOR_RECUR:%.*]] = phi [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ] +; NO-VP-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0 +; NO-VP-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP9]] +; NO-VP-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP10]], i32 0 +; NO-VP-NEXT: [[WIDE_LOAD]] = load , ptr [[TMP11]], align 4 +; NO-VP-NEXT: [[TMP12:%.*]] = call @llvm.vector.splice.nxv4i32( [[VECTOR_RECUR]], [[WIDE_LOAD]], i32 -1) +; NO-VP-NEXT: [[TMP13:%.*]] = add nsw [[TMP12]], [[WIDE_LOAD]] +; NO-VP-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP9]] +; NO-VP-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP14]], i32 0 +; NO-VP-NEXT: store [[TMP13]], ptr [[TMP15]], align 4 +; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; NO-VP-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; NO-VP: [[MIDDLE_BLOCK]]: +; NO-VP-NEXT: [[TMP17:%.*]] = call i32 @llvm.vscale.i32() +; NO-VP-NEXT: [[TMP18:%.*]] = mul i32 [[TMP17]], 4 +; NO-VP-NEXT: [[TMP19:%.*]] = sub i32 [[TMP18]], 1 +; NO-VP-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement [[WIDE_LOAD]], i32 [[TMP19]] +; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TC]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]] +; NO-VP: [[SCALAR_PH]]: +; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; NO-VP-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ] +; NO-VP-NEXT: br label %[[FOR_BODY:.*]] +; NO-VP: [[FOR_BODY]]: +; NO-VP-NEXT: [[INDVARS:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_NEXT:%.*]], %[[FOR_BODY]] ] +; NO-VP-NEXT: [[FOR1:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[TMP20:%.*]], %[[FOR_BODY]] ] +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[INDVARS]] +; NO-VP-NEXT: [[TMP20]] = load i32, ptr [[ARRAYIDX]], align 4 +; NO-VP-NEXT: [[ADD:%.*]] = add nsw i32 [[FOR1]], [[TMP20]] +; NO-VP-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[INDVARS]] +; NO-VP-NEXT: store i32 [[ADD]], 
ptr [[ARRAYIDX2]], align 4 +; NO-VP-NEXT: [[INDVARS_NEXT]] = add nuw nsw i64 [[INDVARS]], 1 +; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_NEXT]], [[TC]] +; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; NO-VP: [[FOR_END]]: +; NO-VP-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %indvars = phi i64 [ 0, %entry ], [ %indvars.next, %for.body ] + %for1 = phi i32 [ 33, %entry ], [ %0, %for.body ] + %arrayidx = getelementptr inbounds nuw i32, ptr %A, i64 %indvars + %0 = load i32, ptr %arrayidx, align 4 + %add = add nsw i32 %for1, %0 + %arrayidx2 = getelementptr inbounds nuw i32, ptr %B, i64 %indvars + store i32 %add, ptr %arrayidx2, align 4 + %indvars.next = add nuw nsw i64 %indvars, 1 + %exitcond.not = icmp eq i64 %indvars.next, %TC + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0 + +for.end: + ret void +} + +define void @second_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) { +; IF-EVL-LABEL: define void @second_order_recurrence( +; IF-EVL-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[TC:%.*]]) #[[ATTR0]] { +; IF-EVL-NEXT: [[ENTRY:.*]]: +; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[TC]] +; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; IF-EVL-NEXT: br i1 [[TMP3]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; IF-EVL: [[VECTOR_PH]]: +; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; IF-EVL-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[TC]], [[TMP6]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 +; IF-EVL-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32() +; IF-EVL-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], 4 +; IF-EVL-NEXT: [[TMP11:%.*]] = sub i32 [[TMP10]], 1 +; IF-EVL-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement poison, i32 33, i32 [[TMP11]] +; IF-EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.vscale.i32() +; IF-EVL-NEXT: [[TMP13:%.*]] = mul i32 [[TMP12]], 4 +; IF-EVL-NEXT: [[TMP14:%.*]] = sub i32 [[TMP13]], 1 +; IF-EVL-NEXT: [[VECTOR_RECUR_INIT1:%.*]] = insertelement poison, i32 22, i32 [[TMP14]] +; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]] +; IF-EVL: [[VECTOR_BODY]]: +; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VECTOR_RECUR:%.*]] = phi [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[VP_OP_LOAD:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VECTOR_RECUR2:%.*]] = phi [ [[VECTOR_RECUR_INIT1]], %[[VECTOR_PH]] ], [ [[TMP19:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[TC]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP15:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) +; IF-EVL-NEXT: [[TMP16:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP16]] +; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP17]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP18]], splat (i1 true), i32 [[TMP15]]) 
+; IF-EVL-NEXT: [[TMP19]] = call @llvm.vector.splice.nxv4i32( [[VECTOR_RECUR]], [[VP_OP_LOAD]], i32 -1) +; IF-EVL-NEXT: [[TMP20:%.*]] = call @llvm.vector.splice.nxv4i32( [[VECTOR_RECUR2]], [[TMP19]], i32 -1) +; IF-EVL-NEXT: [[VP_OP:%.*]] = call @llvm.vp.add.nxv4i32( [[TMP19]], [[TMP20]], splat (i1 true), i32 [[TMP15]]) +; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP16]] +; IF-EVL-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP21]], i32 0 +; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_OP]], ptr align 4 [[TMP22]], splat (i1 true), i32 [[TMP15]]) +; IF-EVL-NEXT: [[TMP23:%.*]] = zext i32 [[TMP15]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP23]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] +; IF-EVL-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; IF-EVL: [[MIDDLE_BLOCK]]: +; IF-EVL-NEXT: [[TMP25:%.*]] = call i32 @llvm.vscale.i32() +; IF-EVL-NEXT: [[TMP26:%.*]] = mul i32 [[TMP25]], 4 +; IF-EVL-NEXT: [[TMP27:%.*]] = sub i32 [[TMP26]], 1 +; IF-EVL-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement [[VP_OP_LOAD]], i32 [[TMP27]] +; IF-EVL-NEXT: [[TMP28:%.*]] = call i32 @llvm.vscale.i32() +; IF-EVL-NEXT: [[TMP29:%.*]] = mul i32 [[TMP28]], 4 +; IF-EVL-NEXT: [[TMP30:%.*]] = sub i32 [[TMP29]], 1 +; IF-EVL-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement [[TMP19]], i32 [[TMP30]] +; IF-EVL-NEXT: br i1 true, label %[[FOR_END:.*]], label %[[SCALAR_PH]] +; IF-EVL: [[SCALAR_PH]]: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; IF-EVL-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ] +; IF-EVL-NEXT: [[SCALAR_RECUR_INIT4:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT3]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ] +; IF-EVL-NEXT: br label %[[FOR_BODY:.*]] +; IF-EVL: [[FOR_BODY]]: +; IF-EVL-NEXT: [[INDVARS:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_NEXT:%.*]], %[[FOR_BODY]] ] +; IF-EVL-NEXT: [[FOR1:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[TMP31:%.*]], %[[FOR_BODY]] ] +; IF-EVL-NEXT: [[FOR2:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT4]], %[[SCALAR_PH]] ], [ [[FOR1]], %[[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[INDVARS]] +; IF-EVL-NEXT: [[TMP31]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[FOR1]], [[FOR2]] +; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[INDVARS]] +; IF-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX2]], align 4 +; IF-EVL-NEXT: [[INDVARS_NEXT]] = add nuw nsw i64 [[INDVARS]], 1 +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_NEXT]], [[TC]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; IF-EVL: [[FOR_END]]: +; IF-EVL-NEXT: ret void +; +; NO-VP-LABEL: define void @second_order_recurrence( +; NO-VP-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[TC:%.*]]) #[[ATTR0]] { +; NO-VP-NEXT: [[ENTRY:.*]]: +; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TC]], [[TMP1]] +; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; NO-VP: [[VECTOR_PH]]: +; NO-VP-NEXT: 
[[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TC]], [[TMP3]] +; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[TC]], [[N_MOD_VF]] +; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; NO-VP-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32() +; NO-VP-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 4 +; NO-VP-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1 +; NO-VP-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement poison, i32 33, i32 [[TMP8]] +; NO-VP-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32() +; NO-VP-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], 4 +; NO-VP-NEXT: [[TMP11:%.*]] = sub i32 [[TMP10]], 1 +; NO-VP-NEXT: [[VECTOR_RECUR_INIT1:%.*]] = insertelement poison, i32 22, i32 [[TMP11]] +; NO-VP-NEXT: br label %[[VECTOR_BODY:.*]] +; NO-VP: [[VECTOR_BODY]]: +; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; NO-VP-NEXT: [[VECTOR_RECUR:%.*]] = phi [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ] +; NO-VP-NEXT: [[VECTOR_RECUR2:%.*]] = phi [ [[VECTOR_RECUR_INIT1]], %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ] +; NO-VP-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0 +; NO-VP-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP12]] +; NO-VP-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP13]], i32 0 +; NO-VP-NEXT: [[WIDE_LOAD]] = load , ptr [[TMP14]], align 4 +; NO-VP-NEXT: [[TMP15]] = call @llvm.vector.splice.nxv4i32( [[VECTOR_RECUR]], [[WIDE_LOAD]], i32 -1) +; NO-VP-NEXT: [[TMP16:%.*]] = call @llvm.vector.splice.nxv4i32( [[VECTOR_RECUR2]], [[TMP15]], i32 -1) +; NO-VP-NEXT: [[TMP17:%.*]] = add nsw [[TMP15]], [[TMP16]] +; NO-VP-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP12]] +; NO-VP-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP18]], i32 0 +; NO-VP-NEXT: store [[TMP17]], ptr [[TMP19]], align 4 +; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; NO-VP-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; NO-VP: [[MIDDLE_BLOCK]]: +; NO-VP-NEXT: [[TMP21:%.*]] = call i32 @llvm.vscale.i32() +; NO-VP-NEXT: [[TMP22:%.*]] = mul i32 [[TMP21]], 4 +; NO-VP-NEXT: [[TMP23:%.*]] = sub i32 [[TMP22]], 1 +; NO-VP-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement [[WIDE_LOAD]], i32 [[TMP23]] +; NO-VP-NEXT: [[TMP24:%.*]] = call i32 @llvm.vscale.i32() +; NO-VP-NEXT: [[TMP25:%.*]] = mul i32 [[TMP24]], 4 +; NO-VP-NEXT: [[TMP26:%.*]] = sub i32 [[TMP25]], 1 +; NO-VP-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement [[TMP15]], i32 [[TMP26]] +; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TC]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]] +; NO-VP: [[SCALAR_PH]]: +; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; NO-VP-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ] +; NO-VP-NEXT: [[SCALAR_RECUR_INIT4:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT3]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ] +; NO-VP-NEXT: br label %[[FOR_BODY:.*]] +; NO-VP: [[FOR_BODY]]: +; NO-VP-NEXT: [[INDVARS:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_NEXT:%.*]], %[[FOR_BODY]] ] +; NO-VP-NEXT: [[FOR1:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] 
], [ [[TMP27:%.*]], %[[FOR_BODY]] ] +; NO-VP-NEXT: [[FOR2:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT4]], %[[SCALAR_PH]] ], [ [[FOR1]], %[[FOR_BODY]] ] +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[INDVARS]] +; NO-VP-NEXT: [[TMP27]] = load i32, ptr [[ARRAYIDX]], align 4 +; NO-VP-NEXT: [[ADD:%.*]] = add nsw i32 [[FOR1]], [[FOR2]] +; NO-VP-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[INDVARS]] +; NO-VP-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX2]], align 4 +; NO-VP-NEXT: [[INDVARS_NEXT]] = add nuw nsw i64 [[INDVARS]], 1 +; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_NEXT]], [[TC]] +; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; NO-VP: [[FOR_END]]: +; NO-VP-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %indvars = phi i64 [ 0, %entry ], [ %indvars.next, %for.body ] + %for1 = phi i32 [ 33, %entry ], [ %0, %for.body ] + %for2 = phi i32 [ 22, %entry ], [ %for1, %for.body ] + %arrayidx = getelementptr inbounds nuw i32, ptr %A, i64 %indvars + %0 = load i32, ptr %arrayidx, align 4 + %add = add nsw i32 %for1, %for2 + %arrayidx2 = getelementptr inbounds nuw i32, ptr %B, i64 %indvars + store i32 %add, ptr %arrayidx2, align 4 + %indvars.next = add nuw nsw i64 %indvars, 1 + %exitcond.not = icmp eq i64 %indvars.next, %TC + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0 + +for.end: + ret void +} + +define void @third_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) { +; IF-EVL-LABEL: define void @third_order_recurrence( +; IF-EVL-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[TC:%.*]]) #[[ATTR0]] { +; IF-EVL-NEXT: [[ENTRY:.*]]: +; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[TC]] +; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; IF-EVL-NEXT: br i1 [[TMP3]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; IF-EVL: [[VECTOR_PH]]: +; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; IF-EVL-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[TC]], [[TMP6]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 +; IF-EVL-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32() +; IF-EVL-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], 4 +; IF-EVL-NEXT: [[TMP11:%.*]] = sub i32 [[TMP10]], 1 +; IF-EVL-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement poison, i32 33, i32 [[TMP11]] +; IF-EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.vscale.i32() +; IF-EVL-NEXT: [[TMP13:%.*]] = mul i32 [[TMP12]], 4 +; IF-EVL-NEXT: [[TMP14:%.*]] = sub i32 [[TMP13]], 1 +; IF-EVL-NEXT: [[VECTOR_RECUR_INIT1:%.*]] = insertelement poison, i32 22, i32 [[TMP14]] +; IF-EVL-NEXT: [[TMP15:%.*]] = call i32 @llvm.vscale.i32() +; IF-EVL-NEXT: [[TMP16:%.*]] = mul i32 [[TMP15]], 4 +; IF-EVL-NEXT: [[TMP17:%.*]] = sub i32 [[TMP16]], 1 +; IF-EVL-NEXT: [[VECTOR_RECUR_INIT3:%.*]] = insertelement poison, i32 11, i32 [[TMP17]] +; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]] +; IF-EVL: [[VECTOR_BODY]]: +; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ 
[[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VECTOR_RECUR:%.*]] = phi [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[VP_OP_LOAD:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VECTOR_RECUR2:%.*]] = phi [ [[VECTOR_RECUR_INIT1]], %[[VECTOR_PH]] ], [ [[TMP22:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[VECTOR_RECUR4:%.*]] = phi [ [[VECTOR_RECUR_INIT3]], %[[VECTOR_PH]] ], [ [[TMP23:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[TC]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[TMP18:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) +; IF-EVL-NEXT: [[TMP19:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP19]] +; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP20]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP21]], splat (i1 true), i32 [[TMP18]]) +; IF-EVL-NEXT: [[TMP22]] = call @llvm.vector.splice.nxv4i32( [[VECTOR_RECUR]], [[VP_OP_LOAD]], i32 -1) +; IF-EVL-NEXT: [[TMP23]] = call @llvm.vector.splice.nxv4i32( [[VECTOR_RECUR2]], [[TMP22]], i32 -1) +; IF-EVL-NEXT: [[TMP24:%.*]] = call @llvm.vector.splice.nxv4i32( [[VECTOR_RECUR4]], [[TMP23]], i32 -1) +; IF-EVL-NEXT: [[VP_OP:%.*]] = call @llvm.vp.add.nxv4i32( [[TMP23]], [[TMP24]], splat (i1 true), i32 [[TMP18]]) +; IF-EVL-NEXT: [[VP_OP5:%.*]] = call @llvm.vp.add.nxv4i32( [[VP_OP]], [[TMP22]], splat (i1 true), i32 [[TMP18]]) +; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP19]] +; IF-EVL-NEXT: [[TMP26:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP25]], i32 0 +; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_OP5]], ptr align 4 [[TMP26]], splat (i1 true), i32 [[TMP18]]) +; IF-EVL-NEXT: [[TMP27:%.*]] = zext i32 [[TMP18]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP27]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] +; IF-EVL-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; IF-EVL: [[MIDDLE_BLOCK]]: +; IF-EVL-NEXT: [[TMP29:%.*]] = call i32 @llvm.vscale.i32() +; IF-EVL-NEXT: [[TMP30:%.*]] = mul i32 [[TMP29]], 4 +; IF-EVL-NEXT: [[TMP31:%.*]] = sub i32 [[TMP30]], 1 +; IF-EVL-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement [[VP_OP_LOAD]], i32 [[TMP31]] +; IF-EVL-NEXT: [[TMP32:%.*]] = call i32 @llvm.vscale.i32() +; IF-EVL-NEXT: [[TMP33:%.*]] = mul i32 [[TMP32]], 4 +; IF-EVL-NEXT: [[TMP34:%.*]] = sub i32 [[TMP33]], 1 +; IF-EVL-NEXT: [[VECTOR_RECUR_EXTRACT6:%.*]] = extractelement [[TMP22]], i32 [[TMP34]] +; IF-EVL-NEXT: [[TMP35:%.*]] = call i32 @llvm.vscale.i32() +; IF-EVL-NEXT: [[TMP36:%.*]] = mul i32 [[TMP35]], 4 +; IF-EVL-NEXT: [[TMP37:%.*]] = sub i32 [[TMP36]], 1 +; IF-EVL-NEXT: [[VECTOR_RECUR_EXTRACT7:%.*]] = extractelement [[TMP23]], i32 [[TMP37]] +; IF-EVL-NEXT: br i1 true, label %[[FOR_END:.*]], label %[[SCALAR_PH]] +; IF-EVL: [[SCALAR_PH]]: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; IF-EVL-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ] +; IF-EVL-NEXT: [[SCALAR_RECUR_INIT8:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT6]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ] +; IF-EVL-NEXT: [[SCALAR_RECUR_INIT9:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT7]], %[[MIDDLE_BLOCK]] ], [ 11, %[[ENTRY]] ] +; IF-EVL-NEXT: br label %[[FOR_BODY:.*]] +; 
IF-EVL: [[FOR_BODY]]: +; IF-EVL-NEXT: [[INDVARS:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_NEXT:%.*]], %[[FOR_BODY]] ] +; IF-EVL-NEXT: [[FOR1:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[TMP38:%.*]], %[[FOR_BODY]] ] +; IF-EVL-NEXT: [[FOR2:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT8]], %[[SCALAR_PH]] ], [ [[FOR1]], %[[FOR_BODY]] ] +; IF-EVL-NEXT: [[FOR3:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT9]], %[[SCALAR_PH]] ], [ [[FOR2]], %[[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[INDVARS]] +; IF-EVL-NEXT: [[TMP38]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[FOR2]], [[FOR3]] +; IF-EVL-NEXT: [[ADD1:%.*]] = add i32 [[ADD]], [[FOR1]] +; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[INDVARS]] +; IF-EVL-NEXT: store i32 [[ADD1]], ptr [[ARRAYIDX2]], align 4 +; IF-EVL-NEXT: [[INDVARS_NEXT]] = add nuw nsw i64 [[INDVARS]], 1 +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_NEXT]], [[TC]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; IF-EVL: [[FOR_END]]: +; IF-EVL-NEXT: ret void +; +; NO-VP-LABEL: define void @third_order_recurrence( +; NO-VP-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[TC:%.*]]) #[[ATTR0]] { +; NO-VP-NEXT: [[ENTRY:.*]]: +; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TC]], [[TMP1]] +; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; NO-VP: [[VECTOR_PH]]: +; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TC]], [[TMP3]] +; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[TC]], [[N_MOD_VF]] +; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; NO-VP-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32() +; NO-VP-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 4 +; NO-VP-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1 +; NO-VP-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement poison, i32 33, i32 [[TMP8]] +; NO-VP-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32() +; NO-VP-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], 4 +; NO-VP-NEXT: [[TMP11:%.*]] = sub i32 [[TMP10]], 1 +; NO-VP-NEXT: [[VECTOR_RECUR_INIT1:%.*]] = insertelement poison, i32 22, i32 [[TMP11]] +; NO-VP-NEXT: [[TMP12:%.*]] = call i32 @llvm.vscale.i32() +; NO-VP-NEXT: [[TMP13:%.*]] = mul i32 [[TMP12]], 4 +; NO-VP-NEXT: [[TMP14:%.*]] = sub i32 [[TMP13]], 1 +; NO-VP-NEXT: [[VECTOR_RECUR_INIT3:%.*]] = insertelement poison, i32 11, i32 [[TMP14]] +; NO-VP-NEXT: br label %[[VECTOR_BODY:.*]] +; NO-VP: [[VECTOR_BODY]]: +; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; NO-VP-NEXT: [[VECTOR_RECUR:%.*]] = phi [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ] +; NO-VP-NEXT: [[VECTOR_RECUR2:%.*]] = phi [ [[VECTOR_RECUR_INIT1]], %[[VECTOR_PH]] ], [ [[TMP18:%.*]], %[[VECTOR_BODY]] ] +; NO-VP-NEXT: [[VECTOR_RECUR4:%.*]] = phi [ [[VECTOR_RECUR_INIT3]], %[[VECTOR_PH]] ], [ [[TMP19:%.*]], %[[VECTOR_BODY]] ] +; NO-VP-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 0 +; NO-VP-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP15]] +; NO-VP-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP16]], i32 0 +; NO-VP-NEXT: 
[[WIDE_LOAD]] = load , ptr [[TMP17]], align 4 +; NO-VP-NEXT: [[TMP18]] = call @llvm.vector.splice.nxv4i32( [[VECTOR_RECUR]], [[WIDE_LOAD]], i32 -1) +; NO-VP-NEXT: [[TMP19]] = call @llvm.vector.splice.nxv4i32( [[VECTOR_RECUR2]], [[TMP18]], i32 -1) +; NO-VP-NEXT: [[TMP20:%.*]] = call @llvm.vector.splice.nxv4i32( [[VECTOR_RECUR4]], [[TMP19]], i32 -1) +; NO-VP-NEXT: [[TMP21:%.*]] = add nsw [[TMP19]], [[TMP20]] +; NO-VP-NEXT: [[TMP22:%.*]] = add [[TMP21]], [[TMP18]] +; NO-VP-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP15]] +; NO-VP-NEXT: [[TMP24:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP23]], i32 0 +; NO-VP-NEXT: store [[TMP22]], ptr [[TMP24]], align 4 +; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; NO-VP-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; NO-VP: [[MIDDLE_BLOCK]]: +; NO-VP-NEXT: [[TMP26:%.*]] = call i32 @llvm.vscale.i32() +; NO-VP-NEXT: [[TMP27:%.*]] = mul i32 [[TMP26]], 4 +; NO-VP-NEXT: [[TMP28:%.*]] = sub i32 [[TMP27]], 1 +; NO-VP-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement [[WIDE_LOAD]], i32 [[TMP28]] +; NO-VP-NEXT: [[TMP29:%.*]] = call i32 @llvm.vscale.i32() +; NO-VP-NEXT: [[TMP30:%.*]] = mul i32 [[TMP29]], 4 +; NO-VP-NEXT: [[TMP31:%.*]] = sub i32 [[TMP30]], 1 +; NO-VP-NEXT: [[VECTOR_RECUR_EXTRACT5:%.*]] = extractelement [[TMP18]], i32 [[TMP31]] +; NO-VP-NEXT: [[TMP32:%.*]] = call i32 @llvm.vscale.i32() +; NO-VP-NEXT: [[TMP33:%.*]] = mul i32 [[TMP32]], 4 +; NO-VP-NEXT: [[TMP34:%.*]] = sub i32 [[TMP33]], 1 +; NO-VP-NEXT: [[VECTOR_RECUR_EXTRACT6:%.*]] = extractelement [[TMP19]], i32 [[TMP34]] +; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TC]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]] +; NO-VP: [[SCALAR_PH]]: +; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; NO-VP-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ] +; NO-VP-NEXT: [[SCALAR_RECUR_INIT7:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT5]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ] +; NO-VP-NEXT: [[SCALAR_RECUR_INIT8:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT6]], %[[MIDDLE_BLOCK]] ], [ 11, %[[ENTRY]] ] +; NO-VP-NEXT: br label %[[FOR_BODY:.*]] +; NO-VP: [[FOR_BODY]]: +; NO-VP-NEXT: [[INDVARS:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_NEXT:%.*]], %[[FOR_BODY]] ] +; NO-VP-NEXT: [[FOR1:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[TMP35:%.*]], %[[FOR_BODY]] ] +; NO-VP-NEXT: [[FOR2:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT7]], %[[SCALAR_PH]] ], [ [[FOR1]], %[[FOR_BODY]] ] +; NO-VP-NEXT: [[FOR3:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT8]], %[[SCALAR_PH]] ], [ [[FOR2]], %[[FOR_BODY]] ] +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[INDVARS]] +; NO-VP-NEXT: [[TMP35]] = load i32, ptr [[ARRAYIDX]], align 4 +; NO-VP-NEXT: [[ADD:%.*]] = add nsw i32 [[FOR2]], [[FOR3]] +; NO-VP-NEXT: [[ADD1:%.*]] = add i32 [[ADD]], [[FOR1]] +; NO-VP-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[INDVARS]] +; NO-VP-NEXT: store i32 [[ADD1]], ptr [[ARRAYIDX2]], align 4 +; NO-VP-NEXT: [[INDVARS_NEXT]] = add nuw nsw i64 [[INDVARS]], 1 +; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_NEXT]], [[TC]] +; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; NO-VP: 
[[FOR_END]]: +; NO-VP-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %indvars = phi i64 [ 0, %entry ], [ %indvars.next, %for.body ] + %for1 = phi i32 [ 33, %entry ], [ %0, %for.body ] + %for2 = phi i32 [ 22, %entry ], [ %for1, %for.body ] + %for3 = phi i32 [ 11, %entry ], [ %for2, %for.body ] + %arrayidx = getelementptr inbounds nuw i32, ptr %A, i64 %indvars + %0 = load i32, ptr %arrayidx, align 4 + %add = add nsw i32 %for2, %for3 + %add1 = add i32 %add, %for1 + %arrayidx2 = getelementptr inbounds nuw i32, ptr %B, i64 %indvars + store i32 %add1, ptr %arrayidx2, align 4 + %indvars.next = add nuw nsw i64 %indvars, 1 + %exitcond.not = icmp eq i64 %indvars.next, %TC + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0 + +for.end: + ret void +} + +!0 = distinct !{!0, !1} +!1 = !{!"llvm.loop.vectorize.enable", i1 true} +;. +; IF-EVL: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; IF-EVL: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; IF-EVL: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; IF-EVL: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; IF-EVL: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; IF-EVL: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +; IF-EVL: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; IF-EVL: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} +;. +; NO-VP: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; NO-VP: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; NO-VP: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; NO-VP: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; NO-VP: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; NO-VP: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +; NO-VP: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; NO-VP: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} +;. From 812c8b9d234e7e3bf944b90054b06fa40df3d1d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Don=C3=A1t=20Nagy?= Date: Mon, 13 Jan 2025 14:31:21 +0100 Subject: [PATCH 050/102] [NFC][analyzer][docs] Restore/remove orphaned images (#122481) When commit 61a76f58ebf1 converted the static analyzer FAQ from HTML to RST, it accidentally left out three images (`example_*.png`) that were previously present in that document. This commit re-adds those three images to the FAQ (and moves them to the directory for the image assets of the RST documentation). Moreover commit 093aaca2b0ad _copied_ the file `scan_build_cmd.png` to the RST documentation directory instead of just moving it; so this commit removes its "old" copy which is no longer used (because the old HTML-based documentation file was replaced by a stub that redirects to the RST docs). 
--- .../analyzer/images/example_custom_assert.png | Bin .../analyzer/images/example_null_pointer.png | Bin .../analyzer/images/example_use_assert.png | Bin clang/docs/analyzer/user-docs/FAQ.rst | 6 ++++++ clang/www/analyzer/images/scan_build_cmd.png | Bin 29669 -> 0 bytes 5 files changed, 6 insertions(+) rename clang/{www => docs}/analyzer/images/example_custom_assert.png (100%) rename clang/{www => docs}/analyzer/images/example_null_pointer.png (100%) rename clang/{www => docs}/analyzer/images/example_use_assert.png (100%) delete mode 100644 clang/www/analyzer/images/scan_build_cmd.png diff --git a/clang/www/analyzer/images/example_custom_assert.png b/clang/docs/analyzer/images/example_custom_assert.png similarity index 100% rename from clang/www/analyzer/images/example_custom_assert.png rename to clang/docs/analyzer/images/example_custom_assert.png diff --git a/clang/www/analyzer/images/example_null_pointer.png b/clang/docs/analyzer/images/example_null_pointer.png similarity index 100% rename from clang/www/analyzer/images/example_null_pointer.png rename to clang/docs/analyzer/images/example_null_pointer.png diff --git a/clang/www/analyzer/images/example_use_assert.png b/clang/docs/analyzer/images/example_use_assert.png similarity index 100% rename from clang/www/analyzer/images/example_use_assert.png rename to clang/docs/analyzer/images/example_use_assert.png diff --git a/clang/docs/analyzer/user-docs/FAQ.rst b/clang/docs/analyzer/user-docs/FAQ.rst index e1147916a767c..58eac783efccd 100644 --- a/clang/docs/analyzer/user-docs/FAQ.rst +++ b/clang/docs/analyzer/user-docs/FAQ.rst @@ -9,6 +9,8 @@ Custom Assertions Q: How do I tell the analyzer that I do not want the bug being reported here since my custom error handler will safely end the execution before the bug is reached? +.. image:: ../images/example_custom_assert.png + You can tell the analyzer that this path is unreachable by teaching it about your `custom assertion handlers `__. For example, you can modify the code segment as following: .. code-block:: c @@ -25,6 +27,8 @@ Null Pointer Dereference Q: The analyzer reports a null dereference, but I know that the pointer is never null. How can I tell the analyzer that a pointer can never be null? +.. image:: ../images/example_null_pointer.png + The reason the analyzer often thinks that a pointer can be null is because the preceding code checked compared it against null. If you are absolutely sure that it cannot be null, remove the preceding check and, preferably, add an assertion as well. For example: .. code-block:: c @@ -143,6 +147,8 @@ Ensuring Loop Body Execution Q: The analyzer assumes that a loop body is never entered. How can I tell it that the loop body will be entered at least once? +.. image:: ../images/example_use_assert.png + In cases where you know that a loop will always be entered at least once, you can use assertions to inform the analyzer. For example: .. 
code-block:: c

diff --git a/clang/www/analyzer/images/scan_build_cmd.png b/clang/www/analyzer/images/scan_build_cmd.png
deleted file mode 100644
index 464fd4e129a2088c150577a8fd52cc302bb90ee1..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 29669
[base85-encoded contents of the deleted scan_build_cmd.png omitted]
zytHRizoYZCgh}?=yLaomLygaLuVyH^zu%u^8~Z*gq`yG77xG_2KPI-&Sk*PZp+CBE zn0hRWEi`bx_ZhMxq?-N(>s38OxZ=yw#1vsqCXennY=booi!ya9e`)#79L^_C>M z|A9~*6Zhq|uCLm@U7R@K_ylv zo?)kVMrqv6MUkXm^skU&PI>zD_nD)VomPPC(9!|h*eRo3W%uEF8ueDS-Tu6^1GOS5;p;#A*YPX< zjTmfs62#wvHzygc0wATuHjUgR6H+@82pj4%n_N0lBq;_VN6 z=r5)3dTdF(7R29Zzf1^KB(FqRevAkUW&x$QZH>UY^AJy<%ox=v9x`xK; zLdPUGcOY@8Qpegm4pvqv7CeB0m9fcugY!Xk&ub|Q2GeM0Bs=ek5}k3x@{j_hfxGW; zZ+NYOr!JZB#8uDP`Sq;tn}IuZfa&|U0vjLYC&Cwg$uEUTfgi_G@XfgW_Me#9yz1y` z+1H_#5v^`I!qCVvzmeWRu_n|c6M-CDJ=Gbnz+jXwGh_Uh7&bx%g+||%IU2`CTpYF6 z(O5W|-pd9{XdF(*$BBEj2KO# z+WrXv9#;$HCm5#qT0Ms~j0SsbMuyXso|GaH2IU`eWr7jokabI73XW6eUm_$?rxY9C*R|DPeLuWuiQMpVnws#^=PvVKC{@+ zm@5gLUF&t3YAY>f{zv)?3-*?)dc9|}P(M*m4#nl@(E{>MT^%th_422OeLs^%3TtW5 z670La5J@$DZhxv%4`y81($#A9XYFA2E*m9z-Y66H#$tw;xxexS-I}lvwL4|4mX5}!Tl6>9giJr8yi-|Jl%4fTl-7tL zYL$evr>I1PggVNLc+_#aiLiQ3nI5hAUuJ_OB!b{CiwEgZc8e6$dl~=y4GK={FC*8J zAHsL4gtk60;*-Tl7= zq)=$B!U7RT7NR4|VKjs^7C@>EEZzpS;H@5)yw;-72Wj?OG)f&DW}p_Y5Uw$Jo~&mo z%s;A}4EGmOqkTk(8u2bFq8vzNT8{fjwQJnTnQ{YWkU5ov{3@?}gk{A72~-0$Wd?Gw z8>lWLF4K-$0xrWc&X6L+PkGcRqdaoNq0ezcmS{i0_r{f~D>+hgZ5Te0lpr;>R=6m2 zNib%bPwc1g( zyf4GNqwJa~sP@tyG^gR-QE(ftKu7e4cOyfkTZ4#7%!(p!x!GIyP8D6Mbt)4h=-JaBL}8hplnwr<>^~2RSP+e*x1+)zsqjF$QeIDFaT&l zVc{{k7#Z=#E!F759HIEEubW*RsCx3*)rE(NhszLqkD2Nqxhhy;&3*IOi((Hv;e_;4-W3bW~)ut)Q>>YgY#^~FZ zOebolnBigaj}&^O0O29N<#6@e%l21T0f?_cxAuR%PM0Gl#U58tpzjm%;7)yYmF29@ zo_w+B9_m9=;qHe}N0q?d*Q1Ct;faBEb85{D{~dpXkM-D?m`;%yveV6ri zcCFI>p4jY@tNyyeoLKLOW%()8<|yG1<^PUMR*b{8t;g&lXZ8BRoJxma?$W<)MwXt- zvfj6i8A)i?$EL8=F;9 zy#vYKmCoUs#DCP}e?)M;Sg-H2-axk3`FW0iEo*VvfGo4<>DP0CQQpZeZX{OK)FZH; zTBXF!;gW?-4~7VA81H^_K39V--L?*+WMY&K78oC~hIR$ICXpmYc)VF##kG$CDX28K zjA%CJjYHj&r@;&qVLH?{(9uf zP$M=2?ZME=lG2(q`C^*2Az?I~cw~+t4@Ftj6z0asjlZjll%OLIJ|3%{JX(7;8h`=RH;ay|r!==;L{A zfRCrlnV9(WTG?xCJ0BQ67SXIJp{W+CAk8gw9+%-1nh&ReM|*9USP%`Mi`wsmQ>Pg( ze-Xq4y8kTt%ysHqnmbbEdq$=q9!;;9g-au|S*)nWu5tJ z*j$aRAsVr9d!ROkHm%Gkhj4BoDJ>bX^S5s~`VW>DDH>l zD`k)f55q)zy*E3S9S*RpOM@KAS^bXd2(0Sf$SyWX373_=&Ot&j8QS_POE2Vn!Px@7 z+8I}8J;RAG;q-J{(Qqav$Uc7GWiR#d0Ty}C&a7Is#Baydlz;RRr3R2;1vE^ovYzPn zzJB`!d0cFh5F;UbVamWglozyWu1=a?T^(mOm^2Zb5xU#>@f5JuD8fCugf{04I`0LkVw9`o8RS}#+6G>vTO76d)b4jvqbiL?G_YX>jI;zzpYr9Fuvnu&6VYe{!Zb6BE-^NT zDxSc2&AZReE2q19ti9dCpB^4|h6D%7P!#fgpFstD2BjR8Y#HqmD#`DE?ye8g={NSJ zdfs=ft9;Z)viDNluSuSF>wGO1r>?VA7BINWsLAfDOoDc>?&~Ob+!qDo#(OzYcTY z1#0z9ODQ5vZBV5@q2r+RKC4?j4Vo6~2n`J-yzDDhmgwRX{GNIsC;MNtj}VP=f`D1$ zgQ8-nbnCw=fHlV;mv|0?!nX(9-@{@j-hRt-`0K99Ido2wbF21G&MU)t{qB?45U>Y> zM zYo~as^uRu$>Jc)XEWAX;fg_3^s{Y4lUi|OxsQB1@(R_kFk(w5Nk!dN54iH zbB~rF3#ab28TslUC&FdtA!NjGzxcOiNr!A+Zlu@b=K2SQT6H-omCqG#_kF^U4I03i z^mInNi;XR|Nq|(AFnG+yHkAa4&t(BrDAbmb7&j@iIr$D4UVJguAaYKo2XVhr8)-M1>TwOls`)uf?Z0&sn22UZl-ga^ zt>avsX2Cv777hin52i`MVkADwfJ>7GT#mr$H&l7wqJ#05sydacnRF7(wAE}0#^cIq zOj3$%DkXDm;(V4Em2apAISc+%h7qDA1gD9(J29{R!spmk$5X;(^o&fr>5)yNRU=x$ z>OYIJa~J#^PCz<3UXGuuY^^3Sh0Mnp7+MJ7WNP5l8(ewMXPiO@tswjGMBDBB)i|7% zt^S-EhB&>mc%I~6M}_%%-&qB7HYpuOs6K~aS2M)2djjE~;R(}+;zSg*kZhNR5Was7 zqrOFR=SM@-#cZ3KA=C*&L&g_`$KT^g`Hl(sXd%-b=+p$H0PEFn9cEWE3fEGy&`mkI z=PORU#L16YU@U;>h&|dDtFeke?H;Y02s5(P0OeWEt>O=tv(AS?#Q zSb1HEau^!fI6PyE#{#A@jr=kP^$8W;!avHG<8%5E-H}UH@KCvOKOQ(1B_Y6iM+d~k z1W-v(=>aeN`4ug~#Zzb-Y-W`>Rx9Q6?9UR4oXjK2n^3o6QkR$G5vQX844617*hdiq zuVGX^)}7cC){Fraxmdcq8mwfhNY>g~BH1e3olRt~G)!-2!*~37&cjUqOt_K-%$4Yx zOHmJ-xfL=or!!7lRGKa!l{ZGb0Bw$16qY=|S}uwK&vPV|_6RixhaGG|0WXWBHQW|^ z0ZGJLjmQa?lYgMoPV{K@Y;nAQ-`KpTQtJq$0Kk(0YJ6**w(s*G7ZErNa(#iIcs%Bc zkARD+*mg4$I%8dHS;ZODXknFj@IV^7D~eX17*Q4hnJV6q0;wmJ)iDB z!P9jZ2@vl%MP2^382-}!r!AWiM)Hjp3Zem^DH&dhJ|~N97S6aZ-|>v+b&2(LG^>1c 
zU~3emKuzm>Y}Y^1v;D2?n0C`SY~@sRnZi+9@@^+;|E%~==2Vm$Mi$CLd(ZwwZvFtj{O(l1vWbgqdMd6>=3} zdqo?*pb^EZC<;;40ITH&S2NjOsUz`vS^xus0cho>5$SV{I&PsOuoTp{A z7&ys1Qn35i(4c%5%t$7^k!b88BIN+864rN!$KQ#)Z^}w`J=2H~a3}vt{y(VE7@ab>n-w!4 z_@md1h=}CEynI$i{S-h855j6eI5$K?fDbFUfgClZKhp2Hf-VOBWtL=8eGGmJ5H9(? zaqC-{$yHQZj`EQL<7Fzf&^PFaD^AKT6es=SM|MSYnRGm-(IeXY{t|f+YJUhW0}Lw6 zh>aMs0DjtOAa!({Cz9RmDm+$Ti*WKMs*t17hz#_%S<1n3Pr0a++rp+@rIjO|&NV!m zqeI+2l|-uB4SinI5)~jgii%wLot8Q5*6ZbH+1M(c@P^OSxhR2 zBiu9!WmtjoV*@rQs}YvS+xI?QDgbm&rTm}eDvPq=AV<&F8gSeK3t{8PeW}lj_^H}g zC2P;4Awf(V>9o+d+aqfp!69ghm(oNL#g2FR$MlidTibYvvXoe*RY`5yXg%qGm}-Ru za`tM*OsoG{Lp=iev^oK7)tia52-6u@-9(wcw{#L!wSJQ24UK0-; zp~O&%ie`0ajW6wHL#A zGBou0Gaw6~pm7(q@~Si%3Y7ts=0!vY_CgW%+dKC*B) z#ptQi*YO2ibr4eI;NQim%-vNq{T-H+zphVAVfFCv zIHao9T$3lsmk6^y?V$numaS;OG@X$UoueOgq*l&biLBfi2Oc$WgohqPCX2U9fX8#& z6TrHc?8idOd@7%e7hZu!<9hgz)q2<;e~d~_;2{fCiTK3kS{;AKDFvy{Z2fF1h?U%brXnxa6$yU}IHU|0?TagpPhoNB0%DE$FdgdIykk8bz!} zEP>eModrIh5=Zmufyf$TVnGbiHME&1D4v z7a}UkXWtW%j;7@_E|gXysqYj`L=cgJD7ajj4|IZ89Z`ZDjaD$oj=ZkN&hPM7DD(?WOwQ)6T1K!v#sF|=qAsB2B12(=8t*ZuL&ja!E# z#fCOUv!)XfzcgR!z-CPO+!csDdQ$Tv8{jTMESD8hbyKILLc?#)8Ar$k1dCx{Q;kwx z&?-ZEY?)Et#t>lY^AZz>gx2TS{6l=^juQr%sXO?VHp2RNxo13}mnLbJU38nNS5;d6Bo~Hh}~}$zj5@TpHHQ zG_efz?~yf+eu($sQ@77kiUJtmpOEc(Z z?m+I`@u&Xpyi$BWE~KfIPhz&G8G5ba!5r>2g($xIDNu=A8&e?|mCULFg2}2bGIlYt zCu6$;RW(Ov7oCD5wI13W`r+RAxHE{XRnMqMCh{G0m151MbTr6m*1Q=3#EY+b(a@A_ zq+XUyV?0hpRuN5pL`x-eBpq)bh>M6ofjnWx04-}(jlMvHSl$}MWP`57VY>gwTEI@6bHwM+nojw8O7kF ziw6&kRNOrSa!r=B4{bRHX=LnWx0^uS1c5yoDz5^f-Xj29u!3vGEG=JN7y@(>+5+7s z$}8;PSw3{%2+<^RX0|K416lD*Q^9Znxd!EJoR;YbVB$^hKp0C~#e1hAo;CH4VQ6GT zS(Q6)KMiv^t24#eI|V`DiS9{Tp|dJoLS?A`a=hZ4^4^o%Xwn; zCUv>9k{fS&qB;A4i8G>?8%yQbiN-b_LdMa{9u3+r>qrKS$1=*j=}Qz`{VeZ+Aj9+) zzwWBKtA{?y|3fA-m~nfYJgaN0y39sZ=+)A;;`ztSx1+Ebd(+tO$fAUVIpZdszxt(= zd?yg%sGgG1KjT#O(secaAD>+;AM2cr81wVLMjLp>)StqaR&dMJ7eCS%=Nk&L!4g73 zUw6#)15=$C;Z9!kx>pg5hPTK%+e*DZ=sc>wJu+_LB&3%utgO|HaiLhe#5sk@wqJS39kv8U0p}j2-FN&+wPiSRLPYzw$yci zXF>9H%MEJ1T7ofaKe zVCnZxh6T?4dop%iF{(no?j7zvL*u+hZ3eQ}~hY3?5TFz9d5igBhVU~>gr)-W>S zIUx*P%vnr=hc-3LSF9PgR4u*X38I*v@6F5E^9#QEcRrpL!QI57O=do7ARykTA0t8T zw<31N@zM4n+t7ziXvA&^{=a;z@x$x)E0G9fK$T&Vq8T75l#T3?G9dnoW+MZ+gcHX=*00-R)!rI9kCf#aeqyj?$xj4@aL)_ZhzHg#w77kA1C18@{MX(g!=iBG=& E2aqv@kpKVy From 876773b4f263f28d2f15fd44cd3a964a4722c795 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Mon, 13 Jan 2025 14:33:26 +0100 Subject: [PATCH 051/102] Revert "[MachineCP] Correctly handle register masks and sub-registers (#122472)" This reverts commit e2a071ece58790f8dd4886e998033cab82e906fb. This causes a large compile-time regression. --- llvm/lib/CodeGen/MachineCopyPropagation.cpp | 136 ++++++++---------- .../CodeGen/AArch64/machine-cp-sub-reg.mir | 32 +---- 2 files changed, 57 insertions(+), 111 deletions(-) diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp index d2579e2d1b44c..49ce4b660c3ae 100644 --- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp +++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp @@ -164,91 +164,67 @@ class CopyTracker { Copies.erase(Unit); } - /// Clobber a single register unit, removing it from the tracker's copy maps. - void clobberRegUnit(MCRegUnit Unit, const TargetRegisterInfo &TRI, - const TargetInstrInfo &TII, bool UseCopyInstr) { - auto I = Copies.find(Unit); - if (I != Copies.end()) { - // When we clobber the source of a copy, we need to clobber everything - // it defined. 
-      markRegsUnavailable(I->second.DefRegs, TRI);
-      // When we clobber the destination of a copy, we need to clobber the
-      // whole register it defined.
-      if (MachineInstr *MI = I->second.MI) {
-        std::optional<DestSourcePair> CopyOperands =
-            isCopyInstr(*MI, TII, UseCopyInstr);
-
-        MCRegister Def = CopyOperands->Destination->getReg().asMCReg();
-        MCRegister Src = CopyOperands->Source->getReg().asMCReg();
-
-        markRegsUnavailable(Def, TRI);
-
-        // Since we clobber the destination of a copy, the semantic of Src's
-        // "DefRegs" to contain Def is no longer effectual. We will also need
-        // to remove the record from the copy maps that indicates Src defined
-        // Def. Failing to do so might cause the target to miss some
-        // opportunities to further eliminate redundant copy instructions.
-        // Consider the following sequence during the
-        // ForwardCopyPropagateBlock procedure:
-        // L1: r0 = COPY r9 <- TrackMI
-        // L2: r0 = COPY r8 <- TrackMI (Remove r9 defined r0 from tracker)
-        // L3: use r0 <- Remove L2 from MaybeDeadCopies
-        // L4: early-clobber r9 <- Clobber r9 (L2 is still valid in tracker)
-        // L5: r0 = COPY r8 <- Remove NopCopy
-        for (MCRegUnit SrcUnit : TRI.regunits(Src)) {
-          auto SrcCopy = Copies.find(SrcUnit);
-          if (SrcCopy != Copies.end() && SrcCopy->second.LastSeenUseInCopy) {
-            // If SrcCopy defines multiple values, we only need
-            // to erase the record for Def in DefRegs.
-            for (auto itr = SrcCopy->second.DefRegs.begin();
-                 itr != SrcCopy->second.DefRegs.end(); itr++) {
-              if (*itr == Def) {
-                SrcCopy->second.DefRegs.erase(itr);
-                // If DefReg becomes empty after removal, we can remove the
-                // SrcCopy from the tracker's copy maps. We only remove those
-                // entries solely record the Def is defined by Src. If an
-                // entry also contains the definition record of other Def'
-                // registers, it cannot be cleared.
-                if (SrcCopy->second.DefRegs.empty() && !SrcCopy->second.MI) {
-                  Copies.erase(SrcCopy);
+  /// Clobber a single register, removing it from the tracker's copy maps.
+  void clobberRegister(MCRegister Reg, const TargetRegisterInfo &TRI,
+                       const TargetInstrInfo &TII, bool UseCopyInstr) {
+    for (MCRegUnit Unit : TRI.regunits(Reg)) {
+      auto I = Copies.find(Unit);
+      if (I != Copies.end()) {
+        // When we clobber the source of a copy, we need to clobber everything
+        // it defined.
+        markRegsUnavailable(I->second.DefRegs, TRI);
+        // When we clobber the destination of a copy, we need to clobber the
+        // whole register it defined.
+        if (MachineInstr *MI = I->second.MI) {
+          std::optional<DestSourcePair> CopyOperands =
+              isCopyInstr(*MI, TII, UseCopyInstr);
+
+          MCRegister Def = CopyOperands->Destination->getReg().asMCReg();
+          MCRegister Src = CopyOperands->Source->getReg().asMCReg();
+
+          markRegsUnavailable(Def, TRI);
+
+          // Since we clobber the destination of a copy, the semantic of Src's
+          // "DefRegs" to contain Def is no longer effectual. We will also need
+          // to remove the record from the copy maps that indicates Src defined
+          // Def. Failing to do so might cause the target to miss some
+          // opportunities to further eliminate redundant copy instructions.
+          // Consider the following sequence during the
+          // ForwardCopyPropagateBlock procedure:
+          // L1: r0 = COPY r9 <- TrackMI
+          // L2: r0 = COPY r8 <- TrackMI (Remove r9 defined r0 from tracker)
+          // L3: use r0 <- Remove L2 from MaybeDeadCopies
+          // L4: early-clobber r9 <- Clobber r9 (L2 is still valid in tracker)
+          // L5: r0 = COPY r8 <- Remove NopCopy
+          for (MCRegUnit SrcUnit : TRI.regunits(Src)) {
+            auto SrcCopy = Copies.find(SrcUnit);
+            if (SrcCopy != Copies.end() && SrcCopy->second.LastSeenUseInCopy) {
+              // If SrcCopy defines multiple values, we only need
+              // to erase the record for Def in DefRegs.
+              for (auto itr = SrcCopy->second.DefRegs.begin();
+                   itr != SrcCopy->second.DefRegs.end(); itr++) {
+                if (*itr == Def) {
+                  SrcCopy->second.DefRegs.erase(itr);
+                  // If DefReg becomes empty after removal, we can remove the
+                  // SrcCopy from the tracker's copy maps. We only remove those
+                  // entries solely record the Def is defined by Src. If an
+                  // entry also contains the definition record of other Def'
+                  // registers, it cannot be cleared.
+                  if (SrcCopy->second.DefRegs.empty() && !SrcCopy->second.MI) {
+                    Copies.erase(SrcCopy);
+                  }
+                  break;
                 }
-                break;
               }
             }
           }
         }
+        // Now we can erase the copy.
+        Copies.erase(I);
       }
     }
-
-  /// Clobber a single register, removing it from the tracker's copy maps.
-  void clobberRegister(MCRegister Reg, const TargetRegisterInfo &TRI,
-                       const TargetInstrInfo &TII, bool UseCopyInstr) {
-    for (MCRegUnit Unit : TRI.regunits(Reg)) {
-      clobberRegUnit(Unit, TRI, TII, UseCopyInstr);
-    }
-  }
-
-  /// Clobber all registers which are not preserved by RegMask, removing them
-  /// from the tracker's copy maps.
-  void clobberRegistersExceptMask(const MachineOperand *RegMask,
-                                  const TargetRegisterInfo &TRI,
-                                  const TargetInstrInfo &TII,
-                                  bool UseCopyInstr) {
-    BitVector SafeRegUnits(TRI.getNumRegUnits());
-
-    for (unsigned SafeReg = 0, E = TRI.getNumRegs(); SafeReg < E; ++SafeReg)
-      if (!RegMask->clobbersPhysReg(SafeReg))
-        for (auto SafeUnit : TRI.regunits(SafeReg))
-          SafeRegUnits.set(SafeUnit);
-
-    for (unsigned Unit = 0, E = TRI.getNumRegUnits(); Unit < E; ++Unit)
-      if (!SafeRegUnits.test(Unit))
-        clobberRegUnit(Unit, TRI, TII, UseCopyInstr);
-  }
-
   /// Track copy's src users, and return false if that can't be done.
   /// We can only track if we have a COPY instruction which source is
   /// the same as the Reg.
@@ -984,10 +960,6 @@ void MachineCopyPropagation::ForwardCopyPropagateBlock(MachineBasicBlock &MBB) {
   // a large set of registers. Treat clobbered registers the same way as
   // defined registers.
   if (RegMask) {
-    // Invalidate all entries in the copy map which are not preserved by this
-    // register mask.
-    Tracker.clobberRegistersExceptMask(RegMask, *TRI, *TII, UseCopyInstr);
-
     // Erase any MaybeDeadCopies whose destination register is clobbered.
     for (SmallSetVector<MachineInstr *, 8>::iterator DI =
              MaybeDeadCopies.begin();
@@ -1006,6 +978,10 @@ void MachineCopyPropagation::ForwardCopyPropagateBlock(MachineBasicBlock &MBB) {
       LLVM_DEBUG(dbgs() << "MCP: Removing copy due to regmask clobbering: ";
                  MaybeDead->dump());
 
+      // Make sure we invalidate any entries in the copy maps before erasing
+      // the instruction.
+      Tracker.clobberRegister(Reg, *TRI, *TII, UseCopyInstr);
+
       // erase() will return the next valid iterator pointing to the next
       // element after the erased one.
DI = MaybeDeadCopies.erase(DI); diff --git a/llvm/test/CodeGen/AArch64/machine-cp-sub-reg.mir b/llvm/test/CodeGen/AArch64/machine-cp-sub-reg.mir index e7865569c75bd..5b379c2bd5629 100644 --- a/llvm/test/CodeGen/AArch64/machine-cp-sub-reg.mir +++ b/llvm/test/CodeGen/AArch64/machine-cp-sub-reg.mir @@ -1,16 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 -# RUN: llc -o - %s --run-pass=machine-cp -mcp-use-is-copy-instr -mtriple=arm64-apple-macos | FileCheck %s - ---- | - declare void @foo() - - define void @test() { - unreachable - } - define void @test2() { - unreachable - } -... +# RUN: llc -o - %s --run-pass=machine-cp -mcp-use-is-copy-instr -mtriple=arm64-apple-macos --verify-machineinstrs | FileCheck %s --- name: test @@ -41,22 +30,3 @@ body: | RET undef $lr, implicit $x0 ... ---- -name: test2 -tracksRegLiveness: true -body: | - bb.0: - liveins: $q14, $d29, $x0, $x1 - ; CHECK-LABEL: name: test2 - ; CHECK: liveins: $q14, $d29, $x0, $x1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $d8 = COPY killed renamable $d29 - ; CHECK-NEXT: BL @foo, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp - ; CHECK-NEXT: renamable $b0 = SMAXVv8i8v killed renamable $d8, implicit-def $q0 - ; CHECK-NEXT: RET_ReallyLR implicit $b0 - renamable $q8 = COPY renamable $q14 - renamable $d8 = COPY killed renamable $d29 - BL @foo, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp - renamable $b0 = SMAXVv8i8v killed renamable $d8, implicit-def $q0 - RET_ReallyLR implicit $b0 -... From 351527576d0b66f5a2b62db1c879e33daec9d37c Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Mon, 13 Jan 2025 14:40:25 +0100 Subject: [PATCH 052/102] [IR] Introduce captures attribute (#116990) This introduces the `captures` attribute as described in: https://discourse.llvm.org/t/rfc-improvements-to-capture-tracking/81420 This initial patch only introduces the IR/bitcode support for the attribute and its in-memory representation as `CaptureInfo`. This will be followed by a patch to upgrade and remove the `nocapture` attribute, and then by actual inference/analysis support. Based on the RFC feedback, I've used a syntax similar to the `memory` attribute, though the only "location" that can be specified is `ret`. I've added some pretty extensive documentation to LangRef on the semantics. One non-obvious bit here is that using ptrtoint will not result in a "return-only" capture, even if the ptrtoint result is only used in the return value. Without this requirement we wouldn't be able to continue ordinary capture analysis on the return value. 
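For reference, the accepted syntax looks as follows. This is a quick sketch
based on the forms exercised by the new Assembler tests; @f, @g, and @h are
placeholder names:

  declare void @f(ptr captures(none) %p)
  declare void @g(ptr captures(address_is_null) %p)
  declare ptr @h(ptr captures(ret: address, provenance) %p)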
---
 llvm/docs/LangRef.rst                       | 136 ++++++++++++++++++--
 llvm/include/llvm/AsmParser/LLParser.h      |   1 +
 llvm/include/llvm/AsmParser/LLToken.h       |   6 +
 llvm/include/llvm/Bitcode/LLVMBitCodes.h    |   1 +
 llvm/include/llvm/IR/Attributes.h           |   7 +
 llvm/include/llvm/IR/Attributes.td          |   3 +
 llvm/include/llvm/Support/ModRef.h          | 101 +++++++++++++++
 llvm/lib/AsmParser/LLLexer.cpp              |   4 +
 llvm/lib/AsmParser/LLParser.cpp             |  61 +++++++++
 llvm/lib/Bitcode/Reader/BitcodeReader.cpp   |   4 +
 llvm/lib/Bitcode/Writer/BitcodeWriter.cpp   |   2 +
 llvm/lib/IR/AttributeImpl.h                 |   1 +
 llvm/lib/IR/Attributes.cpp                  |  34 ++++-
 llvm/lib/Support/ModRef.cpp                 |  34 +++++
 llvm/lib/Transforms/Utils/CodeExtractor.cpp |   1 +
 llvm/test/Assembler/captures-errors.ll      |  73 +++++++++++
 llvm/test/Assembler/captures.ll             | 103 +++++++++++++++
 llvm/test/Bitcode/attributes.ll             |   5 +
 llvm/unittests/IR/AttributesTest.cpp        |  13 ++
 19 files changed, 580 insertions(+), 10 deletions(-)
 create mode 100644 llvm/test/Assembler/captures-errors.ll
 create mode 100644 llvm/test/Assembler/captures.ll

diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 33acb5e73d5ff..8cc9036d1b67f 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -1397,6 +1397,42 @@ Currently, only the following parameter attributes are defined:
     function, returning a pointer to allocated storage disjoint from the
     storage for any other object accessible to the caller.
 
+``captures(...)``
+    This attribute restricts the ways in which the callee may capture the
+    pointer. This is not a valid attribute for return values. This attribute
+    applies only to the particular copy of the pointer passed in this argument.
+
+    The arguments of ``captures`` are a list of captured pointer components,
+    which may be ``none``, or a combination of:
+
+    - ``address``: The integral address of the pointer.
+    - ``address_is_null`` (subset of ``address``): Whether the address is null.
+    - ``provenance``: The ability to access the pointer for both read and write
+      after the function returns.
+    - ``read_provenance`` (subset of ``provenance``): The ability to access the
+      pointer only for reads after the function returns.
+
+    Additionally, it is possible to specify that some components are only
+    captured in certain locations. Currently only the return value (``ret``)
+    and other (default) locations are supported.
+
+    The :ref:`pointer capture section <pointercapture>` discusses these
+    semantics in more detail.
+
+    Some examples of how to use the attribute:
+
+    - ``captures(none)``: Pointer not captured.
+    - ``captures(address, provenance)``: Equivalent to omitting the attribute.
+    - ``captures(address)``: Address may be captured, but not provenance.
+    - ``captures(address_is_null)``: Only captures whether the address is null.
+    - ``captures(address, read_provenance)``: Both address and provenance
+      captured, but only for read-only access.
+    - ``captures(ret: address, provenance)``: Pointer captured through return
+      value only.
+    - ``captures(address_is_null, ret: address, provenance)``: The whole pointer
+      is captured through the return value, and additionally whether the pointer
+      is null is captured in some other way.
+
 .. _nocapture:
 
 ``nocapture``
@@ -3339,10 +3375,92 @@ Pointer Capture
 ---------------
 
 Given a function call and a pointer that is passed as an argument or stored in
-the memory before the call, a pointer is *captured* by the call if it makes a
-copy of any part of the pointer that outlives the call.
-To be precise, a pointer is captured if one or more of the following conditions
-hold:
+memory before the call, the call may capture two components of the pointer:
+
+ * The address of the pointer, which is its integral value. This also includes
+   parts of the address or any information about the address, including the
+   fact that it does not equal one specific value. We further distinguish
+   whether only the fact that the address is/isn't null is captured.
+ * The provenance of the pointer, which is the ability to perform memory
+   accesses through the pointer, in the sense of the :ref:`pointer aliasing
+   rules <pointeraliasing>`. We further distinguish whether only read accesses
+   are allowed, or both reads and writes.
+
+For example, the following function captures the address of ``%a``, because
+it is compared to a pointer, leaking information about the identity of the
+pointer:
+
+.. code-block:: llvm
+
+   @glb = global i8 0
+
+   define i1 @f(ptr %a) {
+     %c = icmp eq ptr %a, @glb
+     ret i1 %c
+   }
+
+The function does not capture the provenance of the pointer, because the
+``icmp`` instruction only operates on the pointer address. The following
+function captures both the address and provenance of the pointer, as both
+may be read from ``@glb`` after the function returns:
+
+.. code-block:: llvm
+
+   @glb = global ptr null
+
+   define void @f(ptr %a) {
+     store ptr %a, ptr @glb
+     ret void
+   }
+
+The following function captures *neither* the address nor the provenance of
+the pointer:
+
+.. code-block:: llvm
+
+   define i32 @f(ptr %a) {
+     %v = load i32, ptr %a
+     ret i32 %v
+   }
+
+While address capture includes uses of the address within the body of the
+function, provenance capture refers exclusively to the ability to perform
+accesses *after* the function returns. Memory accesses within the function
+itself are not considered pointer captures.
+
+We can further say that the capture only occurs through a specific location.
+In the following example, the pointer (both address and provenance) is captured
+through the return value only:
+
+.. code-block:: llvm
+
+   define ptr @f(ptr %a) {
+     %gep = getelementptr i8, ptr %a, i64 4
+     ret ptr %gep
+   }
+
+However, we always consider direct inspection of the pointer address
+(e.g. using ``ptrtoint``) to be location-independent. The following example
+is *not* considered a return-only capture, even though the ``ptrtoint``
+ultimately only contributes to the return value:
+
+.. code-block:: llvm
+
+   @lookup = constant [4 x i8] [i8 0, i8 1, i8 2, i8 3]
+
+   define ptr @f(ptr %a) {
+     %a.addr = ptrtoint ptr %a to i64
+     %mask = and i64 %a.addr, 3
+     %gep = getelementptr i8, ptr @lookup, i64 %mask
+     ret ptr %gep
+   }
+
+This definition is chosen to allow capture analysis to continue with the return
+value in the usual fashion.
+
+The following describes possible ways to capture a pointer in more detail,
+where unqualified uses of the word "capture" refer to capturing both address
+and provenance.
 
 1. The call stores any bit of the pointer carrying information into a place,
    and the stored bits can be read from the place by the caller after this
    call
@@ -3381,13 +3499,14 @@ hold:
    @lock = global i1 true
 
    define void @f(ptr %a) {
-     store ptr %a, ptr* @glb
+     store ptr %a, ptr @glb
      store atomic i1 false, ptr @lock release ; %a is captured because another thread can safely read @glb
      store ptr null, ptr @glb
      ret void
    }
 
-3. The call's behavior depends on any bit of the pointer carrying information.
+3. 
The call's behavior depends on any bit of the pointer carrying information + (address capture only). .. code-block:: llvm @@ -3395,7 +3514,7 @@ hold: define void @f(ptr %a) { %c = icmp eq ptr %a, @glb - br i1 %c, label %BB_EXIT, label %BB_CONTINUE ; escapes %a + br i1 %c, label %BB_EXIT, label %BB_CONTINUE ; captures address of %a only BB_EXIT: call void @exit() unreachable @@ -3403,8 +3522,7 @@ hold: ret void } -4. The pointer is used in a volatile access as its address. - +4. The pointer is used as the pointer operand of a volatile access. .. _volatile: diff --git a/llvm/include/llvm/AsmParser/LLParser.h b/llvm/include/llvm/AsmParser/LLParser.h index 8b195b028783f..c01de4a289a69 100644 --- a/llvm/include/llvm/AsmParser/LLParser.h +++ b/llvm/include/llvm/AsmParser/LLParser.h @@ -379,6 +379,7 @@ namespace llvm { bool inAttrGrp, LocTy &BuiltinLoc); bool parseRangeAttr(AttrBuilder &B); bool parseInitializesAttr(AttrBuilder &B); + bool parseCapturesAttr(AttrBuilder &B); bool parseRequiredTypeAttr(AttrBuilder &B, lltok::Kind AttrToken, Attribute::AttrKind AttrKind); diff --git a/llvm/include/llvm/AsmParser/LLToken.h b/llvm/include/llvm/AsmParser/LLToken.h index 178c911120b4c..7b47bc88ddb25 100644 --- a/llvm/include/llvm/AsmParser/LLToken.h +++ b/llvm/include/llvm/AsmParser/LLToken.h @@ -207,6 +207,12 @@ enum Kind { kw_inaccessiblememonly, kw_inaccessiblemem_or_argmemonly, + // Captures attribute: + kw_address, + kw_address_is_null, + kw_provenance, + kw_read_provenance, + // nofpclass attribute: kw_all, kw_nan, diff --git a/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/llvm/include/llvm/Bitcode/LLVMBitCodes.h index 21fd27d9838db..9eb38c3e44829 100644 --- a/llvm/include/llvm/Bitcode/LLVMBitCodes.h +++ b/llvm/include/llvm/Bitcode/LLVMBitCodes.h @@ -788,6 +788,7 @@ enum AttributeKindCodes { ATTR_KIND_NO_EXT = 99, ATTR_KIND_NO_DIVERGENCE_SOURCE = 100, ATTR_KIND_SANITIZE_TYPE = 101, + ATTR_KIND_CAPTURES = 102, }; enum ComdatSelectionKindCodes { diff --git a/llvm/include/llvm/IR/Attributes.h b/llvm/include/llvm/IR/Attributes.h index 2755ced404ddd..7612e553fe32e 100644 --- a/llvm/include/llvm/IR/Attributes.h +++ b/llvm/include/llvm/IR/Attributes.h @@ -284,6 +284,9 @@ class Attribute { /// Returns memory effects. MemoryEffects getMemoryEffects() const; + /// Returns information from captures attribute. + CaptureInfo getCaptureInfo() const; + /// Return the FPClassTest for nofpclass FPClassTest getNoFPClass() const; @@ -436,6 +439,7 @@ class AttributeSet { UWTableKind getUWTableKind() const; AllocFnKind getAllocKind() const; MemoryEffects getMemoryEffects() const; + CaptureInfo getCaptureInfo() const; FPClassTest getNoFPClass() const; std::string getAsString(bool InAttrGrp = false) const; @@ -1260,6 +1264,9 @@ class AttrBuilder { /// Add memory effect attribute. AttrBuilder &addMemoryAttr(MemoryEffects ME); + /// Add captures attribute. + AttrBuilder &addCapturesAttr(CaptureInfo CI); + // Add nofpclass attribute AttrBuilder &addNoFPClassAttr(FPClassTest NoFPClassMask); diff --git a/llvm/include/llvm/IR/Attributes.td b/llvm/include/llvm/IR/Attributes.td index 61955cf883c3f..4396ec4d04c41 100644 --- a/llvm/include/llvm/IR/Attributes.td +++ b/llvm/include/llvm/IR/Attributes.td @@ -183,6 +183,9 @@ def NoCallback : EnumAttr<"nocallback", IntersectAnd, [FnAttr]>; /// Function creates no aliases of pointer. def NoCapture : EnumAttr<"nocapture", IntersectAnd, [ParamAttr]>; +/// Specify how the pointer may be captured. 
+def Captures : IntAttr<"captures", IntersectCustom, [ParamAttr]>; + /// Function is not a source of divergence. def NoDivergenceSource : EnumAttr<"nodivergencesource", IntersectAnd, [FnAttr]>; diff --git a/llvm/include/llvm/Support/ModRef.h b/llvm/include/llvm/Support/ModRef.h index 5a9d80c87ae27..9ecdab71ec8ca 100644 --- a/llvm/include/llvm/Support/ModRef.h +++ b/llvm/include/llvm/Support/ModRef.h @@ -273,6 +273,107 @@ raw_ostream &operator<<(raw_ostream &OS, MemoryEffects RMRB); // Legacy alias. using FunctionModRefBehavior = MemoryEffects; +/// Components of the pointer that may be captured. +enum class CaptureComponents : uint8_t { + None = 0, + AddressIsNull = (1 << 0), + Address = (1 << 1) | AddressIsNull, + ReadProvenance = (1 << 2), + Provenance = (1 << 3) | ReadProvenance, + All = Address | Provenance, + LLVM_MARK_AS_BITMASK_ENUM(Provenance), +}; + +inline bool capturesNothing(CaptureComponents CC) { + return CC == CaptureComponents::None; +} + +inline bool capturesAnything(CaptureComponents CC) { + return CC != CaptureComponents::None; +} + +inline bool capturesAddressIsNullOnly(CaptureComponents CC) { + return (CC & CaptureComponents::Address) == CaptureComponents::AddressIsNull; +} + +inline bool capturesAddress(CaptureComponents CC) { + return (CC & CaptureComponents::Address) != CaptureComponents::None; +} + +inline bool capturesReadProvenanceOnly(CaptureComponents CC) { + return (CC & CaptureComponents::Provenance) == + CaptureComponents::ReadProvenance; +} + +inline bool capturesFullProvenance(CaptureComponents CC) { + return (CC & CaptureComponents::Provenance) == CaptureComponents::Provenance; +} + +raw_ostream &operator<<(raw_ostream &OS, CaptureComponents CC); + +/// Represents which components of the pointer may be captured in which +/// location. This represents the captures(...) attribute in IR. +/// +/// For more information on the precise semantics see LangRef. +class CaptureInfo { + CaptureComponents OtherComponents; + CaptureComponents RetComponents; + +public: + CaptureInfo(CaptureComponents OtherComponents, + CaptureComponents RetComponents) + : OtherComponents(OtherComponents), RetComponents(RetComponents) {} + + CaptureInfo(CaptureComponents Components) + : OtherComponents(Components), RetComponents(Components) {} + + /// Create CaptureInfo that may capture all components of the pointer. + static CaptureInfo all() { return CaptureInfo(CaptureComponents::All); } + + /// Get components potentially captured by the return value. + CaptureComponents getRetComponents() const { return RetComponents; } + + /// Get components potentially captured through locations other than the + /// return value. + CaptureComponents getOtherComponents() const { return OtherComponents; } + + /// Get the potentially captured components of the pointer (regardless of + /// location). + operator CaptureComponents() const { return OtherComponents | RetComponents; } + + bool operator==(CaptureInfo Other) const { + return OtherComponents == Other.OtherComponents && + RetComponents == Other.RetComponents; + } + + bool operator!=(CaptureInfo Other) const { return !(*this == Other); } + + /// Compute union of CaptureInfos. + CaptureInfo operator|(CaptureInfo Other) const { + return CaptureInfo(OtherComponents | Other.OtherComponents, + RetComponents | Other.RetComponents); + } + + /// Compute intersection of CaptureInfos. 
+  CaptureInfo operator&(CaptureInfo Other) const {
+    return CaptureInfo(OtherComponents & Other.OtherComponents,
+                       RetComponents & Other.RetComponents);
+  }
+
+  static CaptureInfo createFromIntValue(uint32_t Data) {
+    return CaptureInfo(CaptureComponents(Data >> 4),
+                       CaptureComponents(Data & 0xf));
+  }
+
+  /// Convert CaptureInfo into an encoded integer value (used by captures
+  /// attribute).
+  uint32_t toIntValue() const {
+    return (uint32_t(OtherComponents) << 4) | uint32_t(RetComponents);
+  }
+};
+
+raw_ostream &operator<<(raw_ostream &OS, CaptureInfo Info);
+
 } // namespace llvm
 
 #endif
diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp
index 1b8e033134f51..5ea507c009bdc 100644
--- a/llvm/lib/AsmParser/LLLexer.cpp
+++ b/llvm/lib/AsmParser/LLLexer.cpp
@@ -704,6 +704,10 @@ lltok::Kind LLLexer::LexIdentifier() {
   KEYWORD(argmemonly);
   KEYWORD(inaccessiblememonly);
   KEYWORD(inaccessiblemem_or_argmemonly);
+  KEYWORD(address_is_null);
+  KEYWORD(address);
+  KEYWORD(provenance);
+  KEYWORD(read_provenance);
 
   // nofpclass attribute
   KEYWORD(all);
diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
index 52d48a69f0eb5..81d048b32e139 100644
--- a/llvm/lib/AsmParser/LLParser.cpp
+++ b/llvm/lib/AsmParser/LLParser.cpp
@@ -1644,6 +1644,8 @@ bool LLParser::parseEnumAttribute(Attribute::AttrKind Attr, AttrBuilder &B,
     return parseRangeAttr(B);
   case Attribute::Initializes:
     return parseInitializesAttr(B);
+  case Attribute::Captures:
+    return parseCapturesAttr(B);
   default:
     B.addAttribute(Attr);
     Lex.Lex();
@@ -3165,6 +3167,65 @@ bool LLParser::parseInitializesAttr(AttrBuilder &B) {
   return false;
 }
 
+bool LLParser::parseCapturesAttr(AttrBuilder &B) {
+  CaptureComponents Other = CaptureComponents::None;
+  std::optional<CaptureComponents> Ret;
+
+  // We use syntax like captures(ret: address, provenance), so the colon
+  // should not be interpreted as a label terminator.
+  Lex.setIgnoreColonInIdentifiers(true);
+  auto _ = make_scope_exit([&] { Lex.setIgnoreColonInIdentifiers(false); });
+
+  Lex.Lex();
+  if (parseToken(lltok::lparen, "expected '('"))
+    return true;
+
+  CaptureComponents *Current = &Other;
+  bool SeenComponent = false;
+  while (true) {
+    if (EatIfPresent(lltok::kw_ret)) {
+      if (parseToken(lltok::colon, "expected ':'"))
+        return true;
+      if (Ret)
+        return tokError("duplicate 'ret' location");
+      Ret = CaptureComponents::None;
+      Current = &*Ret;
+      SeenComponent = false;
+    }
+
+    if (EatIfPresent(lltok::kw_none)) {
+      if (SeenComponent)
+        return tokError("cannot use 'none' with other component");
+      *Current = CaptureComponents::None;
+    } else {
+      if (SeenComponent && capturesNothing(*Current))
+        return tokError("cannot use 'none' with other component");
+
+      if (EatIfPresent(lltok::kw_address_is_null))
+        *Current |= CaptureComponents::AddressIsNull;
+      else if (EatIfPresent(lltok::kw_address))
+        *Current |= CaptureComponents::Address;
+      else if (EatIfPresent(lltok::kw_provenance))
+        *Current |= CaptureComponents::Provenance;
+      else if (EatIfPresent(lltok::kw_read_provenance))
+        *Current |= CaptureComponents::ReadProvenance;
+      else
+        return tokError("expected one of 'none', 'address', 'address_is_null', "
+                        "'provenance' or 'read_provenance'");
+    }
+
+    SeenComponent = true;
+    if (EatIfPresent(lltok::rparen))
+      break;
+
+    if (parseToken(lltok::comma, "expected ',' or ')'"))
+      return true;
+  }
+
+  B.addCapturesAttr(CaptureInfo(Other, Ret.value_or(Other)));
+  return false;
+}
+
 /// parseOptionalOperandBundles
 ///    ::= /*empty*/
 ///    ::= '[' OperandBundle [, OperandBundle ]* ']'
diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
index a01ecf0d56642..56f5ff4b20e5d 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -2250,6 +2250,8 @@ static Attribute::AttrKind getAttrFromCode(uint64_t Code) {
     return Attribute::CoroElideSafe;
   case bitc::ATTR_KIND_NO_EXT:
     return Attribute::NoExt;
+  case bitc::ATTR_KIND_CAPTURES:
+    return Attribute::Captures;
   }
 }
 
@@ -2389,6 +2391,8 @@ Error BitcodeReader::parseAttributeGroupBlock() {
           B.addAllocKindAttr(static_cast<AllocFnKind>(Record[++i]));
         else if (Kind == Attribute::Memory)
           B.addMemoryAttr(MemoryEffects::createFromIntValue(Record[++i]));
+        else if (Kind == Attribute::Captures)
+          B.addCapturesAttr(CaptureInfo::createFromIntValue(Record[++i]));
         else if (Kind == Attribute::NoFPClass)
           B.addNoFPClassAttr(
              static_cast<FPClassTest>(Record[++i] & fcAllFlags));
diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
index b4efd3928a2e6..94d3afa6c1e33 100644
--- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -907,6 +907,8 @@ static uint64_t getAttrKindEncoding(Attribute::AttrKind Kind) {
     return bitc::ATTR_KIND_INITIALIZES;
   case Attribute::NoExt:
     return bitc::ATTR_KIND_NO_EXT;
+  case Attribute::Captures:
+    return bitc::ATTR_KIND_CAPTURES;
   case Attribute::EndAttrKinds:
     llvm_unreachable("Can not encode end-attribute kinds marker.");
   case Attribute::None:
diff --git a/llvm/lib/IR/AttributeImpl.h b/llvm/lib/IR/AttributeImpl.h
index 82c501dcafcb7..59cc489ade40d 100644
--- a/llvm/lib/IR/AttributeImpl.h
+++ b/llvm/lib/IR/AttributeImpl.h
@@ -346,6 +346,7 @@ class AttributeSetNode final
   UWTableKind getUWTableKind() const;
   AllocFnKind getAllocKind() const;
   MemoryEffects getMemoryEffects() const;
+  CaptureInfo getCaptureInfo() const;
   FPClassTest getNoFPClass() const;
   std::string 
getAsString(bool InAttrGrp) const; Type *getAttributeType(Attribute::AttrKind Kind) const; diff --git a/llvm/lib/IR/Attributes.cpp b/llvm/lib/IR/Attributes.cpp index e9daa01b899e8..ceb31856283c9 100644 --- a/llvm/lib/IR/Attributes.cpp +++ b/llvm/lib/IR/Attributes.cpp @@ -487,6 +487,12 @@ MemoryEffects Attribute::getMemoryEffects() const { return MemoryEffects::createFromIntValue(pImpl->getValueAsInt()); } +CaptureInfo Attribute::getCaptureInfo() const { + assert(hasAttribute(Attribute::Captures) && + "Can only call getCaptureInfo() on captures attribute"); + return CaptureInfo::createFromIntValue(pImpl->getValueAsInt()); +} + FPClassTest Attribute::getNoFPClass() const { assert(hasAttribute(Attribute::NoFPClass) && "Can only call getNoFPClass() on nofpclass attribute"); @@ -647,6 +653,13 @@ std::string Attribute::getAsString(bool InAttrGrp) const { return Result; } + if (hasAttribute(Attribute::Captures)) { + std::string Result; + raw_string_ostream OS(Result); + OS << getCaptureInfo(); + return Result; + } + if (hasAttribute(Attribute::NoFPClass)) { std::string Result = "nofpclass"; raw_string_ostream OS(Result); @@ -1050,6 +1063,10 @@ AttributeSet::intersectWith(LLVMContext &C, AttributeSet Other) const { Intersected.addMemoryAttr(Attr0.getMemoryEffects() | Attr1.getMemoryEffects()); break; + case Attribute::Captures: + Intersected.addCapturesAttr(Attr0.getCaptureInfo() | + Attr1.getCaptureInfo()); + break; case Attribute::NoFPClass: Intersected.addNoFPClassAttr(Attr0.getNoFPClass() & Attr1.getNoFPClass()); @@ -1170,6 +1187,10 @@ MemoryEffects AttributeSet::getMemoryEffects() const { return SetNode ? SetNode->getMemoryEffects() : MemoryEffects::unknown(); } +CaptureInfo AttributeSet::getCaptureInfo() const { + return SetNode ? SetNode->getCaptureInfo() : CaptureInfo::all(); +} + FPClassTest AttributeSet::getNoFPClass() const { return SetNode ? 
SetNode->getNoFPClass() : fcNone; } @@ -1358,6 +1379,12 @@ MemoryEffects AttributeSetNode::getMemoryEffects() const { return MemoryEffects::unknown(); } +CaptureInfo AttributeSetNode::getCaptureInfo() const { + if (auto A = findEnumAttribute(Attribute::Captures)) + return A->getCaptureInfo(); + return CaptureInfo::all(); +} + FPClassTest AttributeSetNode::getNoFPClass() const { if (auto A = findEnumAttribute(Attribute::NoFPClass)) return A->getNoFPClass(); @@ -2190,6 +2217,10 @@ AttrBuilder &AttrBuilder::addMemoryAttr(MemoryEffects ME) { return addRawIntAttr(Attribute::Memory, ME.toIntValue()); } +AttrBuilder &AttrBuilder::addCapturesAttr(CaptureInfo CI) { + return addRawIntAttr(Attribute::Captures, CI.toIntValue()); +} + AttrBuilder &AttrBuilder::addNoFPClassAttr(FPClassTest Mask) { if (Mask == fcNone) return *this; @@ -2350,7 +2381,8 @@ AttributeMask AttributeFuncs::typeIncompatible(Type *Ty, AttributeSet AS, .addAttribute(Attribute::DereferenceableOrNull) .addAttribute(Attribute::Writable) .addAttribute(Attribute::DeadOnUnwind) - .addAttribute(Attribute::Initializes); + .addAttribute(Attribute::Initializes) + .addAttribute(Attribute::Captures); if (ASK & ASK_UNSAFE_TO_DROP) Incompatible.addAttribute(Attribute::Nest) .addAttribute(Attribute::SwiftError) diff --git a/llvm/lib/Support/ModRef.cpp b/llvm/lib/Support/ModRef.cpp index a4eb70edd38d1..d3b3dd11171f1 100644 --- a/llvm/lib/Support/ModRef.cpp +++ b/llvm/lib/Support/ModRef.cpp @@ -12,6 +12,7 @@ #include "llvm/Support/ModRef.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringExtras.h" using namespace llvm; @@ -50,3 +51,36 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, MemoryEffects ME) { }); return OS; } + +raw_ostream &llvm::operator<<(raw_ostream &OS, CaptureComponents CC) { + if (capturesNothing(CC)) { + OS << "none"; + return OS; + } + + ListSeparator LS; + if (capturesAddressIsNullOnly(CC)) + OS << LS << "address_is_null"; + else if (capturesAddress(CC)) + OS << LS << "address"; + if (capturesReadProvenanceOnly(CC)) + OS << LS << "read_provenance"; + if (capturesFullProvenance(CC)) + OS << LS << "provenance"; + + return OS; +} + +raw_ostream &llvm::operator<<(raw_ostream &OS, CaptureInfo CI) { + ListSeparator LS; + CaptureComponents Other = CI.getOtherComponents(); + CaptureComponents Ret = CI.getRetComponents(); + + OS << "captures("; + if (!capturesNothing(Other) || Other == Ret) + OS << LS << Other; + if (Other != Ret) + OS << LS << "ret: " << Ret; + OS << ")"; + return OS; +} diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp index 7ddb9e22c8344..af9813775f242 100644 --- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -975,6 +975,7 @@ Function *CodeExtractor::constructFunctionDeclaration( case Attribute::AllocatedPointer: case Attribute::AllocAlign: case Attribute::ByVal: + case Attribute::Captures: case Attribute::Dereferenceable: case Attribute::DereferenceableOrNull: case Attribute::ElementType: diff --git a/llvm/test/Assembler/captures-errors.ll b/llvm/test/Assembler/captures-errors.ll new file mode 100644 index 0000000000000..44788c79a2453 --- /dev/null +++ b/llvm/test/Assembler/captures-errors.ll @@ -0,0 +1,73 @@ +; RUN: split-file --leading-lines %s %t +; RUN: not llvm-as < %t/missing-lparen.ll 2>&1 | FileCheck %s --check-prefix=CHECK-MISSING-LPAREN +; RUN: not llvm-as < %t/missing-rparen.ll 2>&1 | FileCheck %s --check-prefix=CHECK-MISSING-RPAREN +; RUN: not llvm-as < %t/missing-rparen-none.ll 2>&1 | 
FileCheck %s --check-prefix=CHECK-MISSING-RPAREN-NONE +; RUN: not llvm-as < %t/missing-colon.ll 2>&1 | FileCheck %s --check-prefix=CHECK-MISSING-COLON +; RUN: not llvm-as < %t/invalid-component.ll 2>&1 | FileCheck %s --check-prefix=CHECK-INVALID-COMPONENT +; RUN: not llvm-as < %t/duplicate-ret.ll 2>&1 | FileCheck %s --check-prefix=CHECK-DUPLICATE-RET +; RUN: not llvm-as < %t/none-after.ll 2>&1 | FileCheck %s --check-prefix=CHECK-NONE-AFTER +; RUN: not llvm-as < %t/none-before.ll 2>&1 | FileCheck %s --check-prefix=CHECK-NONE-BEFORE +; RUN: not opt -disable-output < %t/non-pointer-type.ll 2>&1 | FileCheck %s --check-prefix=CHECK-NON-POINTER-TYPE + +;--- missing-lparen.ll + +; CHECK-MISSING-LPAREN: :[[@LINE+1]]:32: error: expected '(' +define void @test(ptr captures %p) { + ret void +} + +;--- missing-rparen.ll + +; CHECK-MISSING-RPAREN: :[[@LINE+1]]:40: error: expected ',' or ')' +define void @test(ptr captures(address %p) { + ret void +} + +;--- missing-rparen-none.ll + +; CHECK-MISSING-RPAREN-NONE: :[[@LINE+1]]:37: error: expected ',' or ')' +define void @test(ptr captures(none %p) { + ret void +} + +;--- missing-colon.ll + +; CHECK-MISSING-COLON: :[[@LINE+1]]:36: error: expected ':' +define void @test(ptr captures(ret address) %p) { + ret void +} + +;--- invalid-component.ll + +; CHECK-INVALID-COMPONENT: :[[@LINE+1]]:32: error: expected one of 'none', 'address', 'address_is_null', 'provenance' or 'read_provenance' +define void @test(ptr captures(foo) %p) { + ret void +} + +;--- duplicate-ret.ll + +; CHECK-DUPLICATE-RET: :[[@LINE+1]]:51: error: duplicate 'ret' location +define void @test(ptr captures(ret: address, ret: provenance) %p) { + ret void +} + +;--- none-after.ll + +; CHECK-NONE-AFTER: :[[@LINE+1]]:45: error: cannot use 'none' with other component +define void @test(ptr captures(address, none) %p) { + ret void +} + +;--- none-before.ll + +; CHECK-NONE-BEFORE: :[[@LINE+1]]:38: error: cannot use 'none' with other component +define void @test(ptr captures(none, address) %p) { + ret void +} + +;--- non-pointer-type.ll + +; CHECK-NON-POINTER-TYPE: Attribute 'captures(none)' applied to incompatible type! 
+define void @test(i32 captures(none) %p) {
+  ret void
+}
diff --git a/llvm/test/Assembler/captures.ll b/llvm/test/Assembler/captures.ll
new file mode 100644
index 0000000000000..1521a9df0cb42
--- /dev/null
+++ b/llvm/test/Assembler/captures.ll
@@ -0,0 +1,103 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S < %s | FileCheck %s
+; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+
+define void @test_none(ptr captures(none) %p) {
+; CHECK-LABEL: define void @test_none(
+; CHECK-SAME: ptr captures(none) [[P:%.*]]) {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+
+define void @test_address(ptr captures(address) %p) {
+; CHECK-LABEL: define void @test_address(
+; CHECK-SAME: ptr captures(address) [[P:%.*]]) {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+
+define void @test_address_is_null(ptr captures(address_is_null) %p) {
+; CHECK-LABEL: define void @test_address_is_null(
+; CHECK-SAME: ptr captures(address_is_null) [[P:%.*]]) {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+
+define void @test_address_provenance(ptr captures(address, provenance) %p) {
+; CHECK-LABEL: define void @test_address_provenance(
+; CHECK-SAME: ptr captures(address, provenance) [[P:%.*]]) {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+
+define void @test_address_read_provenance(ptr captures(address, read_provenance) %p) {
+; CHECK-LABEL: define void @test_address_read_provenance(
+; CHECK-SAME: ptr captures(address, read_provenance) [[P:%.*]]) {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+
+define void @test_ret(ptr captures(ret: address, provenance) %p) {
+; CHECK-LABEL: define void @test_ret(
+; CHECK-SAME: ptr captures(ret: address, provenance) [[P:%.*]]) {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+
+define void @test_address_is_null_and_ret(ptr captures(address_is_null, ret: address, provenance) %p) {
+; CHECK-LABEL: define void @test_address_is_null_and_ret(
+; CHECK-SAME: ptr captures(address_is_null, ret: address, provenance) [[P:%.*]]) {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+
+define void @test_address_and_ret_none(ptr captures(address, ret: none) %p) {
+; CHECK-LABEL: define void @test_address_and_ret_none(
+; CHECK-SAME: ptr captures(address, ret: none) [[P:%.*]]) {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+
+; Duplicates collapse into one.
+define void @test_duplicate(ptr captures(address, address) %p) {
+; CHECK-LABEL: define void @test_duplicate(
+; CHECK-SAME: ptr captures(address) [[P:%.*]]) {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+
+; read_provenance is a subset of provenance.
+define void @test_duplicate_read_provenance(ptr captures(read_provenance, provenance) %p) {
+; CHECK-LABEL: define void @test_duplicate_read_provenance(
+; CHECK-SAME: ptr captures(provenance) [[P:%.*]]) {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+
+; address_is_null is a subset of address.
+define void @test_duplicate_address_is_null(ptr captures(address_is_null, address) %p) {
+; CHECK-LABEL: define void @test_duplicate_address_is_null(
+; CHECK-SAME: ptr captures(address) [[P:%.*]]) {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+
+; Return-only none is the same as plain none.
+define void @test_ret_none(ptr captures(ret: none) %p) { +; CHECK-LABEL: define void @test_ret_none( +; CHECK-SAME: ptr captures(none) [[P:%.*]]) { +; CHECK-NEXT: ret void +; + ret void +} diff --git a/llvm/test/Bitcode/attributes.ll b/llvm/test/Bitcode/attributes.ll index 492de663884df..1da9291c71996 100644 --- a/llvm/test/Bitcode/attributes.ll +++ b/llvm/test/Bitcode/attributes.ll @@ -562,6 +562,11 @@ define void @initializes(ptr initializes((-4, 0), (4, 8)) %a) { ret void } +; CHECK: define void @captures(ptr captures(address) %p) +define void @captures(ptr captures(address) %p) { + ret void +} + ; CHECK: attributes #0 = { noreturn } ; CHECK: attributes #1 = { nounwind } ; CHECK: attributes #2 = { memory(none) } diff --git a/llvm/unittests/IR/AttributesTest.cpp b/llvm/unittests/IR/AttributesTest.cpp index f73f2b20e9fea..f0e34aa273369 100644 --- a/llvm/unittests/IR/AttributesTest.cpp +++ b/llvm/unittests/IR/AttributesTest.cpp @@ -437,6 +437,14 @@ TEST(Attributes, SetIntersect) { break; case Attribute::Range: break; + case Attribute::Captures: + V0 = CaptureInfo(CaptureComponents::AddressIsNull, + CaptureComponents::None) + .toIntValue(); + V1 = CaptureInfo(CaptureComponents::None, + CaptureComponents::ReadProvenance) + .toIntValue(); + break; default: ASSERT_FALSE(true); } @@ -516,6 +524,11 @@ TEST(Attributes, SetIntersect) { ASSERT_EQ(Res->getAttribute(Kind).getRange(), ConstantRange(APInt(32, 0), APInt(32, 20))); break; + case Attribute::Captures: + ASSERT_EQ(Res->getCaptureInfo(), + CaptureInfo(CaptureComponents::AddressIsNull, + CaptureComponents::ReadProvenance)); + break; default: ASSERT_FALSE(true); } From ba20f9bda9e66526bdaf943380fd49c7abdbd18f Mon Sep 17 00:00:00 2001 From: Lukacma Date: Mon, 13 Jan 2025 13:44:15 +0000 Subject: [PATCH 053/102] [AArch64] Change feature dependencies of fp8 features (#122280) This patch simplifies feature dependencies of FP8 features and also adds new tests to check these. 
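As a rough illustration (abbreviated from the test updates below), a typical
RUN line changes from

  ... -target-feature +sve2 -target-feature +fp8 -target-feature +fp8dot2 ...

to

  ... -target-feature +sve2 -target-feature +fp8dot2 ...

That is, the tests no longer pass +fp8 (or +sme2) explicitly next to the
dot-product features; with the simplified dependencies, the dot-product
feature alone is expected to be sufficient.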
--- .../fp8-intrinsics/acle_sve2_fp8_fdot.c | 12 ++++---- .../sme2-intrinsics/acle_sme2_fp8_fdot.c | 10 +++---- .../sme2-intrinsics/acle_sme2_fp8_fvdot.c | 10 +++---- .../acle_sme2_fp8_imm.c | 2 +- llvm/lib/Target/AArch64/AArch64Features.td | 10 +++---- llvm/test/CodeGen/AArch64/fp8-sve-fdot.ll | 4 +-- .../TargetParser/TargetParserTest.cpp | 29 +++++++++++++++++-- 7 files changed, 51 insertions(+), 26 deletions(-) diff --git a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_fdot.c b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_fdot.c index 950a19115811e..2f3994df03784 100644 --- a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_fdot.c +++ b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_fdot.c @@ -1,12 +1,12 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +fp8 -target-feature +fp8dot2 -target-feature +fp8dot4 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +fp8 -target-feature +ssve-fp8dot2 -target-feature +ssve-fp8dot4 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CHECK-CXX +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +fp8dot2 -target-feature +fp8dot4 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +ssve-fp8dot2 -target-feature +ssve-fp8dot4 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CHECK-CXX -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +fp8 -target-feature +fp8dot2 -target-feature +fp8dot4 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -x c++ -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +fp8 -target-feature +ssve-fp8dot2 -target-feature +ssve-fp8dot4 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CHECK-CXX +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +fp8dot2 -target-feature +fp8dot4 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -x c++ -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +ssve-fp8dot2 -target-feature +ssve-fp8dot4 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CHECK-CXX -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +fp8 -target-feature +fp8dot2 -target-feature +fp8dot4 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +fp8 -target-feature +ssve-fp8dot2 -target-feature 
+ssve-fp8dot4 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +fp8dot2 -target-feature +fp8dot4 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +ssve-fp8dot2 -target-feature +ssve-fp8dot4 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s // REQUIRES: aarch64-registered-target diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_fp8_fdot.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_fp8_fdot.c index a151d162e0108..2da4ab541869e 100644 --- a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_fp8_fdot.c +++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_fp8_fdot.c @@ -1,11 +1,11 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -target-feature +sme-f8f32 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme-f8f16 -target-feature +sme-f8f32 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include diff --git 
a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_fp8_fvdot.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_fp8_fvdot.c index fc95cf541172a..8353b3aebc9fc 100644 --- a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_fp8_fvdot.c +++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_fp8_fvdot.c @@ -1,10 +1,10 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 -// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -target-feature +sme-f8f32 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme-f8f16 -target-feature +sme-f8f32 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s // REQUIRES: aarch64-registered-target diff --git a/clang/test/Sema/aarch64-fp8-intrinsics/acle_sme2_fp8_imm.c b/clang/test/Sema/aarch64-fp8-intrinsics/acle_sme2_fp8_imm.c index bea0b29bcc70a..fd5374d928ea9 100644 --- a/clang/test/Sema/aarch64-fp8-intrinsics/acle_sme2_fp8_imm.c +++ b/clang/test/Sema/aarch64-fp8-intrinsics/acle_sme2_fp8_imm.c @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -target-feature +sme-f8f32 -fsyntax-only -verify %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature 
+sme-f8f16 -target-feature +sme-f8f32 -fsyntax-only -verify %s // REQUIRES: aarch64-registered-target diff --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td index 41eb9a73bd013..5a233e2d870b3 100644 --- a/llvm/lib/Target/AArch64/AArch64Features.td +++ b/llvm/lib/Target/AArch64/AArch64Features.td @@ -486,16 +486,16 @@ def FeatureSSVE_FP8FMA : ExtensionWithMArch<"ssve-fp8fma", "SSVE_FP8FMA", "FEAT_ "Enable SVE2 FP8 multiply-add instructions", [FeatureSME2, FeatureFP8]>; def FeatureFP8DOT4: ExtensionWithMArch<"fp8dot4", "FP8DOT4", "FEAT_FP8DOT4", - "Enable FP8 4-way dot instructions", [FeatureFP8FMA]>; + "Enable FP8 4-way dot instructions", [FeatureNEON, FeatureFP8]>; def FeatureFP8DOT2: ExtensionWithMArch<"fp8dot2", "FP8DOT2", "FEAT_FP8DOT2", - "Enable FP8 2-way dot instructions", [FeatureFP8DOT4]>; + "Enable FP8 2-way dot instructions", [FeatureNEON, FeatureFP8]>; def FeatureSSVE_FP8DOT4 : ExtensionWithMArch<"ssve-fp8dot4", "SSVE_FP8DOT4", "FEAT_SSVE_FP8DOT4", - "Enable SVE2 FP8 4-way dot product instructions", [FeatureSSVE_FP8FMA]>; + "Enable SVE2 FP8 4-way dot product instructions", [FeatureSME2, FeatureFP8]>; def FeatureSSVE_FP8DOT2 : ExtensionWithMArch<"ssve-fp8dot2", "SSVE_FP8DOT2", "FEAT_SSVE_FP8DOT2", - "Enable SVE2 FP8 2-way dot product instructions", [FeatureSSVE_FP8DOT4]>; + "Enable SVE2 FP8 2-way dot product instructions", [FeatureSME2, FeatureFP8]>; def FeatureSME_LUTv2 : ExtensionWithMArch<"sme-lutv2", "SME_LUTv2", "FEAT_SME_LUTv2", "Enable Scalable Matrix Extension (SME) LUTv2 instructions", [FeatureSME2]>; @@ -504,7 +504,7 @@ def FeatureSMEF8F32 : ExtensionWithMArch<"sme-f8f32", "SMEF8F32", "FEAT_SME_F8F3 "Enable Scalable Matrix Extension (SME) F8F32 instructions", [FeatureSME2, FeatureFP8]>; def FeatureSMEF8F16 : ExtensionWithMArch<"sme-f8f16", "SMEF8F16", "FEAT_SME_F8F16", - "Enable Scalable Matrix Extension (SME) F8F16 instructions", [FeatureSMEF8F32]>; + "Enable Scalable Matrix Extension (SME) F8F16 instructions", [FeatureSME2, FeatureFP8]>; def FeatureCPA : ExtensionWithMArch<"cpa", "CPA", "FEAT_CPA", "Enable Armv9.5-A Checked Pointer Arithmetic">; diff --git a/llvm/test/CodeGen/AArch64/fp8-sve-fdot.ll b/llvm/test/CodeGen/AArch64/fp8-sve-fdot.ll index 0cead19a74bfd..478404dcd50aa 100644 --- a/llvm/test/CodeGen/AArch64/fp8-sve-fdot.ll +++ b/llvm/test/CodeGen/AArch64/fp8-sve-fdot.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mattr=+sve2,+fp8,+fp8dot2,+fp8dot4 < %s | FileCheck %s -; RUN: llc -mattr=+sme,+fp8,+ssve-fp8dot2,+ssve-fp8dot4 --force-streaming < %s | FileCheck %s +; RUN: llc -mattr=+sve2,+fp8dot2,+fp8dot4 < %s | FileCheck %s +; RUN: llc -mattr=+sme,+ssve-fp8dot2,+ssve-fp8dot4 --force-streaming < %s | FileCheck %s target triple = "aarch64-linux" diff --git a/llvm/unittests/TargetParser/TargetParserTest.cpp b/llvm/unittests/TargetParser/TargetParserTest.cpp index 1f69190e4bec5..c03d3e8575d81 100644 --- a/llvm/unittests/TargetParser/TargetParserTest.cpp +++ b/llvm/unittests/TargetParser/TargetParserTest.cpp @@ -1803,7 +1803,7 @@ AArch64ExtensionDependenciesBaseArchTestParams {AArch64::ARMV9_6A, {"nofp", "fprcvt"}, {"fp-armv8", "fprcvt"}, {}}, {AArch64::ARMV9_6A, {"fprcvt", "nofp"}, {}, {"fp-armv8", "fprcvt"}}, - // simd -> {aes, sha2, sha3, sm4, f8f16mm, f8f32mm} + // simd -> {aes, sha2, sha3, sm4, f8f16mm, f8f32mm, fp8dot4, fp8dot2} {AArch64::ARMV8A, {"nosimd", "aes"}, {"neon", "aes"}, {}}, {AArch64::ARMV8A, {"aes", "nosimd"}, {}, 
{"neon", "aes"}}, {AArch64::ARMV8A, {"nosimd", "sha2"}, {"neon", "sha2"}, {}}, @@ -1816,6 +1816,10 @@ AArch64ExtensionDependenciesBaseArchTestParams {AArch64::ARMV9_6A, {"f8f16mm", "nosimd"}, {}, {"neon", "f8f16mm"}}, {AArch64::ARMV9_6A, {"nosimd", "f8f32mm"}, {"neon", "f8f32mm"}, {}}, {AArch64::ARMV9_6A, {"f8f32mm", "nosimd"}, {}, {"neon", "f8f32mm"}}, + {AArch64::ARMV9_6A, {"nosimd", "fp8dot4"}, {"neon", "fp8dot4"}, {}}, + {AArch64::ARMV9_6A, {"fp8dot4", "nosimd"}, {}, {"neon", "fp8dot4"}}, + {AArch64::ARMV9_6A, {"nosimd", "fp8dot2"}, {"neon", "fp8dot2"}, {}}, + {AArch64::ARMV9_6A, {"fp8dot2", "nosimd"}, {}, {"neon", "fp8dot2"}}, // simd -> {rdm, dotprod, fcma} {AArch64::ARMV8A, {"nosimd", "rdm"}, {"neon", "rdm"}, {}}, @@ -1940,7 +1944,8 @@ AArch64ExtensionDependenciesBaseArchTestParams {AArch64::ARMV9_6A, {"nosme2p1", "sme2p2"}, {"sme2p2", "sme2p1"}, {}}, {AArch64::ARMV9_6A, {"sme2p2", "nosme2p1"}, {}, {"sme2p1", "sme2p2"}}, - // fp8 -> {sme-f8f16, sme-f8f32, f8f16mm, f8f32mm} + // fp8 -> {sme-f8f16, sme-f8f32, f8f16mm, f8f32mm, fp8dot4, fp8dot2, + // ssve-fp8dot4, ssve-fp8dot2} {AArch64::ARMV8A, {"nofp8", "sme-f8f16"}, {"fp8", "sme-f8f16"}, {}}, {AArch64::ARMV8A, {"sme-f8f16", "nofp8"}, {}, {"fp8", "sme-f8f16"}}, {AArch64::ARMV8A, {"nofp8", "sme-f8f32"}, {"fp8", "sme-f8f32"}, {}}, @@ -1949,6 +1954,26 @@ AArch64ExtensionDependenciesBaseArchTestParams {AArch64::ARMV9_6A, {"f8f16mm", "nofp8"}, {}, {"fp8", "f8f16mm"}}, {AArch64::ARMV9_6A, {"nofp8", "f8f32mm"}, {"fp8", "f8f32mm"}, {}}, {AArch64::ARMV9_6A, {"f8f32mm", "nofp8"}, {}, {"fp8", "f8f32mm"}}, + {AArch64::ARMV9_6A, {"nofp8", "fp8dot4"}, {"fp8", "fp8dot4"}, {}}, + {AArch64::ARMV9_6A, {"fp8dot4", "nofp8"}, {}, {"fp8", "fp8dot4"}}, + {AArch64::ARMV9_6A, {"nofp8", "fp8dot2"}, {"fp8", "fp8dot2"}, {}}, + {AArch64::ARMV9_6A, {"fp8dot2", "nofp8"}, {}, {"fp8", "fp8dot2"}}, + {AArch64::ARMV9_6A, + {"nofp8", "ssve-fp8dot4"}, + {"fp8", "ssve-fp8dot4"}, + {}}, + {AArch64::ARMV9_6A, + {"ssve-fp8dot4", "nofp8"}, + {}, + {"fp8", "ssve-fp8dot4"}}, + {AArch64::ARMV9_6A, + {"nofp8", "ssve-fp8dot2"}, + {"fp8", "ssve-fp8dot2"}, + {}}, + {AArch64::ARMV9_6A, + {"ssve-fp8dot2", "nofp8"}, + {}, + {"fp8", "ssve-fp8dot2"}}, // lse -> lse128 {AArch64::ARMV8A, {"nolse", "lse128"}, {"lse", "lse128"}, {}}, From 35d472078974a255fa6e84b223673b33926a60c1 Mon Sep 17 00:00:00 2001 From: David Pagan Date: Mon, 13 Jan 2025 05:44:48 -0800 Subject: [PATCH 054/102] [clang][OpenMP] Add 'align' modifier for 'allocate' clause (#121814) The 'align' modifier is now accepted in the 'allocate' clause. Added LIT tests covering codegen, PCH, template handling, and serialization for 'align' modifier. Added support for align-modifier to release notes. Testing - New allocate modifier LIT tests. - OpenMP LIT tests. 
- check-all --- clang/docs/OpenMPSupport.rst | 2 + clang/docs/ReleaseNotes.rst | 1 + clang/include/clang/AST/OpenMPClause.h | 92 +++- .../clang/Basic/DiagnosticParseKinds.td | 2 + clang/include/clang/Basic/OpenMPKinds.def | 1 + clang/include/clang/Basic/OpenMPKinds.h | 4 + clang/include/clang/Sema/SemaOpenMP.h | 20 +- clang/lib/AST/OpenMPClause.cpp | 58 ++- clang/lib/Parse/ParseOpenMP.cpp | 96 +++- clang/lib/Sema/SemaOpenMP.cpp | 102 +++-- clang/lib/Sema/TreeTransform.h | 30 +- clang/lib/Serialization/ASTReader.cpp | 4 +- clang/lib/Serialization/ASTWriter.cpp | 4 +- .../allocate_allocator_modifier_codegen.cpp | 255 ----------- .../allocate_allocator_modifier_messages.cpp | 97 ----- ...t.cpp => allocate_modifiers_ast_print.cpp} | 77 +++- .../OpenMP/allocate_modifiers_codegen.cpp | 409 ++++++++++++++++++ .../OpenMP/allocate_modifiers_messages.cpp | 159 +++++++ 18 files changed, 962 insertions(+), 451 deletions(-) delete mode 100644 clang/test/OpenMP/allocate_allocator_modifier_codegen.cpp delete mode 100644 clang/test/OpenMP/allocate_allocator_modifier_messages.cpp rename clang/test/OpenMP/{allocate_allocator_modifier_ast_print.cpp => allocate_modifiers_ast_print.cpp} (51%) create mode 100644 clang/test/OpenMP/allocate_modifiers_codegen.cpp create mode 100644 clang/test/OpenMP/allocate_modifiers_messages.cpp diff --git a/clang/docs/OpenMPSupport.rst b/clang/docs/OpenMPSupport.rst index a1cb7fe359ebf..673c34bf08a4a 100644 --- a/clang/docs/OpenMPSupport.rst +++ b/clang/docs/OpenMPSupport.rst @@ -286,6 +286,8 @@ implementation. +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ | memory management | 'allocator' modifier for allocate clause | :good:`done` | https://github.com/llvm/llvm-project/pull/114883 | +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| memory management | 'align' modifier for allocate clause | :good:`done` | https://github.com/llvm/llvm-project/pull/121814 | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ | memory management | new memory management routines | :none:`unclaimed` | | +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ | memory management | changes to omp_alloctrait_key enum | :none:`unclaimed` | | diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 8f4adbcd70518..9eeb872aa57d7 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -1370,6 +1370,7 @@ OpenMP Support always build support for AMDGPU and NVPTX targets. - Added support for combined masked constructs 'omp parallel masked taskloop', 'omp parallel masked taskloop simd','omp masked taskloop' and 'omp masked taskloop simd' directive. +- Added support for align-modifier in 'allocate' clause. 
Improvements ^^^^^^^^^^^^ diff --git a/clang/include/clang/AST/OpenMPClause.h b/clang/include/clang/AST/OpenMPClause.h index d2f5267e4da5e..b9088eff3bb52 100644 --- a/clang/include/clang/AST/OpenMPClause.h +++ b/clang/include/clang/AST/OpenMPClause.h @@ -498,6 +498,9 @@ class OMPAllocateClause final /// Allocator specified in the clause, or 'nullptr' if the default one is /// used. Expr *Allocator = nullptr; + /// Alignment specified in the clause, or 'nullptr' if the default one is + /// used. + Expr *Alignment = nullptr; /// Position of the ':' delimiter in the clause; SourceLocation ColonLoc; /// Modifier of 'allocate' clause. @@ -505,6 +508,41 @@ class OMPAllocateClause final /// Location of allocator modifier if any. SourceLocation AllocatorModifierLoc; + // ---------------------------------------------------------------------------- + + /// Modifiers for 'allocate' clause. + enum { FIRST, SECOND, NUM_MODIFIERS }; + OpenMPAllocateClauseModifier Modifiers[NUM_MODIFIERS]; + + /// Locations of modifiers. + SourceLocation ModifiersLoc[NUM_MODIFIERS]; + + /// Set the first allocate modifier. + /// + /// \param M Allocate modifier. + void setFirstAllocateModifier(OpenMPAllocateClauseModifier M) { + Modifiers[FIRST] = M; + } + + /// Set the second allocate modifier. + /// + /// \param M Allocate modifier. + void setSecondAllocateModifier(OpenMPAllocateClauseModifier M) { + Modifiers[SECOND] = M; + } + + /// Set location of the first allocate modifier. + void setFirstAllocateModifierLoc(SourceLocation Loc) { + ModifiersLoc[FIRST] = Loc; + } + + /// Set location of the second allocate modifier. + void setSecondAllocateModifierLoc(SourceLocation Loc) { + ModifiersLoc[SECOND] = Loc; + } + + // ---------------------------------------------------------------------------- + /// Build clause with number of variables \a N. /// /// \param StartLoc Starting location of the clause. @@ -514,15 +552,20 @@ class OMPAllocateClause final /// \param EndLoc Ending location of the clause. /// \param N Number of the variables in the clause. OMPAllocateClause(SourceLocation StartLoc, SourceLocation LParenLoc, - Expr *Allocator, SourceLocation ColonLoc, - OpenMPAllocateClauseModifier AllocatorModifier, - SourceLocation AllocatorModifierLoc, SourceLocation EndLoc, + Expr *Allocator, Expr *Alignment, SourceLocation ColonLoc, + OpenMPAllocateClauseModifier Modifier1, + SourceLocation Modifier1Loc, + OpenMPAllocateClauseModifier Modifier2, + SourceLocation Modifier2Loc, SourceLocation EndLoc, unsigned N) : OMPVarListClause(llvm::omp::OMPC_allocate, StartLoc, LParenLoc, EndLoc, N), - Allocator(Allocator), ColonLoc(ColonLoc), - AllocatorModifier(AllocatorModifier), - AllocatorModifierLoc(AllocatorModifierLoc) {} + Allocator(Allocator), Alignment(Alignment), ColonLoc(ColonLoc) { + Modifiers[FIRST] = Modifier1; + Modifiers[SECOND] = Modifier2; + ModifiersLoc[FIRST] = Modifier1Loc; + ModifiersLoc[SECOND] = Modifier2Loc; + } /// Build an empty clause. /// @@ -530,7 +573,10 @@ class OMPAllocateClause final explicit OMPAllocateClause(unsigned N) : OMPVarListClause(llvm::omp::OMPC_allocate, SourceLocation(), SourceLocation(), - SourceLocation(), N) {} + SourceLocation(), N) { + Modifiers[FIRST] = OMPC_ALLOCATE_unknown; + Modifiers[SECOND] = OMPC_ALLOCATE_unknown; + } /// Sets location of ':' symbol in clause. 
void setColonLoc(SourceLocation CL) { ColonLoc = CL; } @@ -539,6 +585,7 @@ class OMPAllocateClause final void setAllocatorModifier(OpenMPAllocateClauseModifier AM) { AllocatorModifier = AM; } + void setAlignment(Expr *A) { Alignment = A; } public: /// Creates clause with a list of variables \a VL. @@ -554,19 +601,42 @@ class OMPAllocateClause final /// \param VL List of references to the variables. static OMPAllocateClause * Create(const ASTContext &C, SourceLocation StartLoc, SourceLocation LParenLoc, - Expr *Allocator, SourceLocation ColonLoc, - OpenMPAllocateClauseModifier AllocatorModifier, - SourceLocation AllocatorModifierLoc, SourceLocation EndLoc, - ArrayRef VL); + Expr *Allocator, Expr *Alignment, SourceLocation ColonLoc, + OpenMPAllocateClauseModifier Modifier1, SourceLocation Modifier1Loc, + OpenMPAllocateClauseModifier Modifier2, SourceLocation Modifier2Loc, + SourceLocation EndLoc, ArrayRef VL); /// Returns the allocator expression or nullptr, if no allocator is specified. Expr *getAllocator() const { return Allocator; } + /// Returns the alignment expression or nullptr, if no alignment specified. + Expr *getAlignment() const { return Alignment; } + /// Return 'allocate' modifier. OpenMPAllocateClauseModifier getAllocatorModifier() const { return AllocatorModifier; } + /// Get the first modifier of the clause. + OpenMPAllocateClauseModifier getFirstAllocateModifier() const { + return Modifiers[FIRST]; + } + + /// Get location of first modifier of the clause. + SourceLocation getFirstAllocateModifierLoc() const { + return ModifiersLoc[FIRST]; + } + + /// Get the second modifier of the clause. + OpenMPAllocateClauseModifier getSecondAllocateModifier() const { + return Modifiers[SECOND]; + } + + /// Get location of second modifier of the clause. + SourceLocation getSecondAllocateModifierLoc() const { + return ModifiersLoc[SECOND]; + } + /// Returns the location of the ':' delimiter. SourceLocation getColonLoc() const { return ColonLoc; } /// Return the location of the modifier. diff --git a/clang/include/clang/Basic/DiagnosticParseKinds.td b/clang/include/clang/Basic/DiagnosticParseKinds.td index 86fcae209c40d..3309f59a981fc 100644 --- a/clang/include/clang/Basic/DiagnosticParseKinds.td +++ b/clang/include/clang/Basic/DiagnosticParseKinds.td @@ -1658,6 +1658,8 @@ def warn_omp_depend_in_ordered_deprecated : Warning<"'depend' clause for" def warn_omp_invalid_attribute_for_ompx_attributes : Warning<"'ompx_attribute' clause only allows " "'amdgpu_flat_work_group_size', 'amdgpu_waves_per_eu', and 'launch_bounds'; " "%0 is ignored">, InGroup; +def err_omp_duplicate_modifier : Error<"duplicate modifier '%0' in '%1' clause">; +def err_omp_expected_modifier : Error<"expected modifier in '%0' clause">; // Pragma loop support. def err_pragma_loop_missing_argument : Error< diff --git a/clang/include/clang/Basic/OpenMPKinds.def b/clang/include/clang/Basic/OpenMPKinds.def index 3f25e7aafe23b..76a861f416fd5 100644 --- a/clang/include/clang/Basic/OpenMPKinds.def +++ b/clang/include/clang/Basic/OpenMPKinds.def @@ -219,6 +219,7 @@ OPENMP_NUMTASKS_MODIFIER(strict) // Modifiers for 'allocate' clause. OPENMP_ALLOCATE_MODIFIER(allocator) +OPENMP_ALLOCATE_MODIFIER(align) // Modifiers for the 'doacross' clause. 
OPENMP_DOACROSS_MODIFIER(source) diff --git a/clang/include/clang/Basic/OpenMPKinds.h b/clang/include/clang/Basic/OpenMPKinds.h index 900ad6ca6d66f..3e5da2a6abc01 100644 --- a/clang/include/clang/Basic/OpenMPKinds.h +++ b/clang/include/clang/Basic/OpenMPKinds.h @@ -230,6 +230,10 @@ enum OpenMPAllocateClauseModifier { OMPC_ALLOCATE_unknown }; +/// Number of allowed allocate-modifiers. +static constexpr unsigned NumberOfOMPAllocateClauseModifiers = + OMPC_ALLOCATE_unknown; + /// Contains 'interop' data for 'append_args' and 'init' clauses. class Expr; struct OMPInteropInfo final { diff --git a/clang/include/clang/Sema/SemaOpenMP.h b/clang/include/clang/Sema/SemaOpenMP.h index 3d1cc4fab1c10..a056a96f50233 100644 --- a/clang/include/clang/Sema/SemaOpenMP.h +++ b/clang/include/clang/Sema/SemaOpenMP.h @@ -1148,7 +1148,12 @@ class SemaOpenMP : public SemaBase { SourceLocation OmpAllMemoryLoc; SourceLocation StepModifierLoc; /// 'step' modifier location for linear clause - OpenMPAllocateClauseModifier AllocClauseModifier = OMPC_ALLOCATE_unknown; + SmallVector + AllocClauseModifiers; + SmallVector + AllocClauseModifiersLoc; + Expr *AllocateAlignment = nullptr; }; OMPClause *ActOnOpenMPVarListClause(OpenMPClauseKind Kind, @@ -1166,10 +1171,15 @@ class SemaOpenMP : public SemaBase { SourceLocation LParenLoc, SourceLocation EndLoc); /// Called on well-formed 'allocate' clause. - OMPClause *ActOnOpenMPAllocateClause( - Expr *Allocator, OpenMPAllocateClauseModifier ACModifier, - ArrayRef VarList, SourceLocation StartLoc, - SourceLocation ColonLoc, SourceLocation LParenLoc, SourceLocation EndLoc); + OMPClause * + ActOnOpenMPAllocateClause(Expr *Allocator, Expr *Alignment, + OpenMPAllocateClauseModifier FirstModifier, + SourceLocation FirstModifierLoc, + OpenMPAllocateClauseModifier SecondModifier, + SourceLocation SecondModifierLoc, + ArrayRef VarList, SourceLocation StartLoc, + SourceLocation ColonLoc, SourceLocation LParenLoc, + SourceLocation EndLoc); /// Called on well-formed 'private' clause. OMPClause *ActOnOpenMPPrivateClause(ArrayRef VarList, SourceLocation StartLoc, diff --git a/clang/lib/AST/OpenMPClause.cpp b/clang/lib/AST/OpenMPClause.cpp index 4246ba95d827f..532933d6183ce 100644 --- a/clang/lib/AST/OpenMPClause.cpp +++ b/clang/lib/AST/OpenMPClause.cpp @@ -1019,19 +1019,18 @@ OMPPartialClause *OMPPartialClause::CreateEmpty(const ASTContext &C) { return new (C) OMPPartialClause(); } -OMPAllocateClause * -OMPAllocateClause::Create(const ASTContext &C, SourceLocation StartLoc, - SourceLocation LParenLoc, Expr *Allocator, - SourceLocation ColonLoc, - OpenMPAllocateClauseModifier AllocatorModifier, - SourceLocation AllocatorModifierLoc, - SourceLocation EndLoc, ArrayRef VL) { +OMPAllocateClause *OMPAllocateClause::Create( + const ASTContext &C, SourceLocation StartLoc, SourceLocation LParenLoc, + Expr *Allocator, Expr *Alignment, SourceLocation ColonLoc, + OpenMPAllocateClauseModifier Modifier1, SourceLocation Modifier1Loc, + OpenMPAllocateClauseModifier Modifier2, SourceLocation Modifier2Loc, + SourceLocation EndLoc, ArrayRef VL) { // Allocate space for private variables and initializer expressions. 
void *Mem = C.Allocate(totalSizeToAlloc(VL.size())); auto *Clause = new (Mem) OMPAllocateClause( - StartLoc, LParenLoc, Allocator, ColonLoc, AllocatorModifier, - AllocatorModifierLoc, EndLoc, VL.size()); + StartLoc, LParenLoc, Allocator, Alignment, ColonLoc, Modifier1, + Modifier1Loc, Modifier2, Modifier2Loc, EndLoc, VL.size()); Clause->setVarRefs(VL); return Clause; @@ -2245,21 +2244,48 @@ void OMPClausePrinter::VisitOMPClauseList(T *Node, char StartSym) { void OMPClausePrinter::VisitOMPAllocateClause(OMPAllocateClause *Node) { if (Node->varlist_empty()) return; + + Expr *FirstModifier = nullptr; + Expr *SecondModifier = nullptr; + auto FirstAllocMod = Node->getFirstAllocateModifier(); + auto SecondAllocMod = Node->getSecondAllocateModifier(); + bool FirstUnknown = FirstAllocMod == OMPC_ALLOCATE_unknown; + bool SecondUnknown = SecondAllocMod == OMPC_ALLOCATE_unknown; + if (FirstAllocMod == OMPC_ALLOCATE_allocator || + (FirstAllocMod == OMPC_ALLOCATE_unknown && Node->getAllocator())) { + FirstModifier = Node->getAllocator(); + SecondModifier = Node->getAlignment(); + } else { + FirstModifier = Node->getAlignment(); + SecondModifier = Node->getAllocator(); + } + OS << "allocate"; - OpenMPAllocateClauseModifier Modifier = Node->getAllocatorModifier(); - if (Expr *Allocator = Node->getAllocator()) { + // If we have any explicit modifiers. + if (FirstModifier) { OS << "("; - if (Modifier == OMPC_ALLOCATE_allocator) { - OS << getOpenMPSimpleClauseTypeName(Node->getClauseKind(), Modifier); + if (!FirstUnknown) { + OS << getOpenMPSimpleClauseTypeName(Node->getClauseKind(), FirstAllocMod); OS << "("; - Allocator->printPretty(OS, nullptr, Policy, 0); + } + FirstModifier->printPretty(OS, nullptr, Policy, 0); + if (!FirstUnknown) OS << ")"; - } else { - Allocator->printPretty(OS, nullptr, Policy, 0); + if (SecondModifier) { + OS << ", "; + if (!SecondUnknown) { + OS << getOpenMPSimpleClauseTypeName(Node->getClauseKind(), + SecondAllocMod); + OS << "("; + } + SecondModifier->printPretty(OS, nullptr, Policy, 0); + if (!SecondUnknown) + OS << ")"; } OS << ":"; VisitOMPClauseList(Node, ' '); } else { + // No modifiers. Just print the variable list. VisitOMPClauseList(Node, '('); } OS << ")"; diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp index b4e973bc84a7b..89b83938f352d 100644 --- a/clang/lib/Parse/ParseOpenMP.cpp +++ b/clang/lib/Parse/ParseOpenMP.cpp @@ -4530,32 +4530,88 @@ static bool parseStepSize(Parser &P, SemaOpenMP::OpenMPVarListDataTy &Data, } /// Parse 'allocate' clause modifiers. -/// If allocator-modifier exists, return an expression for it and set -/// Data field noting modifier was specified. -/// +/// If allocator-modifier exists, return an expression for it. For both +/// allocator and align modifiers, set Data fields as appropriate. 
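
For context before the parser itself, a minimal usage sketch of the
accepted syntax (mine, not from the patch; it mirrors the kinds of forms
the new allocate_modifiers tests exercise, assuming omp.h and a compile
with -fopenmp -fopenmp-version=52):

    #include <omp.h>

    void sketch(void) {
      int a = 0, b = 0, c = 0;
      // Plain allocator expression, accepted before this patch.
    #pragma omp scope private(a) allocate(omp_default_mem_alloc : a)
      a++;
      // New: a lone align(...) complex modifier; Sema requires the
      // argument to be a constant power of two (see the
      // VerifyPositiveIntegerConstantInClause change below).
    #pragma omp scope private(b) allocate(align(16) : b)
      b++;
      // New: both modifiers, comma separated, in either order; a repeated
      // modifier is rejected with err_omp_duplicate_modifier, e.g.
      // "duplicate modifier 'allocator' in 'allocate' clause".
    #pragma omp scope private(c) allocate(allocator(omp_default_mem_alloc), align(8) : c)
      c++;
    }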
 static ExprResult
 parseOpenMPAllocateClauseModifiers(Parser &P, OpenMPClauseKind Kind,
                                    SemaOpenMP::OpenMPVarListDataTy &Data) {
   const Token &Tok = P.getCurToken();
   Preprocessor &PP = P.getPreprocessor();
   ExprResult Tail;
-  auto Modifier = static_cast<OpenMPAllocateClauseModifier>(
+  ExprResult Val;
+  SourceLocation RLoc;
+  bool AllocatorSeen = false;
+  bool AlignSeen = false;
+  SourceLocation CurrentModifierLoc = Tok.getLocation();
+  auto CurrentModifier = static_cast<OpenMPAllocateClauseModifier>(
       getOpenMPSimpleClauseType(Kind, PP.getSpelling(Tok), P.getLangOpts()));
-  if (Modifier == OMPC_ALLOCATE_allocator) {
-    Data.AllocClauseModifier = Modifier;
+
+  // Modifiers did not exist before 5.1
+  if (P.getLangOpts().OpenMP < 51)
+    return P.ParseAssignmentExpression();
+
+  // An allocator-simple-modifier is exclusive and must appear alone. See
+  // OpenMP 6.0 spec, pg. 313, L1 on Modifiers, as well as Table 5.1, pg. 50,
+  // description of the "exclusive" property. If we don't recognize an
+  // explicit simple-/complex-modifier, assume we're looking at an expression
+  // representing the allocator and consider ourselves done.
+  if (CurrentModifier == OMPC_ALLOCATE_unknown)
+    return P.ParseAssignmentExpression();
+
+  do {
     P.ConsumeToken();
-    BalancedDelimiterTracker AllocateT(P, tok::l_paren,
-                                       tok::annot_pragma_openmp_end);
     if (Tok.is(tok::l_paren)) {
-      AllocateT.consumeOpen();
-      Tail = P.ParseAssignmentExpression();
-      AllocateT.consumeClose();
+      switch (CurrentModifier) {
+      case OMPC_ALLOCATE_allocator: {
+        if (AllocatorSeen) {
+          P.Diag(Tok, diag::err_omp_duplicate_modifier)
+              << getOpenMPSimpleClauseTypeName(OMPC_allocate, CurrentModifier)
+              << getOpenMPClauseName(Kind);
+        } else {
+          Data.AllocClauseModifiers.push_back(CurrentModifier);
+          Data.AllocClauseModifiersLoc.push_back(CurrentModifierLoc);
+        }
+        BalancedDelimiterTracker AllocateT(P, tok::l_paren,
+                                           tok::annot_pragma_openmp_end);
+        AllocateT.consumeOpen();
+        Tail = P.ParseAssignmentExpression();
+        AllocateT.consumeClose();
+        AllocatorSeen = true;
+        break;
+      }
+      case OMPC_ALLOCATE_align: {
+        if (AlignSeen) {
+          P.Diag(Tok, diag::err_omp_duplicate_modifier)
+              << getOpenMPSimpleClauseTypeName(OMPC_allocate, CurrentModifier)
+              << getOpenMPClauseName(Kind);
+        } else {
+          Data.AllocClauseModifiers.push_back(CurrentModifier);
+          Data.AllocClauseModifiersLoc.push_back(CurrentModifierLoc);
+        }
+        Val = P.ParseOpenMPParensExpr(getOpenMPClauseName(Kind), RLoc);
+        if (Val.isUsable())
+          Data.AllocateAlignment = Val.get();
+        AlignSeen = true;
+        break;
+      }
+      default:
+        llvm_unreachable("Unexpected allocate modifier");
+      }
     } else {
       P.Diag(Tok, diag::err_expected) << tok::l_paren;
     }
-  } else {
-    Tail = P.ParseAssignmentExpression();
-  }
+    if (Tok.isNot(tok::comma))
+      break;
+    P.ConsumeToken();
+    CurrentModifierLoc = Tok.getLocation();
+    CurrentModifier = static_cast<OpenMPAllocateClauseModifier>(
+        getOpenMPSimpleClauseType(Kind, PP.getSpelling(Tok), P.getLangOpts()));
+    // A modifier followed by a comma implies another modifier.
+    if (CurrentModifier == OMPC_ALLOCATE_unknown) {
+      P.Diag(Tok, diag::err_omp_expected_modifier) << getOpenMPClauseName(Kind);
+      break;
+    }
+  } while (!AllocatorSeen || !AlignSeen);
   return Tail;
 }
@@ -4832,7 +4888,8 @@ bool Parser::ParseOpenMPVarList(OpenMPDirectiveKind DKind,
   } else if (Kind == OMPC_allocate ||
              (Kind == OMPC_affinity && Tok.is(tok::identifier) &&
               PP.getSpelling(Tok) == "iterator")) {
-    // Handle optional allocator expression followed by colon delimiter.
+    // Handle optional allocator and align modifiers followed by colon
+    // delimiter.
ColonProtectionRAIIObject ColonRAII(*this); TentativeParsingAction TPA(*this); // OpenMP 5.0, 2.10.1, task Construct. @@ -4849,19 +4906,18 @@ bool Parser::ParseOpenMPVarList(OpenMPDirectiveKind DKind, Tail = Actions.CorrectDelayedTyposInExpr(Tail); Tail = Actions.ActOnFinishFullExpr(Tail.get(), T.getOpenLocation(), /*DiscardedValue=*/false); - if (Tail.isUsable()) { + if (Tail.isUsable() || Data.AllocateAlignment) { if (Tok.is(tok::colon)) { - Data.DepModOrTailExpr = Tail.get(); + Data.DepModOrTailExpr = Tail.isUsable() ? Tail.get() : nullptr; Data.ColonLoc = ConsumeToken(); TPA.Commit(); } else { // Colon not found, parse only list of variables. TPA.Revert(); - if (Kind == OMPC_allocate && - Data.AllocClauseModifier == OMPC_ALLOCATE_allocator) { + if (Kind == OMPC_allocate && Data.AllocClauseModifiers.size()) { SkipUntil(tok::r_paren, tok::annot_pragma_openmp_end, StopBeforeMatch); - Diag(Tok, diag::err_modifier_expected_colon) << "allocator"; + Diag(Tok, diag::err_modifier_expected_colon) << "allocate clause"; } } } else { diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index 66ff92f554fc4..b83b2b12f4a23 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -5320,6 +5320,8 @@ static void checkAllocateClauses(Sema &S, DSAStackTy *Stack, Expr *SimpleRefExpr = E; auto Res = getPrivateItem(S, SimpleRefExpr, ELoc, ERange); ValueDecl *VD = Res.first; + if (!VD) + continue; DSAStackTy::DSAVarData Data = Stack->getTopDSA(VD, /*FromParent=*/false); if (!isOpenMPPrivate(Data.CKind)) { S.Diag(E->getExprLoc(), @@ -5330,10 +5332,8 @@ static void checkAllocateClauses(Sema &S, DSAStackTy *Stack, if (checkPreviousOMPAllocateAttribute(S, Stack, E, PrivateVD, AllocatorKind, AC->getAllocator())) continue; - // Placeholder until allocate clause supports align modifier. 
- Expr *Alignment = nullptr; applyOMPAllocateAttribute(S, PrivateVD, AllocatorKind, AC->getAllocator(), - Alignment, E->getSourceRange()); + AC->getAlignment(), E->getSourceRange()); } } } @@ -15617,7 +15617,9 @@ ExprResult SemaOpenMP::VerifyPositiveIntegerConstantInClause( << E->getSourceRange(); return ExprError(); } - if ((CKind == OMPC_aligned || CKind == OMPC_align) && !Result.isPowerOf2()) { + if ((CKind == OMPC_aligned || CKind == OMPC_align || + CKind == OMPC_allocate) && + !Result.isPowerOf2()) { Diag(E->getExprLoc(), diag::warn_omp_alignment_not_power_of_two) << E->getSourceRange(); return ExprError(); @@ -17153,11 +17155,26 @@ OMPClause *SemaOpenMP::ActOnOpenMPVarListClause(OpenMPClauseKind Kind, case OMPC_has_device_addr: Res = ActOnOpenMPHasDeviceAddrClause(VarList, Locs); break; - case OMPC_allocate: - Res = ActOnOpenMPAllocateClause(Data.DepModOrTailExpr, - Data.AllocClauseModifier, VarList, StartLoc, - LParenLoc, ColonLoc, EndLoc); + case OMPC_allocate: { + OpenMPAllocateClauseModifier Modifier1 = OMPC_ALLOCATE_unknown; + OpenMPAllocateClauseModifier Modifier2 = OMPC_ALLOCATE_unknown; + SourceLocation Modifier1Loc, Modifier2Loc; + if (!Data.AllocClauseModifiers.empty()) { + assert(Data.AllocClauseModifiers.size() <= 2 && + "More allocate modifiers than expected"); + Modifier1 = Data.AllocClauseModifiers[0]; + Modifier1Loc = Data.AllocClauseModifiersLoc[0]; + if (Data.AllocClauseModifiers.size() == 2) { + Modifier2 = Data.AllocClauseModifiers[1]; + Modifier2Loc = Data.AllocClauseModifiersLoc[1]; + } + } + Res = ActOnOpenMPAllocateClause( + Data.DepModOrTailExpr, Data.AllocateAlignment, Modifier1, Modifier1Loc, + Modifier2, Modifier2Loc, VarList, StartLoc, LParenLoc, ColonLoc, + EndLoc); break; + } case OMPC_nontemporal: Res = ActOnOpenMPNontemporalClause(VarList, StartLoc, LParenLoc, EndLoc); break; @@ -23163,32 +23180,37 @@ SemaOpenMP::ActOnOpenMPHasDeviceAddrClause(ArrayRef VarList, } OMPClause *SemaOpenMP::ActOnOpenMPAllocateClause( - Expr *Allocator, OpenMPAllocateClauseModifier AllocClauseModifier, - ArrayRef VarList, SourceLocation StartLoc, SourceLocation LParenLoc, - SourceLocation ColonLoc, SourceLocation EndLoc) { - + Expr *Allocator, Expr *Alignment, + OpenMPAllocateClauseModifier FirstAllocateModifier, + SourceLocation FirstAllocateModifierLoc, + OpenMPAllocateClauseModifier SecondAllocateModifier, + SourceLocation SecondAllocateModifierLoc, ArrayRef VarList, + SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation ColonLoc, + SourceLocation EndLoc) { if (Allocator) { // Allocator expression is dependent - skip it for now and build the // allocator when instantiated. - if (Allocator->isTypeDependent() || Allocator->isValueDependent() || - Allocator->isInstantiationDependent() || - Allocator->containsUnexpandedParameterPack()) - return nullptr; - // OpenMP [2.11.4 allocate Clause, Description] - // allocator is an expression of omp_allocator_handle_t type. - if (!findOMPAllocatorHandleT(SemaRef, Allocator->getExprLoc(), DSAStack)) - return nullptr; + bool AllocDependent = + (Allocator->isTypeDependent() || Allocator->isValueDependent() || + Allocator->isInstantiationDependent() || + Allocator->containsUnexpandedParameterPack()); + if (!AllocDependent) { + // OpenMP [2.11.4 allocate Clause, Description] + // allocator is an expression of omp_allocator_handle_t type. 
+ if (!findOMPAllocatorHandleT(SemaRef, Allocator->getExprLoc(), DSAStack)) + return nullptr; - ExprResult AllocatorRes = SemaRef.DefaultLvalueConversion(Allocator); - if (AllocatorRes.isInvalid()) - return nullptr; - AllocatorRes = SemaRef.PerformImplicitConversion( - AllocatorRes.get(), DSAStack->getOMPAllocatorHandleT(), - AssignmentAction::Initializing, - /*AllowExplicit=*/true); - if (AllocatorRes.isInvalid()) - return nullptr; - Allocator = AllocatorRes.get(); + ExprResult AllocatorRes = SemaRef.DefaultLvalueConversion(Allocator); + if (AllocatorRes.isInvalid()) + return nullptr; + AllocatorRes = SemaRef.PerformImplicitConversion( + AllocatorRes.get(), DSAStack->getOMPAllocatorHandleT(), + AssignmentAction::Initializing, + /*AllowExplicit=*/true); + if (AllocatorRes.isInvalid()) + return nullptr; + Allocator = AllocatorRes.isUsable() ? AllocatorRes.get() : nullptr; + } } else { // OpenMP 5.0, 2.11.4 allocate Clause, Restrictions. // allocate clauses that appear on a target construct or on constructs in a @@ -23199,6 +23221,17 @@ OMPClause *SemaOpenMP::ActOnOpenMPAllocateClause( !DSAStack->hasRequiresDeclWithClause()) SemaRef.targetDiag(StartLoc, diag::err_expected_allocator_expression); } + if (Alignment) { + bool AlignmentDependent = Alignment->isTypeDependent() || + Alignment->isValueDependent() || + Alignment->isInstantiationDependent() || + Alignment->containsUnexpandedParameterPack(); + if (!AlignmentDependent) { + ExprResult AlignResult = + VerifyPositiveIntegerConstantInClause(Alignment, OMPC_allocate); + Alignment = AlignResult.isUsable() ? AlignResult.get() : nullptr; + } + } // Analyze and build list of variables. SmallVector Vars; for (Expr *RefExpr : VarList) { @@ -23230,11 +23263,10 @@ OMPClause *SemaOpenMP::ActOnOpenMPAllocateClause( if (Allocator) DSAStack->addInnerAllocatorExpr(Allocator); - OpenMPAllocateClauseModifier AllocatorModifier = AllocClauseModifier; - SourceLocation AllocatorModifierLoc; - return OMPAllocateClause::Create(getASTContext(), StartLoc, LParenLoc, - Allocator, ColonLoc, AllocatorModifier, - AllocatorModifierLoc, EndLoc, Vars); + return OMPAllocateClause::Create( + getASTContext(), StartLoc, LParenLoc, Allocator, Alignment, ColonLoc, + FirstAllocateModifier, FirstAllocateModifierLoc, SecondAllocateModifier, + SecondAllocateModifierLoc, EndLoc, Vars); } OMPClause *SemaOpenMP::ActOnOpenMPNontemporalClause(ArrayRef VarList, diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index 4a3c739ecbeab..4fae2ccb5f6d0 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -2075,15 +2075,18 @@ class TreeTransform { /// /// By default, performs semantic analysis to build the new OpenMP clause. /// Subclasses may override this routine to provide different behavior. 
- OMPClause *RebuildOMPAllocateClause(Expr *Allocate, - OpenMPAllocateClauseModifier ACModifier, - ArrayRef VarList, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation ColonLoc, - SourceLocation EndLoc) { + OMPClause * + RebuildOMPAllocateClause(Expr *Allocate, Expr *Alignment, + OpenMPAllocateClauseModifier FirstModifier, + SourceLocation FirstModifierLoc, + OpenMPAllocateClauseModifier SecondModifier, + SourceLocation SecondModifierLoc, + ArrayRef VarList, SourceLocation StartLoc, + SourceLocation LParenLoc, SourceLocation ColonLoc, + SourceLocation EndLoc) { return getSema().OpenMP().ActOnOpenMPAllocateClause( - Allocate, ACModifier, VarList, StartLoc, LParenLoc, ColonLoc, EndLoc); + Allocate, Alignment, FirstModifier, FirstModifierLoc, SecondModifier, + SecondModifierLoc, VarList, StartLoc, LParenLoc, ColonLoc, EndLoc); } /// Build a new OpenMP 'num_teams' clause. @@ -11224,6 +11227,13 @@ TreeTransform::TransformOMPAllocateClause(OMPAllocateClause *C) { return nullptr; Allocator = AllocatorRes.get(); } + Expr *Alignment = C->getAlignment(); + if (Alignment) { + ExprResult AlignmentRes = getDerived().TransformExpr(Alignment); + if (AlignmentRes.isInvalid()) + return nullptr; + Alignment = AlignmentRes.get(); + } llvm::SmallVector Vars; Vars.reserve(C->varlist_size()); for (auto *VE : C->varlist()) { @@ -11233,7 +11243,9 @@ TreeTransform::TransformOMPAllocateClause(OMPAllocateClause *C) { Vars.push_back(EVar.get()); } return getDerived().RebuildOMPAllocateClause( - Allocator, C->getAllocatorModifier(), Vars, C->getBeginLoc(), + Allocator, Alignment, C->getFirstAllocateModifier(), + C->getFirstAllocateModifierLoc(), C->getSecondAllocateModifier(), + C->getSecondAllocateModifierLoc(), Vars, C->getBeginLoc(), C->getLParenLoc(), C->getColonLoc(), C->getEndLoc()); } diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index b53f99732cacc..7361cace49dd7 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -11824,10 +11824,12 @@ void OMPClauseReader::VisitOMPMapClause(OMPMapClause *C) { } void OMPClauseReader::VisitOMPAllocateClause(OMPAllocateClause *C) { - C->setAllocatorModifier(Record.readEnum()); + C->setFirstAllocateModifier(Record.readEnum()); + C->setSecondAllocateModifier(Record.readEnum()); C->setLParenLoc(Record.readSourceLocation()); C->setColonLoc(Record.readSourceLocation()); C->setAllocator(Record.readSubExpr()); + C->setAlignment(Record.readSubExpr()); unsigned NumVars = C->varlist_size(); SmallVector Vars; Vars.reserve(NumVars); diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index 39004fd4d4c37..345d496a93312 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -7924,10 +7924,12 @@ void OMPClauseWriter::VisitOMPMapClause(OMPMapClause *C) { void OMPClauseWriter::VisitOMPAllocateClause(OMPAllocateClause *C) { Record.push_back(C->varlist_size()); - Record.writeEnum(C->getAllocatorModifier()); + Record.writeEnum(C->getFirstAllocateModifier()); + Record.writeEnum(C->getSecondAllocateModifier()); Record.AddSourceLocation(C->getLParenLoc()); Record.AddSourceLocation(C->getColonLoc()); Record.AddStmt(C->getAllocator()); + Record.AddStmt(C->getAlignment()); for (auto *VE : C->varlist()) Record.AddStmt(VE); } diff --git a/clang/test/OpenMP/allocate_allocator_modifier_codegen.cpp b/clang/test/OpenMP/allocate_allocator_modifier_codegen.cpp deleted file mode 100644 index 1bf927ebb2eb7..0000000000000 
--- a/clang/test/OpenMP/allocate_allocator_modifier_codegen.cpp +++ /dev/null @@ -1,255 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ --version 5 -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=52 -triple x86_64-unknown-linux-gnu -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=52 -triple x86_64-unknown-linux-gnu -x c++ -std=c++11 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=52 -triple x86_64-unknown-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=52 -triple x86_64-unknown-linux-gnu -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=52 -fnoopenmp-use-tls -triple x86_64-unknown-linux-gnu -x c++ -std=c++11 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=52 -fnoopenmp-use-tls -triple x86_64-unknown-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix CHECK-TLS %s - -// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=52 -triple x86_64-unknown-linux-gnu -emit-llvm -o - %s | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=52 -triple x86_64-unknown-linux-gnu -x c++ -std=c++11 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=52 -triple x86_64-unknown-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=52 -triple x86_64-unknown-linux-gnu -emit-llvm -o - %s | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=52 -fnoopenmp-use-tls -triple x86_64-unknown-linux-gnu -x c++ -std=c++11 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=52 -fnoopenmp-use-tls -triple x86_64-unknown-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s -// expected-no-diagnostics - -#ifndef HEADER -#define HEADER - -enum omp_allocator_handle_t { - omp_null_allocator = 0, - omp_default_mem_alloc = 1, - omp_large_cap_mem_alloc = 2, - omp_const_mem_alloc = 3, - omp_high_bw_mem_alloc = 4, - omp_low_lat_mem_alloc = 5, - omp_cgroup_mem_alloc = 6, - omp_pteam_mem_alloc = 7, - omp_thread_mem_alloc = 8, - KMP_ALLOCATOR_MAX_HANDLE = __UINTPTR_MAX__ -}; - -template -struct ST { - static T m; -}; - -template T foo() { - T v; - #pragma omp scope private(v) allocate(allocator(TY):v) - v = ST::m; - return v; -} - -namespace ns { -int a; -} - -int main() { - static int a; - static int temp; - #pragma omp scope private(ns::a) allocate(allocator(omp_pteam_mem_alloc):ns::a) - ns::a++; - - #pragma omp scope private(a) allocate(allocator(omp_thread_mem_alloc):a) - a = 2; - double b = 3; - #pragma omp scope private(temp) allocate(temp) - temp += foo(); - return temp+ns::a; -} - -extern template int ST::m; - -int b; - -void bar(int a, float &z) { - #pragma omp scope private(a,z) allocate(allocator(omp_default_mem_alloc):a,z) - a += b; -} -#endif -// CHECK-LABEL: define dso_local noundef i32 @main( -// CHECK-SAME: ) #[[ATTR0:[0-9]+]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 -// CHECK-NEXT: [[B:%.*]] = alloca double, align 8 -// CHECK-NEXT: [[TMP0:%.*]] = call i32 
@__kmpc_global_thread_num(ptr @[[GLOB1:[0-9]+]]) -// CHECK-NEXT: store i32 0, ptr [[RETVAL]], align 4 -// CHECK-NEXT: [[DOTA__VOID_ADDR:%.*]] = call ptr @__kmpc_alloc(i32 [[TMP0]], i64 4, ptr inttoptr (i64 7 to ptr)) -// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTA__VOID_ADDR]], align 4 -// CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 -// CHECK-NEXT: store i32 [[INC]], ptr [[DOTA__VOID_ADDR]], align 4 -// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTA__VOID_ADDR]], ptr inttoptr (i64 7 to ptr)) -// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2:[0-9]+]], i32 [[TMP0]]) -// CHECK-NEXT: [[DOTA__VOID_ADDR1:%.*]] = call ptr @__kmpc_alloc(i32 [[TMP0]], i64 4, ptr inttoptr (i64 8 to ptr)) -// CHECK-NEXT: store i32 2, ptr [[DOTA__VOID_ADDR1]], align 4 -// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTA__VOID_ADDR1]], ptr inttoptr (i64 8 to ptr)) -// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) -// CHECK-NEXT: store double 3.000000e+00, ptr [[B]], align 8 -// CHECK-NEXT: [[DOTTEMP__VOID_ADDR:%.*]] = call ptr @__kmpc_alloc(i32 [[TMP0]], i64 4, ptr null) -// CHECK-NEXT: [[CALL:%.*]] = call noundef i32 @_Z3fooIiL22omp_allocator_handle_t6EET_v() -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTTEMP__VOID_ADDR]], align 4 -// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], [[CALL]] -// CHECK-NEXT: store i32 [[ADD]], ptr [[DOTTEMP__VOID_ADDR]], align 4 -// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTTEMP__VOID_ADDR]], ptr null) -// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr @_ZZ4mainE4temp, align 4 -// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr @_ZN2ns1aE, align 4 -// CHECK-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP3]], [[TMP4]] -// CHECK-NEXT: ret i32 [[ADD2]] -// -// -// CHECK-LABEL: define linkonce_odr noundef i32 @_Z3fooIiL22omp_allocator_handle_t6EET_v( -// CHECK-SAME: ) #[[ATTR3:[0-9]+]] comdat { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[V:%.*]] = alloca i32, align 4 -// CHECK-NEXT: [[V1:%.*]] = alloca i32, align 4 -// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) -// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @_ZN2STIiE1mE, align 4 -// CHECK-NEXT: store i32 [[TMP1]], ptr [[V1]], align 4 -// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[V]], align 4 -// CHECK-NEXT: ret i32 [[TMP2]] -// -// -// CHECK-LABEL: define dso_local void @_Z3bariRf( -// CHECK-SAME: i32 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[Z:%.*]]) #[[ATTR3]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 -// CHECK-NEXT: [[Z_ADDR:%.*]] = alloca ptr, align 8 -// CHECK-NEXT: [[TMP:%.*]] = alloca ptr, align 8 -// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) -// CHECK-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 -// CHECK-NEXT: store ptr [[Z]], ptr [[Z_ADDR]], align 8 -// CHECK-NEXT: [[DOTA__VOID_ADDR:%.*]] = call ptr @__kmpc_alloc(i32 [[TMP0]], i64 4, ptr inttoptr (i64 1 to ptr)) -// CHECK-NEXT: [[DOTZ__VOID_ADDR:%.*]] = call ptr @__kmpc_alloc(i32 [[TMP0]], i64 4, ptr inttoptr (i64 1 to ptr)) -// CHECK-NEXT: store ptr [[DOTZ__VOID_ADDR]], ptr [[TMP]], align 8 -// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @b, align 4 -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTA__VOID_ADDR]], align 4 -// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], [[TMP1]] -// CHECK-NEXT: store i32 [[ADD]], ptr [[DOTA__VOID_ADDR]], 
align 4 -// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTZ__VOID_ADDR]], ptr inttoptr (i64 1 to ptr)) -// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTA__VOID_ADDR]], ptr inttoptr (i64 1 to ptr)) -// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) -// CHECK-NEXT: ret void -// -// -// CHECK-TLS-LABEL: define dso_local noundef i32 @main( -// CHECK-TLS-SAME: ) #[[ATTR0:[0-9]+]] { -// CHECK-TLS-NEXT: [[ENTRY:.*:]] -// CHECK-TLS-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 -// CHECK-TLS-NEXT: [[B:%.*]] = alloca double, align 8 -// CHECK-TLS-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1:[0-9]+]]) -// CHECK-TLS-NEXT: store i32 0, ptr [[RETVAL]], align 4 -// CHECK-TLS-NEXT: [[DOTA__VOID_ADDR:%.*]] = call ptr @__kmpc_alloc(i32 [[TMP0]], i64 4, ptr inttoptr (i64 7 to ptr)) -// CHECK-TLS-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTA__VOID_ADDR]], align 4 -// CHECK-TLS-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 -// CHECK-TLS-NEXT: store i32 [[INC]], ptr [[DOTA__VOID_ADDR]], align 4 -// CHECK-TLS-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTA__VOID_ADDR]], ptr inttoptr (i64 7 to ptr)) -// CHECK-TLS-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2:[0-9]+]], i32 [[TMP0]]) -// CHECK-TLS-NEXT: [[DOTA__VOID_ADDR1:%.*]] = call ptr @__kmpc_alloc(i32 [[TMP0]], i64 4, ptr inttoptr (i64 8 to ptr)) -// CHECK-TLS-NEXT: store i32 2, ptr [[DOTA__VOID_ADDR1]], align 4 -// CHECK-TLS-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTA__VOID_ADDR1]], ptr inttoptr (i64 8 to ptr)) -// CHECK-TLS-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) -// CHECK-TLS-NEXT: store double 3.000000e+00, ptr [[B]], align 8 -// CHECK-TLS-NEXT: [[DOTTEMP__VOID_ADDR:%.*]] = call ptr @__kmpc_alloc(i32 [[TMP0]], i64 4, ptr null) -// CHECK-TLS-NEXT: [[CALL:%.*]] = call noundef i32 @_Z3fooIiL22omp_allocator_handle_t6EET_v() -// CHECK-TLS-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTTEMP__VOID_ADDR]], align 4 -// CHECK-TLS-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], [[CALL]] -// CHECK-TLS-NEXT: store i32 [[ADD]], ptr [[DOTTEMP__VOID_ADDR]], align 4 -// CHECK-TLS-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTTEMP__VOID_ADDR]], ptr null) -// CHECK-TLS-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) -// CHECK-TLS-NEXT: [[TMP3:%.*]] = load i32, ptr @_ZZ4mainE4temp, align 4 -// CHECK-TLS-NEXT: [[TMP4:%.*]] = load i32, ptr @_ZN2ns1aE, align 4 -// CHECK-TLS-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP3]], [[TMP4]] -// CHECK-TLS-NEXT: ret i32 [[ADD2]] -// -// -// CHECK-TLS-LABEL: define linkonce_odr noundef i32 @_Z3fooIiL22omp_allocator_handle_t6EET_v( -// CHECK-TLS-SAME: ) #[[ATTR3:[0-9]+]] comdat { -// CHECK-TLS-NEXT: [[ENTRY:.*:]] -// CHECK-TLS-NEXT: [[V:%.*]] = alloca i32, align 4 -// CHECK-TLS-NEXT: [[V1:%.*]] = alloca i32, align 4 -// CHECK-TLS-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) -// CHECK-TLS-NEXT: [[TMP1:%.*]] = load i32, ptr @_ZN2STIiE1mE, align 4 -// CHECK-TLS-NEXT: store i32 [[TMP1]], ptr [[V1]], align 4 -// CHECK-TLS-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) -// CHECK-TLS-NEXT: [[TMP2:%.*]] = load i32, ptr [[V]], align 4 -// CHECK-TLS-NEXT: ret i32 [[TMP2]] -// -// -// CHECK-TLS-LABEL: define dso_local void @_Z3bariRf( -// CHECK-TLS-SAME: i32 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[Z:%.*]]) #[[ATTR3]] { -// CHECK-TLS-NEXT: [[ENTRY:.*:]] -// CHECK-TLS-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 -// CHECK-TLS-NEXT: [[Z_ADDR:%.*]] = alloca ptr, align 8 -// CHECK-TLS-NEXT: 
[[TMP:%.*]] = alloca ptr, align 8 -// CHECK-TLS-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) -// CHECK-TLS-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 -// CHECK-TLS-NEXT: store ptr [[Z]], ptr [[Z_ADDR]], align 8 -// CHECK-TLS-NEXT: [[DOTA__VOID_ADDR:%.*]] = call ptr @__kmpc_alloc(i32 [[TMP0]], i64 4, ptr inttoptr (i64 1 to ptr)) -// CHECK-TLS-NEXT: [[DOTZ__VOID_ADDR:%.*]] = call ptr @__kmpc_alloc(i32 [[TMP0]], i64 4, ptr inttoptr (i64 1 to ptr)) -// CHECK-TLS-NEXT: store ptr [[DOTZ__VOID_ADDR]], ptr [[TMP]], align 8 -// CHECK-TLS-NEXT: [[TMP1:%.*]] = load i32, ptr @b, align 4 -// CHECK-TLS-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTA__VOID_ADDR]], align 4 -// CHECK-TLS-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], [[TMP1]] -// CHECK-TLS-NEXT: store i32 [[ADD]], ptr [[DOTA__VOID_ADDR]], align 4 -// CHECK-TLS-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTZ__VOID_ADDR]], ptr inttoptr (i64 1 to ptr)) -// CHECK-TLS-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTA__VOID_ADDR]], ptr inttoptr (i64 1 to ptr)) -// CHECK-TLS-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) -// CHECK-TLS-NEXT: ret void -// -// -// SIMD-ONLY0-LABEL: define dso_local noundef i32 @main( -// SIMD-ONLY0-SAME: ) #[[ATTR0:[0-9]+]] { -// SIMD-ONLY0-NEXT: [[ENTRY:.*:]] -// SIMD-ONLY0-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 -// SIMD-ONLY0-NEXT: [[A:%.*]] = alloca i32, align 4 -// SIMD-ONLY0-NEXT: [[A1:%.*]] = alloca i32, align 4 -// SIMD-ONLY0-NEXT: [[B:%.*]] = alloca double, align 8 -// SIMD-ONLY0-NEXT: [[TEMP:%.*]] = alloca i32, align 4 -// SIMD-ONLY0-NEXT: store i32 0, ptr [[RETVAL]], align 4 -// SIMD-ONLY0-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4 -// SIMD-ONLY0-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 -// SIMD-ONLY0-NEXT: store i32 [[INC]], ptr [[A]], align 4 -// SIMD-ONLY0-NEXT: store i32 2, ptr [[A1]], align 4 -// SIMD-ONLY0-NEXT: store double 3.000000e+00, ptr [[B]], align 8 -// SIMD-ONLY0-NEXT: [[CALL:%.*]] = call noundef i32 @_Z3fooIiL22omp_allocator_handle_t6EET_v() -// SIMD-ONLY0-NEXT: [[TMP1:%.*]] = load i32, ptr [[TEMP]], align 4 -// SIMD-ONLY0-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[CALL]] -// SIMD-ONLY0-NEXT: store i32 [[ADD]], ptr [[TEMP]], align 4 -// SIMD-ONLY0-NEXT: [[TMP2:%.*]] = load i32, ptr @_ZZ4mainE4temp, align 4 -// SIMD-ONLY0-NEXT: [[TMP3:%.*]] = load i32, ptr @_ZN2ns1aE, align 4 -// SIMD-ONLY0-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP2]], [[TMP3]] -// SIMD-ONLY0-NEXT: ret i32 [[ADD2]] -// -// -// SIMD-ONLY0-LABEL: define linkonce_odr noundef i32 @_Z3fooIiL22omp_allocator_handle_t6EET_v( -// SIMD-ONLY0-SAME: ) #[[ATTR1:[0-9]+]] comdat { -// SIMD-ONLY0-NEXT: [[ENTRY:.*:]] -// SIMD-ONLY0-NEXT: [[V:%.*]] = alloca i32, align 4 -// SIMD-ONLY0-NEXT: [[V1:%.*]] = alloca i32, align 4 -// SIMD-ONLY0-NEXT: [[TMP0:%.*]] = load i32, ptr @_ZN2STIiE1mE, align 4 -// SIMD-ONLY0-NEXT: store i32 [[TMP0]], ptr [[V1]], align 4 -// SIMD-ONLY0-NEXT: [[TMP1:%.*]] = load i32, ptr [[V]], align 4 -// SIMD-ONLY0-NEXT: ret i32 [[TMP1]] -// -// -// SIMD-ONLY0-LABEL: define dso_local void @_Z3bariRf( -// SIMD-ONLY0-SAME: i32 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[Z:%.*]]) #[[ATTR1]] { -// SIMD-ONLY0-NEXT: [[ENTRY:.*:]] -// SIMD-ONLY0-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 -// SIMD-ONLY0-NEXT: [[Z_ADDR:%.*]] = alloca ptr, align 8 -// SIMD-ONLY0-NEXT: [[A1:%.*]] = alloca i32, align 4 -// SIMD-ONLY0-NEXT: [[Z2:%.*]] = alloca float, align 4 -// SIMD-ONLY0-NEXT: [[TMP:%.*]] = alloca ptr, align 8 -// SIMD-ONLY0-NEXT: store i32 [[A]], ptr 
[[A_ADDR]], align 4 -// SIMD-ONLY0-NEXT: store ptr [[Z]], ptr [[Z_ADDR]], align 8 -// SIMD-ONLY0-NEXT: store ptr [[Z2]], ptr [[TMP]], align 8 -// SIMD-ONLY0-NEXT: [[TMP0:%.*]] = load i32, ptr @b, align 4 -// SIMD-ONLY0-NEXT: [[TMP1:%.*]] = load i32, ptr [[A1]], align 4 -// SIMD-ONLY0-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]] -// SIMD-ONLY0-NEXT: store i32 [[ADD]], ptr [[A1]], align 4 -// SIMD-ONLY0-NEXT: ret void -// diff --git a/clang/test/OpenMP/allocate_allocator_modifier_messages.cpp b/clang/test/OpenMP/allocate_allocator_modifier_messages.cpp deleted file mode 100644 index 160c4996c1219..0000000000000 --- a/clang/test/OpenMP/allocate_allocator_modifier_messages.cpp +++ /dev/null @@ -1,97 +0,0 @@ -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=52 %s - -typedef enum omp_allocator_handle_t { - omp_null_allocator = 0, - omp_default_mem_alloc = 1, - omp_large_cap_mem_alloc = 2, - omp_const_mem_alloc = 3, - omp_high_bw_mem_alloc = 4, - omp_low_lat_mem_alloc = 5, - omp_cgroup_mem_alloc = 6, - omp_pteam_mem_alloc = 7, - omp_thread_mem_alloc = 8, -} omp_allocator_handle_t; - -int myAlloc() { - return 100; -} - -int main() { - int a, b, c; - // expected-error@+4 {{expected '('}} - // expected-error@+3 {{expected expression}} - // expected-error@+2 {{expected ')'}} - // expected-note@+1 {{to match this '('}} - #pragma omp scope private(c) allocate(allocator - // expected-error@+6 {{expected expression}} - // expected-error@+5 {{expected ')'}} - // expected-note@+4 {{to match this '('}} - // expected-error@+3 {{expected expression}} - // expected-error@+2 {{expected ')'}} - // expected-note@+1 {{to match this '('}} - #pragma omp scope private(c) allocate(allocator( - // expected-error@+4 {{expected expression}} - // expected-error@+3 {{expected expression}} - // expected-error@+2 {{expected ')'}} - // expected-note@+1 {{to match this '('}} - #pragma omp scope private(c) allocate(allocator() - // expected-error@+2 {{expected expression}} - // expected-error@+1 {{expected expression}} - #pragma omp scope private(c) allocate(allocator()) - // expected-error@+6 {{expected ')'}} - // expected-note@+5 {{to match this '('}} - // expected-error@+4 {{missing ':' after allocator modifier}} - // expected-error@+3 {{expected expression}} - // expected-error@+2 {{expected ')'}} - // expected-note@+1 {{to match this '('}} - #pragma omp scope private(c) allocate(allocator(omp_default_mem_alloc - // expected-error@+6 {{missing ':' after allocator modifier}} - // expected-error@+5 {{expected expression}} - // expected-error@+4 {{expected ')'}} - // expected-note@+3 {{to match this '('}} - // expected-error@+2 {{expected ')'}} - // expected-note@+1 {{to match this '('}} - #pragma omp scope private(c) allocate(allocator(omp_large_cap_mem_alloc: - // expected-error@+4 {{missing ':' after allocator modifier}} - // expected-error@+3 {{expected expression}} - // expected-error@+2 {{expected ')'}} - // expected-note@+1 {{to match this '('}} - #pragma omp scope private(c) allocate(allocator(omp_const_mem_alloc) - // expected-error@+2 {{missing ':' after allocator modifier}} - // expected-error@+1 {{expected expression}} - #pragma omp scope private(c) allocate(allocator(omp_high_bw_mem_alloc)) - // expected-error@+1 {{expected expression}} - #pragma omp scope private(c) allocate(allocator(omp_low_lat_mem_alloc):) - // expected-error@+6 {{expected ')'}} - // expected-note@+5 {{to match this '('}} - // expected-error@+4 {{missing ':' after allocator modifier}} - // expected-error@+3 {{expected expression}} - 
// expected-error@+2 {{expected ')'}} - // expected-note@+1 {{to match this '('}} - #pragma omp scope private(c) allocate(allocator(omp_cgroup_mem_alloc:) - // expected-error@+4 {{expected ')'}} - // expected-note@+3 {{to match this '('}} - // expected-error@+2 {{missing ':' after allocator modifier}} - // expected-error@+1 {{expected expression}} - #pragma omp scope private(c) allocate(allocator(omp_pteam_mem_alloc:)) - // expected-error@+4 {{expected ')'}} - // expected-note@+3 {{to match this '('}} - // expected-error@+2 {{missing ':' after allocator modifier}} - // expected-error@+1 {{expected expression}} - #pragma omp scope private(c) allocate(allocator(omp_thread_mem_alloc:c)) - // expected-error@+1 {{expected variable name}} - #pragma omp scope private(c) allocate(allocator(omp_const_mem_alloc):1) - // expected-error@+1 {{expected variable name}} - #pragma omp scope private(c) allocate(allocator(omp_const_mem_alloc):-10) - // expected-error@+4 {{expected ',' or ')' in 'allocate' clause}} - // expected-error@+3 {{expected ')'}} - // expected-warning@+2 {{extra tokens at the end of '#pragma omp scope' are ignored}} - // expected-note@+1 {{to match this '('}} - #pragma omp scope private(a,b,c) allocate(allocator(omp_const_mem_alloc):c:b;a) - // expected-error@+1 {{initializing 'const omp_allocator_handle_t' with an expression of incompatible type 'int'}} - #pragma omp scope private(c,a,b) allocate(allocator(myAlloc()):a,b,c) - // expected-error@+2 {{missing ':' after allocator modifier}} - // expected-error@+1 {{expected expression}} - #pragma omp scope private(c) allocate(allocator(omp_default_mem_alloc);c) - ++a; -} diff --git a/clang/test/OpenMP/allocate_allocator_modifier_ast_print.cpp b/clang/test/OpenMP/allocate_modifiers_ast_print.cpp similarity index 51% rename from clang/test/OpenMP/allocate_allocator_modifier_ast_print.cpp rename to clang/test/OpenMP/allocate_modifiers_ast_print.cpp index 15f3f1dd9bbb9..436647be75da3 100644 --- a/clang/test/OpenMP/allocate_allocator_modifier_ast_print.cpp +++ b/clang/test/OpenMP/allocate_modifiers_ast_print.cpp @@ -41,6 +41,11 @@ int main() { #pragma omp scope private(c,a,b) allocate(allocator(myAlloc()):a,b,c) c++; #pragma omp scope private(c,a,b,d) allocate(myAlloc():a,b,c,d) + a++; + #pragma omp scope private(a,b) allocate(align(2), allocator(omp_const_mem_alloc):a,b) + b++; + #pragma omp scope private(c,a,b) allocate(allocator(myAlloc()), align(8) :a,b,c) + c++; // DUMP: FunctionDecl {{.*}} // DUMP: DeclRefExpr {{.*}}'omp_allocator_handle_t' EnumConstant {{.*}}'omp_large_cap_mem_alloc' 'omp_allocator_handle_t' // DUMP: FunctionDecl {{.*}} @@ -76,11 +81,81 @@ int main() { // DUMP: DeclRefExpr {{.*}}'int' lvalue Var {{.*}}'b' 'int' // DUMP: DeclRefExpr {{.*}}'int' lvalue Var {{.*}}'c' 'int' // DUMP: DeclRefExpr {{.*}}'int' lvalue Var {{.*}}'d' 'int' +// DUMP: OMPScopeDirective {{.*}} +// DUMP: OMPPrivateClause {{.*}} +// DUMP: DeclRefExpr {{.*}}'int' lvalue Var {{.*}}'a' 'int' +// DUMP: DeclRefExpr {{.*}}'int' lvalue Var {{.*}}'b' 'int' +// DUMP: OMPAllocateClause {{.*}} +// DUMP: DeclRefExpr {{.*}}'int' lvalue Var {{.*}}'a' 'int' +// DUMP: DeclRefExpr {{.*}}'int' lvalue Var {{.*}}'b' 'int' +// DUMP: OMPScopeDirective {{.*}} +// DUMP: OMPPrivateClause {{.*}} +// DUMP: DeclRefExpr {{.*}}'int' lvalue Var {{.*}}'c' 'int' +// DUMP: DeclRefExpr {{.*}}'int' lvalue Var {{.*}}'a' 'int' +// DUMP: DeclRefExpr {{.*}}'int' lvalue Var {{.*}}'b' 'int' +// DUMP: OMPAllocateClause {{.*}} +// DUMP: DeclRefExpr {{.*}}'int' lvalue Var {{.*}}'a' 'int' +// 
DUMP: DeclRefExpr {{.*}}'int' lvalue Var {{.*}}'b' 'int' +// DUMP: DeclRefExpr {{.*}}'int' lvalue Var {{.*}}'c' 'int' // PRINT: #pragma omp scope private(a) allocate(omp_const_mem_alloc: a) // PRINT: #pragma omp scope private(a,b) allocate(allocator(omp_const_mem_alloc): a,b) // PRINT: #pragma omp scope private(c,a,b) allocate(allocator(myAlloc()): a,b,c) // PRINT: #pragma omp scope private(c,a,b,d) allocate(myAlloc(): a,b,c,d) - d++; +// PRINT: #pragma omp scope private(a,b) allocate(align(2), allocator(omp_const_mem_alloc): a,b) +// PRINT: #pragma omp scope private(c,a,b) allocate(allocator(myAlloc()), align(8): a,b,c) return a+b+c+d; } + +template +void templated_func(T n) { + int a, b; + T mem = n; + #pragma omp scope private(mem,a,b) allocate(allocator(n),align(al):mem,a,b) + a += b; + #pragma omp scope allocate(allocator(n),align(al):mem,a,b) private(mem,a,b) + a += b; +} + +void template_inst(int n) { + templated_func(omp_const_mem_alloc); + return; +} +// DUMP: FunctionTemplateDecl{{.*}}templated_func +// DUMP: FunctionDecl{{.*}}templated_func 'void (T)' +// DUMP: OMPScopeDirective +// DUMP: OMPPrivateClause +// DUMP: OMPAllocateClause +// DUMP: DeclRefExpr{{.*}}'T' lvalue Var{{.*}}'mem' 'T' +// DUMP: DeclRefExpr{{.*}}'int' lvalue Var{{.*}}'a' 'int' +// DUMP: DeclRefExpr{{.*}}'int' lvalue Var{{.*}}'b' 'int' +// DUMP: OMPScopeDirective +// DUMP: OMPAllocateClause +// DUMP: DeclRefExpr{{.*}}'T' lvalue Var{{.*}}'mem' 'T' +// DUMP: DeclRefExpr{{.*}}'int' lvalue Var{{.*}}'a' 'int' +// DUMP: DeclRefExpr{{.*}}'int' lvalue Var{{.*}}'b' 'int' +// DUMP: OMPPrivateClause + +// DUMP: FunctionDecl{{.*}}used templated_func 'void (omp_allocator_handle_t)' implicit_instantiation +// DUMP: TemplateArgument type 'omp_allocator_handle_t' +// DUMP: EnumType{{.*}}'omp_allocator_handle_t' +// DUMP: Enum{{.*}}'omp_allocator_handle_t' +// DUMP: TemplateArgument integral '4U' + +// DUMP: OMPScopeDirective +// DUMP: OMPPrivateClause +// DUMP: OMPAllocateClause +// DUMP: DeclRefExpr{{.*}}'omp_allocator_handle_t' lvalue Var{{.*}}'mem' 'omp_allocator_handle_t' +// DUMP: DeclRefExpr{{.*}}'int' lvalue Var{{.*}}'a' 'int' +// DUMP: DeclRefExpr{{.*}}'int' lvalue Var{{.*}}'b' 'int' +// DUMP: OMPScopeDirective +// DUMP: OMPAllocateClause +// DUMP: DeclRefExpr{{.*}}'omp_allocator_handle_t' lvalue Var{{.*}}'mem' 'omp_allocator_handle_t' +// DUMP: DeclRefExpr{{.*}}'int' lvalue Var{{.*}}'a' 'int' +// DUMP: DeclRefExpr{{.*}}'int' lvalue Var{{.*}}'b' 'int' +// DUMP: OMPPrivateClause +// PRINT: #pragma omp scope private(mem,a,b) allocate(allocator(n), align(al): mem,a,b) +// PRINT: #pragma omp scope allocate(allocator(n), align(al): mem,a,b) private(mem,a,b) +// PRINT: #pragma omp scope private(mem,a,b) allocate(allocator(n), align(4U): mem,a,b) +// PRINT: #pragma omp scope allocate(allocator(n), align(4U): mem,a,b) private(mem,a,b) + #endif diff --git a/clang/test/OpenMP/allocate_modifiers_codegen.cpp b/clang/test/OpenMP/allocate_modifiers_codegen.cpp new file mode 100644 index 0000000000000..d798e9b3435f0 --- /dev/null +++ b/clang/test/OpenMP/allocate_modifiers_codegen.cpp @@ -0,0 +1,409 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ --version 5 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=52 -triple x86_64-unknown-linux-gnu -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=52 
-triple x86_64-unknown-linux-gnu -x c++ -std=c++11 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=52 -triple x86_64-unknown-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=52 -triple x86_64-unknown-linux-gnu -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=52 -fnoopenmp-use-tls -triple x86_64-unknown-linux-gnu -x c++ -std=c++11 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=52 -fnoopenmp-use-tls -triple x86_64-unknown-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix CHECK-TLS %s + +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=52 -triple x86_64-unknown-linux-gnu -emit-llvm -o - %s | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=52 -triple x86_64-unknown-linux-gnu -x c++ -std=c++11 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=52 -triple x86_64-unknown-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=52 -triple x86_64-unknown-linux-gnu -emit-llvm -o - %s | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=52 -fnoopenmp-use-tls -triple x86_64-unknown-linux-gnu -x c++ -std=c++11 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=52 -fnoopenmp-use-tls -triple x86_64-unknown-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// expected-no-diagnostics + +#ifndef HEADER +#define HEADER + +enum omp_allocator_handle_t { + omp_null_allocator = 0, + omp_default_mem_alloc = 1, + omp_large_cap_mem_alloc = 2, + omp_const_mem_alloc = 3, + omp_high_bw_mem_alloc = 4, + omp_low_lat_mem_alloc = 5, + omp_cgroup_mem_alloc = 6, + omp_pteam_mem_alloc = 7, + omp_thread_mem_alloc = 8, + KMP_ALLOCATOR_MAX_HANDLE = __UINTPTR_MAX__ +}; + +template +struct ST { + static T m; +}; + +template T foo() { + T v; + #pragma omp scope private(v) allocate(allocator(TY):v) + v = ST::m; + #pragma omp scope private(v) allocate(align(al), allocator(TY):v) + ++v; + return v; +} + +namespace ns { +int a; +} + +omp_allocator_handle_t foo(); + +int main() { + static int a; + static int temp; + int v; + #pragma omp scope private(ns::a) allocate(allocator(omp_pteam_mem_alloc):ns::a) + ns::a++; + #pragma omp scope private(a) allocate(align(8),allocator(omp_thread_mem_alloc):a) + a = 2; + #pragma omp scope private(v) allocate(align(1) : v) + ++v; + #pragma omp scope private(v) allocate(allocator(omp_default_mem_alloc) : v) + ++v; + #pragma omp scope private(v) allocate(allocator(omp_large_cap_mem_alloc), align(8) : v) + ++v; + #pragma omp scope private(v) allocate(align(4) : v) + ++v; + #pragma omp scope private(v) allocate(align(2), allocator(omp_default_mem_alloc) : v) + ++v; + #pragma omp scope private(v) allocate(align(8), allocator(foo()) : v) + ++v; + + double b = 3; + #pragma omp scope private(temp) allocate(temp) + temp += foo(); + return temp+ns::a; +} + +extern template int ST::m; + +const int b = 8; + +void bar(int a, float &z) { + #pragma omp scope private(a,z) allocate(align(b), allocator(omp_default_mem_alloc) : a,z) + a += b + z; +} +#endif +// CHECK-LABEL: define dso_local noundef i32 @main( +// CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 +// CHECK-NEXT: 
[[V:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[B:%.*]] = alloca double, align 8 +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1:[0-9]+]]) +// CHECK-NEXT: store i32 0, ptr [[RETVAL]], align 4 +// CHECK-NEXT: [[DOTA__VOID_ADDR:%.*]] = call ptr @__kmpc_alloc(i32 [[TMP0]], i64 4, ptr inttoptr (i64 7 to ptr)) +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTA__VOID_ADDR]], align 4 +// CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 +// CHECK-NEXT: store i32 [[INC]], ptr [[DOTA__VOID_ADDR]], align 4 +// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTA__VOID_ADDR]], ptr inttoptr (i64 7 to ptr)) +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2:[0-9]+]], i32 [[TMP0]]) +// CHECK-NEXT: [[DOTA__VOID_ADDR1:%.*]] = call ptr @__kmpc_aligned_alloc(i32 [[TMP0]], i64 8, i64 4, ptr inttoptr (i64 8 to ptr)) +// CHECK-NEXT: store i32 2, ptr [[DOTA__VOID_ADDR1]], align 4 +// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTA__VOID_ADDR1]], ptr inttoptr (i64 8 to ptr)) +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-NEXT: [[DOTV__VOID_ADDR:%.*]] = call ptr @__kmpc_aligned_alloc(i32 [[TMP0]], i64 4, i64 4, ptr null) +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTV__VOID_ADDR]], align 4 +// CHECK-NEXT: [[INC2:%.*]] = add nsw i32 [[TMP2]], 1 +// CHECK-NEXT: store i32 [[INC2]], ptr [[DOTV__VOID_ADDR]], align 4 +// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTV__VOID_ADDR]], ptr null) +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-NEXT: [[DOTV__VOID_ADDR3:%.*]] = call ptr @__kmpc_alloc(i32 [[TMP0]], i64 4, ptr inttoptr (i64 1 to ptr)) +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTV__VOID_ADDR3]], align 4 +// CHECK-NEXT: [[INC4:%.*]] = add nsw i32 [[TMP3]], 1 +// CHECK-NEXT: store i32 [[INC4]], ptr [[DOTV__VOID_ADDR3]], align 4 +// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTV__VOID_ADDR3]], ptr inttoptr (i64 1 to ptr)) +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-NEXT: [[DOTV__VOID_ADDR5:%.*]] = call ptr @__kmpc_aligned_alloc(i32 [[TMP0]], i64 8, i64 4, ptr inttoptr (i64 2 to ptr)) +// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTV__VOID_ADDR5]], align 4 +// CHECK-NEXT: [[INC6:%.*]] = add nsw i32 [[TMP4]], 1 +// CHECK-NEXT: store i32 [[INC6]], ptr [[DOTV__VOID_ADDR5]], align 4 +// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTV__VOID_ADDR5]], ptr inttoptr (i64 2 to ptr)) +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-NEXT: [[DOTV__VOID_ADDR7:%.*]] = call ptr @__kmpc_aligned_alloc(i32 [[TMP0]], i64 4, i64 4, ptr null) +// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTV__VOID_ADDR7]], align 4 +// CHECK-NEXT: [[INC8:%.*]] = add nsw i32 [[TMP5]], 1 +// CHECK-NEXT: store i32 [[INC8]], ptr [[DOTV__VOID_ADDR7]], align 4 +// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTV__VOID_ADDR7]], ptr null) +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-NEXT: [[DOTV__VOID_ADDR9:%.*]] = call ptr @__kmpc_aligned_alloc(i32 [[TMP0]], i64 4, i64 4, ptr inttoptr (i64 1 to ptr)) +// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTV__VOID_ADDR9]], align 4 +// CHECK-NEXT: [[INC10:%.*]] = add nsw i32 [[TMP6]], 1 +// CHECK-NEXT: store i32 [[INC10]], ptr [[DOTV__VOID_ADDR9]], align 4 +// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTV__VOID_ADDR9]], ptr inttoptr (i64 1 to ptr)) +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 
[[TMP0]]) +// CHECK-NEXT: [[CALL:%.*]] = call noundef i64 @_Z3foov() +// CHECK-NEXT: [[CONV:%.*]] = inttoptr i64 [[CALL]] to ptr +// CHECK-NEXT: [[DOTV__VOID_ADDR11:%.*]] = call ptr @__kmpc_aligned_alloc(i32 [[TMP0]], i64 8, i64 4, ptr [[CONV]]) +// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTV__VOID_ADDR11]], align 4 +// CHECK-NEXT: [[INC12:%.*]] = add nsw i32 [[TMP7]], 1 +// CHECK-NEXT: store i32 [[INC12]], ptr [[DOTV__VOID_ADDR11]], align 4 +// CHECK-NEXT: [[CALL13:%.*]] = call noundef i64 @_Z3foov() +// CHECK-NEXT: [[CONV14:%.*]] = inttoptr i64 [[CALL13]] to ptr +// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTV__VOID_ADDR11]], ptr [[CONV14]]) +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-NEXT: store double 3.000000e+00, ptr [[B]], align 8 +// CHECK-NEXT: [[DOTTEMP__VOID_ADDR:%.*]] = call ptr @__kmpc_alloc(i32 [[TMP0]], i64 4, ptr null) +// CHECK-NEXT: [[CALL15:%.*]] = call noundef i32 @_Z3fooIiL22omp_allocator_handle_t6ELj8EET_v() +// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTTEMP__VOID_ADDR]], align 4 +// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP8]], [[CALL15]] +// CHECK-NEXT: store i32 [[ADD]], ptr [[DOTTEMP__VOID_ADDR]], align 4 +// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTTEMP__VOID_ADDR]], ptr null) +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr @_ZZ4mainE4temp, align 4 +// CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr @_ZN2ns1aE, align 4 +// CHECK-NEXT: [[ADD16:%.*]] = add nsw i32 [[TMP9]], [[TMP10]] +// CHECK-NEXT: ret i32 [[ADD16]] +// +// +// CHECK-LABEL: define linkonce_odr noundef i32 @_Z3fooIiL22omp_allocator_handle_t6ELj8EET_v( +// CHECK-SAME: ) #[[ATTR4:[0-9]+]] comdat { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[V:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) +// CHECK-NEXT: [[DOTV__VOID_ADDR:%.*]] = call ptr @__kmpc_alloc(i32 [[TMP0]], i64 4, ptr inttoptr (i64 6 to ptr)) +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @_ZN2STIiE1mE, align 4 +// CHECK-NEXT: store i32 [[TMP1]], ptr [[DOTV__VOID_ADDR]], align 4 +// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTV__VOID_ADDR]], ptr inttoptr (i64 6 to ptr)) +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-NEXT: [[DOTV__VOID_ADDR1:%.*]] = call ptr @__kmpc_aligned_alloc(i32 [[TMP0]], i64 8, i64 4, ptr inttoptr (i64 6 to ptr)) +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTV__VOID_ADDR1]], align 4 +// CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP2]], 1 +// CHECK-NEXT: store i32 [[INC]], ptr [[DOTV__VOID_ADDR1]], align 4 +// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTV__VOID_ADDR1]], ptr inttoptr (i64 6 to ptr)) +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[V]], align 4 +// CHECK-NEXT: ret i32 [[TMP3]] +// +// +// CHECK-LABEL: define dso_local void @_Z3bariRf( +// CHECK-SAME: i32 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[Z:%.*]]) #[[ATTR4]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[Z_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[TMP:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) +// CHECK-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 +// CHECK-NEXT: store ptr [[Z]], ptr [[Z_ADDR]], align 8 +// CHECK-NEXT: [[DOTA__VOID_ADDR:%.*]] = call ptr 
@__kmpc_aligned_alloc(i32 [[TMP0]], i64 8, i64 4, ptr inttoptr (i64 1 to ptr)) +// CHECK-NEXT: [[DOTZ__VOID_ADDR:%.*]] = call ptr @__kmpc_aligned_alloc(i32 [[TMP0]], i64 8, i64 4, ptr inttoptr (i64 1 to ptr)) +// CHECK-NEXT: store ptr [[DOTZ__VOID_ADDR]], ptr [[TMP]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[TMP1]], align 4 +// CHECK-NEXT: [[ADD:%.*]] = fadd float 8.000000e+00, [[TMP2]] +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTA__VOID_ADDR]], align 4 +// CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP3]] to float +// CHECK-NEXT: [[ADD1:%.*]] = fadd float [[CONV]], [[ADD]] +// CHECK-NEXT: [[CONV2:%.*]] = fptosi float [[ADD1]] to i32 +// CHECK-NEXT: store i32 [[CONV2]], ptr [[DOTA__VOID_ADDR]], align 4 +// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTZ__VOID_ADDR]], ptr inttoptr (i64 1 to ptr)) +// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTA__VOID_ADDR]], ptr inttoptr (i64 1 to ptr)) +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// +// CHECK-TLS-LABEL: define dso_local noundef i32 @main( +// CHECK-TLS-SAME: ) #[[ATTR0:[0-9]+]] { +// CHECK-TLS-NEXT: [[ENTRY:.*:]] +// CHECK-TLS-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 +// CHECK-TLS-NEXT: [[V:%.*]] = alloca i32, align 4 +// CHECK-TLS-NEXT: [[B:%.*]] = alloca double, align 8 +// CHECK-TLS-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1:[0-9]+]]) +// CHECK-TLS-NEXT: store i32 0, ptr [[RETVAL]], align 4 +// CHECK-TLS-NEXT: [[DOTA__VOID_ADDR:%.*]] = call ptr @__kmpc_alloc(i32 [[TMP0]], i64 4, ptr inttoptr (i64 7 to ptr)) +// CHECK-TLS-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTA__VOID_ADDR]], align 4 +// CHECK-TLS-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 +// CHECK-TLS-NEXT: store i32 [[INC]], ptr [[DOTA__VOID_ADDR]], align 4 +// CHECK-TLS-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTA__VOID_ADDR]], ptr inttoptr (i64 7 to ptr)) +// CHECK-TLS-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2:[0-9]+]], i32 [[TMP0]]) +// CHECK-TLS-NEXT: [[DOTA__VOID_ADDR1:%.*]] = call ptr @__kmpc_aligned_alloc(i32 [[TMP0]], i64 8, i64 4, ptr inttoptr (i64 8 to ptr)) +// CHECK-TLS-NEXT: store i32 2, ptr [[DOTA__VOID_ADDR1]], align 4 +// CHECK-TLS-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTA__VOID_ADDR1]], ptr inttoptr (i64 8 to ptr)) +// CHECK-TLS-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-TLS-NEXT: [[DOTV__VOID_ADDR:%.*]] = call ptr @__kmpc_aligned_alloc(i32 [[TMP0]], i64 4, i64 4, ptr null) +// CHECK-TLS-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTV__VOID_ADDR]], align 4 +// CHECK-TLS-NEXT: [[INC2:%.*]] = add nsw i32 [[TMP2]], 1 +// CHECK-TLS-NEXT: store i32 [[INC2]], ptr [[DOTV__VOID_ADDR]], align 4 +// CHECK-TLS-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTV__VOID_ADDR]], ptr null) +// CHECK-TLS-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-TLS-NEXT: [[DOTV__VOID_ADDR3:%.*]] = call ptr @__kmpc_alloc(i32 [[TMP0]], i64 4, ptr inttoptr (i64 1 to ptr)) +// CHECK-TLS-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTV__VOID_ADDR3]], align 4 +// CHECK-TLS-NEXT: [[INC4:%.*]] = add nsw i32 [[TMP3]], 1 +// CHECK-TLS-NEXT: store i32 [[INC4]], ptr [[DOTV__VOID_ADDR3]], align 4 +// CHECK-TLS-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTV__VOID_ADDR3]], ptr inttoptr (i64 1 to ptr)) +// CHECK-TLS-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-TLS-NEXT: [[DOTV__VOID_ADDR5:%.*]] = call ptr 
@__kmpc_aligned_alloc(i32 [[TMP0]], i64 8, i64 4, ptr inttoptr (i64 2 to ptr)) +// CHECK-TLS-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTV__VOID_ADDR5]], align 4 +// CHECK-TLS-NEXT: [[INC6:%.*]] = add nsw i32 [[TMP4]], 1 +// CHECK-TLS-NEXT: store i32 [[INC6]], ptr [[DOTV__VOID_ADDR5]], align 4 +// CHECK-TLS-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTV__VOID_ADDR5]], ptr inttoptr (i64 2 to ptr)) +// CHECK-TLS-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-TLS-NEXT: [[DOTV__VOID_ADDR7:%.*]] = call ptr @__kmpc_aligned_alloc(i32 [[TMP0]], i64 4, i64 4, ptr null) +// CHECK-TLS-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTV__VOID_ADDR7]], align 4 +// CHECK-TLS-NEXT: [[INC8:%.*]] = add nsw i32 [[TMP5]], 1 +// CHECK-TLS-NEXT: store i32 [[INC8]], ptr [[DOTV__VOID_ADDR7]], align 4 +// CHECK-TLS-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTV__VOID_ADDR7]], ptr null) +// CHECK-TLS-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-TLS-NEXT: [[DOTV__VOID_ADDR9:%.*]] = call ptr @__kmpc_aligned_alloc(i32 [[TMP0]], i64 4, i64 4, ptr inttoptr (i64 1 to ptr)) +// CHECK-TLS-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTV__VOID_ADDR9]], align 4 +// CHECK-TLS-NEXT: [[INC10:%.*]] = add nsw i32 [[TMP6]], 1 +// CHECK-TLS-NEXT: store i32 [[INC10]], ptr [[DOTV__VOID_ADDR9]], align 4 +// CHECK-TLS-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTV__VOID_ADDR9]], ptr inttoptr (i64 1 to ptr)) +// CHECK-TLS-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-TLS-NEXT: [[CALL:%.*]] = call noundef i64 @_Z3foov() +// CHECK-TLS-NEXT: [[CONV:%.*]] = inttoptr i64 [[CALL]] to ptr +// CHECK-TLS-NEXT: [[DOTV__VOID_ADDR11:%.*]] = call ptr @__kmpc_aligned_alloc(i32 [[TMP0]], i64 8, i64 4, ptr [[CONV]]) +// CHECK-TLS-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTV__VOID_ADDR11]], align 4 +// CHECK-TLS-NEXT: [[INC12:%.*]] = add nsw i32 [[TMP7]], 1 +// CHECK-TLS-NEXT: store i32 [[INC12]], ptr [[DOTV__VOID_ADDR11]], align 4 +// CHECK-TLS-NEXT: [[CALL13:%.*]] = call noundef i64 @_Z3foov() +// CHECK-TLS-NEXT: [[CONV14:%.*]] = inttoptr i64 [[CALL13]] to ptr +// CHECK-TLS-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTV__VOID_ADDR11]], ptr [[CONV14]]) +// CHECK-TLS-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-TLS-NEXT: store double 3.000000e+00, ptr [[B]], align 8 +// CHECK-TLS-NEXT: [[DOTTEMP__VOID_ADDR:%.*]] = call ptr @__kmpc_alloc(i32 [[TMP0]], i64 4, ptr null) +// CHECK-TLS-NEXT: [[CALL15:%.*]] = call noundef i32 @_Z3fooIiL22omp_allocator_handle_t6ELj8EET_v() +// CHECK-TLS-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTTEMP__VOID_ADDR]], align 4 +// CHECK-TLS-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP8]], [[CALL15]] +// CHECK-TLS-NEXT: store i32 [[ADD]], ptr [[DOTTEMP__VOID_ADDR]], align 4 +// CHECK-TLS-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTTEMP__VOID_ADDR]], ptr null) +// CHECK-TLS-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-TLS-NEXT: [[TMP9:%.*]] = load i32, ptr @_ZZ4mainE4temp, align 4 +// CHECK-TLS-NEXT: [[TMP10:%.*]] = load i32, ptr @_ZN2ns1aE, align 4 +// CHECK-TLS-NEXT: [[ADD16:%.*]] = add nsw i32 [[TMP9]], [[TMP10]] +// CHECK-TLS-NEXT: ret i32 [[ADD16]] +// +// +// CHECK-TLS-LABEL: define linkonce_odr noundef i32 @_Z3fooIiL22omp_allocator_handle_t6ELj8EET_v( +// CHECK-TLS-SAME: ) #[[ATTR4:[0-9]+]] comdat { +// CHECK-TLS-NEXT: [[ENTRY:.*:]] +// CHECK-TLS-NEXT: [[V:%.*]] = alloca i32, align 4 +// CHECK-TLS-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) +// CHECK-TLS-NEXT: 
[[DOTV__VOID_ADDR:%.*]] = call ptr @__kmpc_alloc(i32 [[TMP0]], i64 4, ptr inttoptr (i64 6 to ptr)) +// CHECK-TLS-NEXT: [[TMP1:%.*]] = load i32, ptr @_ZN2STIiE1mE, align 4 +// CHECK-TLS-NEXT: store i32 [[TMP1]], ptr [[DOTV__VOID_ADDR]], align 4 +// CHECK-TLS-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTV__VOID_ADDR]], ptr inttoptr (i64 6 to ptr)) +// CHECK-TLS-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-TLS-NEXT: [[DOTV__VOID_ADDR1:%.*]] = call ptr @__kmpc_aligned_alloc(i32 [[TMP0]], i64 8, i64 4, ptr inttoptr (i64 6 to ptr)) +// CHECK-TLS-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTV__VOID_ADDR1]], align 4 +// CHECK-TLS-NEXT: [[INC:%.*]] = add nsw i32 [[TMP2]], 1 +// CHECK-TLS-NEXT: store i32 [[INC]], ptr [[DOTV__VOID_ADDR1]], align 4 +// CHECK-TLS-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTV__VOID_ADDR1]], ptr inttoptr (i64 6 to ptr)) +// CHECK-TLS-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-TLS-NEXT: [[TMP3:%.*]] = load i32, ptr [[V]], align 4 +// CHECK-TLS-NEXT: ret i32 [[TMP3]] +// +// +// CHECK-TLS-LABEL: define dso_local void @_Z3bariRf( +// CHECK-TLS-SAME: i32 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[Z:%.*]]) #[[ATTR4]] { +// CHECK-TLS-NEXT: [[ENTRY:.*:]] +// CHECK-TLS-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK-TLS-NEXT: [[Z_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-TLS-NEXT: [[TMP:%.*]] = alloca ptr, align 8 +// CHECK-TLS-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) +// CHECK-TLS-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 +// CHECK-TLS-NEXT: store ptr [[Z]], ptr [[Z_ADDR]], align 8 +// CHECK-TLS-NEXT: [[DOTA__VOID_ADDR:%.*]] = call ptr @__kmpc_aligned_alloc(i32 [[TMP0]], i64 8, i64 4, ptr inttoptr (i64 1 to ptr)) +// CHECK-TLS-NEXT: [[DOTZ__VOID_ADDR:%.*]] = call ptr @__kmpc_aligned_alloc(i32 [[TMP0]], i64 8, i64 4, ptr inttoptr (i64 1 to ptr)) +// CHECK-TLS-NEXT: store ptr [[DOTZ__VOID_ADDR]], ptr [[TMP]], align 8 +// CHECK-TLS-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK-TLS-NEXT: [[TMP2:%.*]] = load float, ptr [[TMP1]], align 4 +// CHECK-TLS-NEXT: [[ADD:%.*]] = fadd float 8.000000e+00, [[TMP2]] +// CHECK-TLS-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTA__VOID_ADDR]], align 4 +// CHECK-TLS-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP3]] to float +// CHECK-TLS-NEXT: [[ADD1:%.*]] = fadd float [[CONV]], [[ADD]] +// CHECK-TLS-NEXT: [[CONV2:%.*]] = fptosi float [[ADD1]] to i32 +// CHECK-TLS-NEXT: store i32 [[CONV2]], ptr [[DOTA__VOID_ADDR]], align 4 +// CHECK-TLS-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTZ__VOID_ADDR]], ptr inttoptr (i64 1 to ptr)) +// CHECK-TLS-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTA__VOID_ADDR]], ptr inttoptr (i64 1 to ptr)) +// CHECK-TLS-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-TLS-NEXT: ret void +// +// +// SIMD-ONLY0-LABEL: define dso_local noundef i32 @main( +// SIMD-ONLY0-SAME: ) #[[ATTR0:[0-9]+]] { +// SIMD-ONLY0-NEXT: [[ENTRY:.*:]] +// SIMD-ONLY0-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: [[V:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: [[A:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: [[A1:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: [[V2:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: [[V4:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: [[V6:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: [[V8:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: [[V10:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: [[V12:%.*]] = alloca i32, align 4 
+// SIMD-ONLY0-NEXT: [[B:%.*]] = alloca double, align 8 +// SIMD-ONLY0-NEXT: [[TEMP:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: store i32 0, ptr [[RETVAL]], align 4 +// SIMD-ONLY0-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4 +// SIMD-ONLY0-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 +// SIMD-ONLY0-NEXT: store i32 [[INC]], ptr [[A]], align 4 +// SIMD-ONLY0-NEXT: store i32 2, ptr [[A1]], align 4 +// SIMD-ONLY0-NEXT: [[TMP1:%.*]] = load i32, ptr [[V2]], align 4 +// SIMD-ONLY0-NEXT: [[INC3:%.*]] = add nsw i32 [[TMP1]], 1 +// SIMD-ONLY0-NEXT: store i32 [[INC3]], ptr [[V2]], align 4 +// SIMD-ONLY0-NEXT: [[TMP2:%.*]] = load i32, ptr [[V4]], align 4 +// SIMD-ONLY0-NEXT: [[INC5:%.*]] = add nsw i32 [[TMP2]], 1 +// SIMD-ONLY0-NEXT: store i32 [[INC5]], ptr [[V4]], align 4 +// SIMD-ONLY0-NEXT: [[TMP3:%.*]] = load i32, ptr [[V6]], align 4 +// SIMD-ONLY0-NEXT: [[INC7:%.*]] = add nsw i32 [[TMP3]], 1 +// SIMD-ONLY0-NEXT: store i32 [[INC7]], ptr [[V6]], align 4 +// SIMD-ONLY0-NEXT: [[TMP4:%.*]] = load i32, ptr [[V8]], align 4 +// SIMD-ONLY0-NEXT: [[INC9:%.*]] = add nsw i32 [[TMP4]], 1 +// SIMD-ONLY0-NEXT: store i32 [[INC9]], ptr [[V8]], align 4 +// SIMD-ONLY0-NEXT: [[TMP5:%.*]] = load i32, ptr [[V10]], align 4 +// SIMD-ONLY0-NEXT: [[INC11:%.*]] = add nsw i32 [[TMP5]], 1 +// SIMD-ONLY0-NEXT: store i32 [[INC11]], ptr [[V10]], align 4 +// SIMD-ONLY0-NEXT: [[TMP6:%.*]] = load i32, ptr [[V12]], align 4 +// SIMD-ONLY0-NEXT: [[INC13:%.*]] = add nsw i32 [[TMP6]], 1 +// SIMD-ONLY0-NEXT: store i32 [[INC13]], ptr [[V12]], align 4 +// SIMD-ONLY0-NEXT: store double 3.000000e+00, ptr [[B]], align 8 +// SIMD-ONLY0-NEXT: [[CALL:%.*]] = call noundef i32 @_Z3fooIiL22omp_allocator_handle_t6ELj8EET_v() +// SIMD-ONLY0-NEXT: [[TMP7:%.*]] = load i32, ptr [[TEMP]], align 4 +// SIMD-ONLY0-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP7]], [[CALL]] +// SIMD-ONLY0-NEXT: store i32 [[ADD]], ptr [[TEMP]], align 4 +// SIMD-ONLY0-NEXT: [[TMP8:%.*]] = load i32, ptr @_ZZ4mainE4temp, align 4 +// SIMD-ONLY0-NEXT: [[TMP9:%.*]] = load i32, ptr @_ZN2ns1aE, align 4 +// SIMD-ONLY0-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP8]], [[TMP9]] +// SIMD-ONLY0-NEXT: ret i32 [[ADD14]] +// +// +// SIMD-ONLY0-LABEL: define linkonce_odr noundef i32 @_Z3fooIiL22omp_allocator_handle_t6ELj8EET_v( +// SIMD-ONLY0-SAME: ) #[[ATTR1:[0-9]+]] comdat { +// SIMD-ONLY0-NEXT: [[ENTRY:.*:]] +// SIMD-ONLY0-NEXT: [[V:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: [[V1:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: [[V2:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: [[TMP0:%.*]] = load i32, ptr @_ZN2STIiE1mE, align 4 +// SIMD-ONLY0-NEXT: store i32 [[TMP0]], ptr [[V1]], align 4 +// SIMD-ONLY0-NEXT: [[TMP1:%.*]] = load i32, ptr [[V2]], align 4 +// SIMD-ONLY0-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 +// SIMD-ONLY0-NEXT: store i32 [[INC]], ptr [[V2]], align 4 +// SIMD-ONLY0-NEXT: [[TMP2:%.*]] = load i32, ptr [[V]], align 4 +// SIMD-ONLY0-NEXT: ret i32 [[TMP2]] +// +// +// SIMD-ONLY0-LABEL: define dso_local void @_Z3bariRf( +// SIMD-ONLY0-SAME: i32 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[Z:%.*]]) #[[ATTR1]] { +// SIMD-ONLY0-NEXT: [[ENTRY:.*:]] +// SIMD-ONLY0-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: [[Z_ADDR:%.*]] = alloca ptr, align 8 +// SIMD-ONLY0-NEXT: [[A1:%.*]] = alloca i32, align 4 +// SIMD-ONLY0-NEXT: [[Z2:%.*]] = alloca float, align 4 +// SIMD-ONLY0-NEXT: [[TMP:%.*]] = alloca ptr, align 8 +// SIMD-ONLY0-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 +// SIMD-ONLY0-NEXT: store ptr [[Z]], ptr [[Z_ADDR]], align 8 +// 
SIMD-ONLY0-NEXT: store ptr [[Z2]], ptr [[TMP]], align 8 +// SIMD-ONLY0-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 8 +// SIMD-ONLY0-NEXT: [[TMP1:%.*]] = load float, ptr [[TMP0]], align 4 +// SIMD-ONLY0-NEXT: [[ADD:%.*]] = fadd float 8.000000e+00, [[TMP1]] +// SIMD-ONLY0-NEXT: [[TMP2:%.*]] = load i32, ptr [[A1]], align 4 +// SIMD-ONLY0-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP2]] to float +// SIMD-ONLY0-NEXT: [[ADD3:%.*]] = fadd float [[CONV]], [[ADD]] +// SIMD-ONLY0-NEXT: [[CONV4:%.*]] = fptosi float [[ADD3]] to i32 +// SIMD-ONLY0-NEXT: store i32 [[CONV4]], ptr [[A1]], align 4 +// SIMD-ONLY0-NEXT: ret void +// diff --git a/clang/test/OpenMP/allocate_modifiers_messages.cpp b/clang/test/OpenMP/allocate_modifiers_messages.cpp new file mode 100644 index 0000000000000..6867e78a89ee9 --- /dev/null +++ b/clang/test/OpenMP/allocate_modifiers_messages.cpp @@ -0,0 +1,159 @@ +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=52 %s + +typedef enum omp_allocator_handle_t { + omp_null_allocator = 0, + omp_default_mem_alloc = 1, + omp_large_cap_mem_alloc = 2, + omp_const_mem_alloc = 3, + omp_high_bw_mem_alloc = 4, + omp_low_lat_mem_alloc = 5, + omp_cgroup_mem_alloc = 6, + omp_pteam_mem_alloc = 7, + omp_thread_mem_alloc = 8, +} omp_allocator_handle_t; + +int myAlloc() { + return 100; +} + +int main() { + int a, b, c; + // expected-error@+4 {{expected '('}} + // expected-error@+3 {{expected expression}} + // expected-error@+2 {{expected ')'}} + // expected-note@+1 {{to match this '('}} + #pragma omp scope private(c) allocate(allocator + // expected-error@+6 {{expected expression}} + // expected-error@+5 {{expected ')'}} + // expected-note@+4 {{to match this '('}} + // expected-error@+3 {{expected expression}} + // expected-error@+2 {{expected ')'}} + // expected-note@+1 {{to match this '('}} + #pragma omp scope private(c) allocate(allocator( + // expected-error@+4 {{expected expression}} + // expected-error@+3 {{expected expression}} + // expected-error@+2 {{expected ')'}} + // expected-note@+1 {{to match this '('}} + #pragma omp scope private(c) allocate(allocator() + // expected-error@+2 {{expected expression}} + // expected-error@+1 {{expected expression}} + #pragma omp scope private(c) allocate(allocator()) + // expected-error@+6 {{expected ')'}} + // expected-note@+5 {{to match this '('}} + // expected-error@+4 {{missing ':' after allocate clause modifier}} + // expected-error@+3 {{expected expression}} + // expected-error@+2 {{expected ')'}} + // expected-note@+1 {{to match this '('}} + #pragma omp scope private(c) allocate(allocator(omp_default_mem_alloc + // expected-error@+6 {{missing ':' after allocate clause modifier}} + // expected-error@+5 {{expected expression}} + // expected-error@+4 {{expected ')'}} + // expected-note@+3 {{to match this '('}} + // expected-error@+2 {{expected ')'}} + // expected-note@+1 {{to match this '('}} + #pragma omp scope private(c) allocate(allocator(omp_large_cap_mem_alloc: + // expected-error@+4 {{missing ':' after allocate clause modifier}} + // expected-error@+3 {{expected expression}} + // expected-error@+2 {{expected ')'}} + // expected-note@+1 {{to match this '('}} + #pragma omp scope private(c) allocate(allocator(omp_const_mem_alloc) + // expected-error@+2 {{missing ':' after allocate clause modifier}} + // expected-error@+1 {{expected expression}} + #pragma omp scope private(c) allocate(allocator(omp_high_bw_mem_alloc)) + // expected-error@+1 {{expected expression}} + #pragma omp scope private(c) allocate(allocator(omp_low_lat_mem_alloc):) + // 
expected-error@+6 {{expected ')'}} + // expected-note@+5 {{to match this '('}} + // expected-error@+4 {{missing ':' after allocate clause modifier}} + // expected-error@+3 {{expected expression}} + // expected-error@+2 {{expected ')'}} + // expected-note@+1 {{to match this '('}} + #pragma omp scope private(c) allocate(allocator(omp_cgroup_mem_alloc:) + // expected-error@+4 {{expected ')'}} + // expected-note@+3 {{to match this '('}} + // expected-error@+2 {{missing ':' after allocate clause modifier}} + // expected-error@+1 {{expected expression}} + #pragma omp scope private(c) allocate(allocator(omp_pteam_mem_alloc:)) + // expected-error@+4 {{expected ')'}} + // expected-note@+3 {{to match this '('}} + // expected-error@+2 {{missing ':' after allocate clause modifier}} + // expected-error@+1 {{expected expression}} + #pragma omp scope private(c) allocate(allocator(omp_thread_mem_alloc:c)) + // expected-error@+1 {{expected variable name}} + #pragma omp scope private(c) allocate(allocator(omp_const_mem_alloc):1) + // expected-error@+1 {{expected variable name}} + #pragma omp scope private(c) allocate(allocator(omp_const_mem_alloc):-10) + // expected-error@+4 {{expected ',' or ')' in 'allocate' clause}} + // expected-error@+3 {{expected ')'}} + // expected-warning@+2 {{extra tokens at the end of '#pragma omp scope' are ignored}} + // expected-note@+1 {{to match this '('}} + #pragma omp scope private(a,b,c) allocate(allocator(omp_const_mem_alloc):c:b;a) + // expected-error@+1 {{initializing 'const omp_allocator_handle_t' with an expression of incompatible type 'int'}} + #pragma omp scope private(c,a,b) allocate(allocator(myAlloc()):a,b,c) + // expected-error@+2 {{missing ':' after allocate clause modifier}} + // expected-error@+1 {{expected expression}} + #pragma omp scope private(c) allocate(allocator(omp_default_mem_alloc);c) + // expected-error@+2 {{duplicate modifier 'allocator' in 'allocate' clause}} + // expected-warning@+1 {{aligned clause will be ignored because the requested alignment is not a power of 2}} + #pragma omp scope private(a) allocate(allocator(omp_default_mem_alloc), allocator(omp_default_mem_alloc), align(3) : a) + // expected-error@+4 {{expected '('}} + // expected-error@+3 {{expected expression}} + // expected-error@+2 {{expected ')'}} + // expected-note@+1 {{to match this '('}} + #pragma omp scope private(a) allocate(allocator + // expected-error@+4 {{expected '('}} + // expected-error@+3 {{expected expression}} + // expected-error@+2 {{expected ')'}} + // expected-note@+1 {{to match this '('}} + #pragma omp scope private(b) allocate(align + // expected-error@+1 {{duplicate modifier 'align' in 'allocate' clause}} + #pragma omp scope private(a) allocate(align(8), align(4) : a) + // expected-error@+5 {{use of undeclared identifier 'align'}} + // expected-error@+4 {{expected ',' or ')' in 'allocate' clause}} + // expected-error@+3 {{expected ')'}} + // expected-note@+2 {{to match this '('}} + // expected-error@+1 {{expected variable name}} + #pragma omp scope private(a) allocate(omp_default_mem_alloc, align(8) : a) + // expected-error@+3 {{expected modifier in 'allocate' clause}} + // expected-error@+2 {{missing ':' after allocate clause modifier}} + // expected-error@+1 {{expected expression}} + #pragma omp scope private(a) allocate(align(8), omp_default_mem_alloc : a) + // expected-error@+5 {{expected ',' or ')' in 'allocate' clause}} + // expected-error@+4 {{expected ')'}} + // expected-note@+3 {{to match this '('}} + // expected-error@+2 {{expected variable name}} + 
// expected-error@+1 {{expected variable name}} + #pragma omp scope private(a) allocate(omp_default_mem_alloc, omp_default_mem_alloc : a) + // expected-error@+2 {{use of undeclared identifier 'undefinedVar'}} + // expected-error@+1 {{expected expression}} + #pragma omp scope private(a) allocate(undefinedVar : a) + // expected-error@+1 {{expected expression}} + #pragma omp scope private(a) allocate(align(8), allocator(omp_default_mem_alloc) : ) + // expected-error@+2 {{missing ':' after allocate clause modifier}} + // expected-error@+1 {{expected expression}} + #pragma omp scope private(a) allocate(align(8), allocator(omp_default_mem_alloc) ) + // expected-error@+3 {{expected expression}} + // expected-error@+2 {{expected ')'}} + // expected-note@+1 {{to match this '('}} + #pragma omp scope private(a) allocate(align(8), allocator(omp_default_mem_alloc) : + + // expected-error@+4 {{missing ':' after allocate clause modifier}} + // expected-error@+3 {{expected expression}} + // expected-error@+2 {{expected ')'}} + // expected-note@+1 {{to match this '('}} + #pragma omp scope private(a) allocate(align(8), allocator(omp_default_mem_alloc) + // expected-error@+4 {{expected '('}} + // expected-error@+3 {{expected '('}} + // expected-error@+2 {{expected expression}} + // expected-error@+1 {{use of undeclared identifier 'allocator'}} + #pragma omp scope private(a) allocate(align, allocator : ) + // expected-error@+7 {{expected expression}} + // expected-error@+6 {{expected expression}} + // expected-error@+5 {{expected expression}} + // expected-error@+4 {{use of undeclared identifier 'allocator'}} + // expected-error@+3 {{expected ',' or ')' in 'allocate' clause}} + // expected-error@+2 {{expected ')'}} + // expected-note@+1 {{to match this '('}} + #pragma omp scope private(a) allocate(align(), allocator() : ) + ++a; +} From 91de4ce3b8878d06f6a6f0ed7cd844008107c460 Mon Sep 17 00:00:00 2001 From: Victor Campos Date: Mon, 13 Jan 2025 13:51:52 +0000 Subject: [PATCH 055/102] [Multilib] Custom flags YAML parsing (#110657) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch is the first step to extend the current multilib system to support the selection of library variants which do not correspond to existing command-line options. Proposal can be found in https://discourse.llvm.org/t/rfc-multilib-custom-flags/81058 The multilib mechanism supports libraries that target code generation or language options such as `--target`, `-mcpu`, `-mfpu`, `-mbranch-protection`. However, some library variants are particular to features that do not correspond to any command-line options. Examples include variants for multithreading and semihosting. This work introduces a way to instruct the multilib system to consider these features in library selection. This particular patch comprises a new section in `multilib.yaml` to declare flags for which no option exists. Henceforth this sort of flag will be called `custom flag` for clarity. The `multilib.yaml` file will have a new section called Flags which contains the declarations of the target’s custom flags: ```yaml Flags: - Name: multithreaded Values: - Name: no-multithreaded MacroDefines: [__SINGLE_THREAD__] - Name: multithreaded Default: no-multithreaded - Name: io Values: - Name: io-none - Name: io-semihosting MacroDefines: [SEMIHOSTING] - Name: io-linux-syscalls MacroDefines: [LINUX_SYSCALLS, HOSTED=1] Default: io-none ``` - Name: the name to categorize a flag. - Values: a list of possible values. 
- Default: the value this flag takes when it is not specified in the
command-line invocation. It must be one of the values listed in the Values
field.

Each flag Value follows this description:
- Name (required): the name of the custom flag value (string). This is the
string to be used in `-fmultilib-flag=`.
- MacroDefines (optional): a list of strings to be used as macro definitions.
Each string is fed into the driver as a `-D` macro definition.

A Default value saves users from having to spell out a custom flag whose most
common value is almost always the one wanted.

Flag value names share a single namespace across all flags, so every flag
value must be unique.
---
 clang/include/clang/Driver/Multilib.h         |  28 +++-
 clang/lib/Driver/Multilib.cpp                 |  73 ++++++++--
 ...remetal-multilib-custom-flags-parsing.yaml | 133 ++++++++++++++++++
 3 files changed, 223 insertions(+), 11 deletions(-)
 create mode 100644 clang/test/Driver/baremetal-multilib-custom-flags-parsing.yaml

diff --git a/clang/include/clang/Driver/Multilib.h b/clang/include/clang/Driver/Multilib.h
index dbed70f4f9008..1dab45c062aee 100644
--- a/clang/include/clang/Driver/Multilib.h
+++ b/clang/include/clang/Driver/Multilib.h
@@ -101,6 +101,25 @@ class Multilib {
 
 raw_ostream &operator<<(raw_ostream &OS, const Multilib &M);
 
+namespace custom_flag {
+struct Declaration;
+using DeclarationPtr = std::shared_ptr<Declaration>;
+
+struct ValueDetail {
+  std::string Name;
+  std::optional<SmallVector<std::string>> MacroDefines;
+  DeclarationPtr Decl;
+};
+
+struct Declaration {
+  std::string Name;
+  SmallVector<ValueDetail> ValueList;
+  std::optional<size_t> DefaultValueIdx;
+};
+
+static constexpr StringRef Prefix = "-fmultilib-flag=";
+} // namespace custom_flag
+
 /// See also MultilibSetBuilder for combining multilibs into a set.
 class MultilibSet {
 public:
@@ -120,15 +139,18 @@ class MultilibSet {
 
 private:
   multilib_list Multilibs;
-  std::vector<FlagMatcher> FlagMatchers;
+  SmallVector<FlagMatcher> FlagMatchers;
+  SmallVector<custom_flag::DeclarationPtr> CustomFlagDecls;
   IncludeDirsFunc IncludeCallback;
   IncludeDirsFunc FilePathsCallback;
 
 public:
   MultilibSet() = default;
   MultilibSet(multilib_list &&Multilibs,
-              std::vector<FlagMatcher> &&FlagMatchers = {})
-      : Multilibs(Multilibs), FlagMatchers(FlagMatchers) {}
+              SmallVector<FlagMatcher> &&FlagMatchers = {},
+              SmallVector<custom_flag::DeclarationPtr> &&CustomFlagDecls = {})
+      : Multilibs(std::move(Multilibs)), FlagMatchers(std::move(FlagMatchers)),
+        CustomFlagDecls(std::move(CustomFlagDecls)) {}
 
   const multilib_list &getMultilibs() { return Multilibs; }
 
diff --git a/clang/lib/Driver/Multilib.cpp b/clang/lib/Driver/Multilib.cpp
index 0207e0f2eb2de..b4b5dbd1bdb5e 100644
--- a/clang/lib/Driver/Multilib.cpp
+++ b/clang/lib/Driver/Multilib.cpp
@@ -10,6 +10,7 @@
 #include "clang/Basic/LLVM.h"
 #include "clang/Driver/Driver.h"
 #include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -201,13 +202,20 @@ struct MultilibGroupSerialization {
 
 struct MultilibSetSerialization {
   llvm::VersionTuple MultilibVersion;
-  std::vector<MultilibGroupSerialization> Groups;
-  std::vector<MultilibSerialization> Multilibs;
-  std::vector<MultilibSet::FlagMatcher> FlagMatchers;
+  SmallVector<MultilibGroupSerialization> Groups;
+  SmallVector<MultilibSerialization> Multilibs;
+  SmallVector<MultilibSet::FlagMatcher> FlagMatchers;
+  SmallVector<custom_flag::DeclarationPtr> CustomFlagDeclarations;
 };
 
 } // end anonymous namespace
 
+LLVM_YAML_IS_SEQUENCE_VECTOR(MultilibSerialization)
+LLVM_YAML_IS_SEQUENCE_VECTOR(MultilibGroupSerialization)
+LLVM_YAML_IS_SEQUENCE_VECTOR(MultilibSet::FlagMatcher)
+LLVM_YAML_IS_SEQUENCE_VECTOR(custom_flag::ValueDetail)
+LLVM_YAML_IS_SEQUENCE_VECTOR(custom_flag::DeclarationPtr)
+
 template <> struct llvm::yaml::MappingTraits<MultilibSerialization> {
   static void mapping(llvm::yaml::IO &io, MultilibSerialization &V) {
     io.mapOptional("Dir", V.Dir);
@@ -255,11 +263,63 @@ template <> struct llvm::yaml::MappingTraits<MultilibGroupSerialization> {
   }
 };
 
+template <>
+struct llvm::yaml::MappingContextTraits<custom_flag::ValueDetail,
+                                        llvm::SmallSet<std::string, 32>> {
+  static void mapping(llvm::yaml::IO &io, custom_flag::ValueDetail &V,
+                      llvm::SmallSet<std::string, 32> &) {
+    io.mapRequired("Name", V.Name);
+    io.mapOptional("MacroDefines", V.MacroDefines);
+  }
+  static std::string validate(IO &io, custom_flag::ValueDetail &V,
+                              llvm::SmallSet<std::string, 32> &NameSet) {
+    if (V.Name.empty())
+      return "custom flag value requires a name";
+    if (!NameSet.insert(V.Name).second)
+      return "duplicate custom flag value name: \"" + V.Name + "\"";
+    return {};
+  }
+};
+
+template <>
+struct llvm::yaml::MappingContextTraits<custom_flag::DeclarationPtr,
+                                        llvm::SmallSet<std::string, 32>> {
+  static void mapping(llvm::yaml::IO &io, custom_flag::DeclarationPtr &V,
+                      llvm::SmallSet<std::string, 32> &NameSet) {
+    assert(!V);
+    V = std::make_shared<custom_flag::Declaration>();
+    io.mapRequired("Name", V->Name);
+    io.mapRequired("Values", V->ValueList, NameSet);
+    std::string DefaultValueName;
+    io.mapRequired("Default", DefaultValueName);
+
+    for (auto [Idx, Value] : llvm::enumerate(V->ValueList)) {
+      Value.Decl = V;
+      if (Value.Name == DefaultValueName) {
+        assert(!V->DefaultValueIdx);
+        V->DefaultValueIdx = Idx;
+      }
+    }
+  }
+  static std::string validate(IO &io, custom_flag::DeclarationPtr &V,
+                              llvm::SmallSet<std::string, 32> &) {
+    if (V->Name.empty())
+      return "custom flag requires a name";
+    if (V->ValueList.empty())
+      return "custom flag must have at least one value";
+    if (!V->DefaultValueIdx)
+      return "custom flag must have a default value";
+    return {};
+  }
+};
+
 template <> struct llvm::yaml::MappingTraits<MultilibSetSerialization> {
   static void mapping(llvm::yaml::IO &io, MultilibSetSerialization &M) {
     io.mapRequired("MultilibVersion", M.MultilibVersion);
     io.mapRequired("Variants", M.Multilibs);
     io.mapOptional("Groups", M.Groups);
+    llvm::SmallSet<std::string, 32> NameSet;
+    io.mapOptionalWithContext("Flags", M.CustomFlagDeclarations, NameSet);
     io.mapOptional("Mappings", M.FlagMatchers);
   }
   static std::string validate(IO &io, MultilibSetSerialization &M) {
@@ -288,10 +348,6 @@ template <> struct llvm::yaml::MappingTraits<MultilibSetSerialization> {
   }
 };
 
-LLVM_YAML_IS_SEQUENCE_VECTOR(MultilibSerialization)
-LLVM_YAML_IS_SEQUENCE_VECTOR(MultilibGroupSerialization)
-LLVM_YAML_IS_SEQUENCE_VECTOR(MultilibSet::FlagMatcher)
-
 llvm::ErrorOr<MultilibSet>
 MultilibSet::parseYaml(llvm::MemoryBufferRef Input,
                        llvm::SourceMgr::DiagHandlerTy DiagHandler,
@@ -319,7 +375,8 @@ MultilibSet::parseYaml(llvm::MemoryBufferRef Input,
     }
   }
 
-  return MultilibSet(std::move(Multilibs), std::move(MS.FlagMatchers));
+  return MultilibSet(std::move(Multilibs), std::move(MS.FlagMatchers),
+                     std::move(MS.CustomFlagDeclarations));
 }
 
 LLVM_DUMP_METHOD void MultilibSet::dump() const {
diff --git a/clang/test/Driver/baremetal-multilib-custom-flags-parsing.yaml b/clang/test/Driver/baremetal-multilib-custom-flags-parsing.yaml
new file mode 100644
index 0000000000000..fe6a9a8d7f1ee
--- /dev/null
+++ b/clang/test/Driver/baremetal-multilib-custom-flags-parsing.yaml
@@ -0,0 +1,133 @@
+# RUN: split-file %s %t
+
+# RUN: %clang --target=arm-none-eabi --multi-lib-config=%t/multilib-without-macro-defines.yaml %s -### -o /dev/null 2>&1 \
+# RUN: | FileCheck %s
+# RUN: %clang --target=arm-none-eabi --multi-lib-config=%t/multilib-with-macro-defines.yaml %s -### -o /dev/null 2>&1 \
+# RUN: | FileCheck %s
+# CHECK-NOT: error:
+
+# RUN: %clang --target=arm-none-eabi --multi-lib-config=%t/missing-flag-name.yaml %s -### -o /dev/null 2>&1 \
+# RUN: | FileCheck %s --check-prefix=CHECK-MISSING-FLAG-NAME
+# CHECK-MISSING-FLAG-NAME: error: custom flag requires a name
+
+# RUN: %clang --target=arm-none-eabi --multi-lib-config=%t/missing-flag-values.yaml %s -### -o /dev/null 2>&1 \ +# RUN: | FileCheck %s --check-prefix=CHECK-MISSING-FLAG-VALUES +# CHECK-MISSING-FLAG-VALUES: error: custom flag must have at least one value + +# RUN: %clang --target=arm-none-eabi --multi-lib-config=%t/missing-flag-value-default.yaml %s -### -o /dev/null 2>&1 \ +# RUN: | FileCheck %s --check-prefix=CHECK-MISSING-FLAG-VALUE-DEFAULT +# CHECK-MISSING-FLAG-VALUE-DEFAULT: error: custom flag must have a default value + +# RUN: %clang --target=arm-none-eabi --multi-lib-config=%t/missing-flag-value-name.yaml %s -### -o /dev/null 2>&1 \ +# RUN: | FileCheck %s --check-prefix=CHECK-MISSING-FLAG-VALUE-NAME +# CHECK-MISSING-FLAG-VALUE-NAME: error: custom flag value requires a name + +# RUN: %clang --target=arm-none-eabi --multi-lib-config=%t/duplicate-flag-value-name.yaml %s -### -o /dev/null 2>&1 \ +# RUN: | FileCheck %s --check-prefix=CHECK-DUPLICATE-FLAG-VALUE-NAME +# CHECK-DUPLICATE-FLAG-VALUE-NAME: error: duplicate custom flag value name: "value-name" +# CHECK-DUPLICATE-FLAG-VALUE-NAME-NEXT: - Name: value-name + +#--- multilib-without-macro-defines.yaml +--- +MultilibVersion: 1.0 + +Variants: +- Dir: libc + Flags: [-fmultilib-flag=a] + +Flags: + - Name: flag + Values: + - Name: a + - Name: b + Default: a + +#--- multilib-with-macro-defines.yaml +--- +MultilibVersion: 1.0 + +Variants: +- Dir: libc + Flags: [-fmultilib-flag=a] + +Flags: + - Name: flag + Values: + - Name: a + MacroDefines: [FEATURE_A] + - Name: b + MacroDefines: [FEATURE_B] + Default: a + +#--- missing-flag-name.yaml +--- +MultilibVersion: 1.0 + +Variants: +- Dir: libc + Flags: [-fmultilib-flag=a] + +Flags: + - Values: + - Name: a + Default: a + +#--- missing-flag-values.yaml +--- +MultilibVersion: 1.0 + +Variants: +- Dir: libc + Flags: [-fmultilib-flag=a] + +Flags: + - Name: flag + Values: + Default: a + +#--- missing-flag-value-default.yaml +--- +MultilibVersion: 1.0 + +Variants: +- Dir: libc + Flags: [-fmultilib-flag=a] + +Flags: + - Name: flag + Values: + - Name: a + Default: + +#--- missing-flag-value-name.yaml +--- +MultilibVersion: 1.0 + +Variants: +- Dir: libc + Flags: [-fmultilib-flag=a] + +Flags: + - Name: flag + Values: + - Name: + Default: a + +#--- duplicate-flag-value-name.yaml +--- +MultilibVersion: 1.0 + +Variants: +- Dir: libc + Flags: [-fmultilib-flag=value-name] + +Flags: + - Name: a + Values: + - Name: value-name + - Name: value-a + Default: value-name + - Name: b + Values: + - Name: value-name + Default: value-name From da5f0dea0dca9aa7be0568999a4385d080952d85 Mon Sep 17 00:00:00 2001 From: Victor Campos Date: Mon, 13 Jan 2025 13:53:53 +0000 Subject: [PATCH 056/102] [Multilib] Add -fmultilib-flag command-line option (#110658) This patch is the second step to extend the current multilib system to support the selection of library variants which do not correspond to existing command-line options. Proposal can be found in https://discourse.llvm.org/t/rfc-multilib-custom-flags/81058 The multilib mechanism supports libraries that target code generation or language options such as --target, -mcpu, -mfpu, -mbranch-protection. However, some library variants are particular to features that do not correspond to any command-line options. Examples include variants for multithreading and semihosting. This work introduces a way to instruct the multilib system to consider these features in library selection. The driver must be informed about the multilib custom flags with a new command-line option. 
```
-fmultilib-flag=C
```

Where the grammar for C is:

```
C -> option

option -> multithreaded
        | no-multithreaded
        | io-none
        | io-semihosting
        | io-linux-syscalls
        | ...
```

There must be one `-fmultilib-flag=` instance for each flag specified:

```
-fmultilib-flag=multithreaded -fmultilib-flag=io-semihosting
```

Contradictory options are resolved by *last one wins*: when the same flag is
given more than once, the value specified last takes effect. These options are
to be used exclusively by the multilib mechanism in the Clang driver, and
hence are not forwarded to the compiler frontend.
---
 clang/include/clang/Driver/Options.td           |  2 ++
 clang/lib/Driver/ToolChain.cpp                  | 12 ++++++++++++
 clang/test/Driver/print-multi-selection-flags.c |  7 +++++++
 3 files changed, 21 insertions(+)

diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 80360216c9503..bbf5c0e7e7fd1 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -5756,6 +5756,8 @@ def print_multi_directory : Flag<["-", "--"], "print-multi-directory">;
 def print_multi_lib : Flag<["-", "--"], "print-multi-lib">;
 def print_multi_flags : Flag<["-", "--"], "print-multi-flags-experimental">,
   HelpText<"Print the flags used for selecting multilibs (experimental)">;
+def fmultilib_flag : Joined<["-", "--"], "fmultilib-flag=">,
+  Visibility<[ClangOption]>;
 def print_multi_os_directory : Flag<["-", "--"], "print-multi-os-directory">,
   Flags<[Unsupported]>;
 def print_target_triple : Flag<["-", "--"], "print-target-triple">,
diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp
index 2b4df64f2789d..acf9d264d631b 100644
--- a/clang/lib/Driver/ToolChain.cpp
+++ b/clang/lib/Driver/ToolChain.cpp
@@ -196,6 +196,15 @@ bool ToolChain::defaultToIEEELongDouble() const {
   return PPC_LINUX_DEFAULT_IEEELONGDOUBLE && getTriple().isOSLinux();
 }
 
+static void processMultilibCustomFlags(Multilib::flags_list &List,
+                                       const llvm::opt::ArgList &Args) {
+  for (const Arg *MultilibFlagArg :
+       Args.filtered(options::OPT_fmultilib_flag)) {
+    List.push_back(MultilibFlagArg->getAsString(Args));
+    MultilibFlagArg->claim();
+  }
+}
+
 static void getAArch64MultilibFlags(const Driver &D,
                                     const llvm::Triple &Triple,
                                     const llvm::opt::ArgList &Args,
@@ -246,6 +255,8 @@ static void getAArch64MultilibFlags(const Driver &D,
   if (ABIArg) {
     Result.push_back(ABIArg->getAsString(Args));
   }
+
+  processMultilibCustomFlags(Result, Args);
 }
 
 static void getARMMultilibFlags(const Driver &D,
@@ -313,6 +324,7 @@ static void getARMMultilibFlags(const Driver &D,
     if (Endian->getOption().matches(options::OPT_mbig_endian))
       Result.push_back(Endian->getAsString(Args));
   }
+  processMultilibCustomFlags(Result, Args);
 }
 
 static void getRISCVMultilibFlags(const Driver &D, const llvm::Triple &Triple,
diff --git a/clang/test/Driver/print-multi-selection-flags.c b/clang/test/Driver/print-multi-selection-flags.c
index 5bf6dca5096a7..cf9522aa06852 100644
--- a/clang/test/Driver/print-multi-selection-flags.c
+++ b/clang/test/Driver/print-multi-selection-flags.c
@@ -90,3 +90,10 @@
 // CHECK-RV32E-ORDER: --target=riscv32-unknown-none-elf
 // CHECK-RV32E-ORDER: -mabi=ilp32e
 // CHECK-RV32E-ORDER: -march=rv32e{{[0-9]+p[0-9]+}}_c{{[0-9]+p[0-9]+}}_zicsr{{[0-9]+p[0-9]+}}
+
+// RUN: %clang -print-multi-flags-experimental --target=armv8m.main-none-eabi -fmultilib-flag=foo -fmultilib-flag=bar | FileCheck --check-prefixes=CHECK-MULTILIB-CUSTOM-FLAG,CHECK-ARM-MULTILIB-CUSTOM-FLAG %s
+// RUN: %clang -print-multi-flags-experimental --target=aarch64-none-eabi -fmultilib-flag=foo -fmultilib-flag=bar | FileCheck
--check-prefixes=CHECK-MULTILIB-CUSTOM-FLAG,CHECK-AARCH64-MULTILIB-CUSTOM-FLAG %s +// CHECK-ARM-MULTILIB-CUSTOM-FLAG: --target=thumbv8m.main-unknown-none-eabi +// CHECK-AARCH64-MULTILIB-CUSTOM-FLAG: --target=aarch64-unknown-none-eabi +// CHECK-MULTILIB-CUSTOM-FLAG-DAG: -fmultilib-flag=foo +// CHECK-MULTILIB-CUSTOM-FLAG-DAG: -fmultilib-flag=bar From 3958594e1d11a58a23a18d7c9c2d4f43d9e46cee Mon Sep 17 00:00:00 2001 From: Hui Date: Mon, 13 Jan 2025 14:09:29 +0000 Subject: [PATCH 057/102] [libc++] Replace stable_sort with sort in flat_map (#121431) Fixes #120788 --- libcxx/include/__flat_map/flat_map.h | 8 +-- libcxx/include/module.modulemap | 2 + .../flat.map/container_stability.pass.cpp | 68 ------------------- .../iter_iter_stability.pass.cpp | 66 ------------------ .../insert_range_stability.pass.cpp | 63 ----------------- 5 files changed, 5 insertions(+), 202 deletions(-) delete mode 100644 libcxx/test/libcxx/containers/containers.adaptors/flat.map/container_stability.pass.cpp delete mode 100644 libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/iter_iter_stability.pass.cpp delete mode 100644 libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/insert_range_stability.pass.cpp diff --git a/libcxx/include/__flat_map/flat_map.h b/libcxx/include/__flat_map/flat_map.h index 9fe84250b1204..ab53b7a285ca4 100644 --- a/libcxx/include/__flat_map/flat_map.h +++ b/libcxx/include/__flat_map/flat_map.h @@ -17,7 +17,7 @@ #include <__algorithm/ranges_inplace_merge.h> #include <__algorithm/ranges_lower_bound.h> #include <__algorithm/ranges_partition_point.h> -#include <__algorithm/ranges_stable_sort.h> +#include <__algorithm/ranges_sort.h> #include <__algorithm/ranges_unique.h> #include <__algorithm/ranges_upper_bound.h> #include <__algorithm/remove_if.h> @@ -853,9 +853,7 @@ class flat_map { // is no invariant state to preserve _LIBCPP_HIDE_FROM_ABI void __sort_and_unique() { auto __zv = ranges::views::zip(__containers_.keys, __containers_.values); - // To be consistent with std::map's behaviour, we use stable_sort instead of sort. - // As a result, if there are duplicated keys, the first value in the original order will be taken. 
- ranges::stable_sort(__zv, __compare_, [](const auto& __p) -> decltype(auto) { return std::get<0>(__p); }); + ranges::sort(__zv, __compare_, [](const auto& __p) -> decltype(auto) { return std::get<0>(__p); }); auto __dup_start = ranges::unique(__zv, __key_equiv(__compare_)).begin(); auto __dist = ranges::distance(__zv.begin(), __dup_start); __containers_.keys.erase(__containers_.keys.begin() + __dist, __containers_.keys.end()); @@ -886,7 +884,7 @@ class flat_map { return __compare_(std::get<0>(__p1), std::get<0>(__p2)); }; if constexpr (!_WasSorted) { - ranges::stable_sort(__zv.begin() + __append_start_offset, __end, __compare_key); + ranges::sort(__zv.begin() + __append_start_offset, __end, __compare_key); } else { _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT( __is_sorted_and_unique(__containers_.keys | ranges::views::drop(__append_start_offset)), diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap index 07ab5649ae45c..6800f8b562650 100644 --- a/libcxx/include/module.modulemap +++ b/libcxx/include/module.modulemap @@ -760,6 +760,8 @@ module std [system] { module ranges_sort { header "__algorithm/ranges_sort.h" export std.functional.ranges_operations + export std.algorithm.sort + export std.algorithm.make_projected } module ranges_stable_partition { header "__algorithm/ranges_stable_partition.h" diff --git a/libcxx/test/libcxx/containers/containers.adaptors/flat.map/container_stability.pass.cpp b/libcxx/test/libcxx/containers/containers.adaptors/flat.map/container_stability.pass.cpp deleted file mode 100644 index 0d90c3250061f..0000000000000 --- a/libcxx/test/libcxx/containers/containers.adaptors/flat.map/container_stability.pass.cpp +++ /dev/null @@ -1,68 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 - -// - -// flat_map(key_container_type key_cont, mapped_container_type mapped_cont); -// -// libc++ uses stable_sort to ensure that flat_map's behavior matches map's, -// in terms of which duplicate items are kept. -// This tests a conforming extension. 
- -#include -#include -#include -#include -#include -#include -#include - -#include "test_macros.h" - -struct Mod256 { - bool operator()(int x, int y) const { return (x % 256) < (y % 256); } -}; - -int main(int, char**) { - std::mt19937 randomness; - std::vector values; - std::vector> pairs; - for (int i = 0; i < 200; ++i) { - uint16_t r = randomness(); - values.push_back(r); - pairs.emplace_back(r, r); - } - - { - std::map m(pairs.begin(), pairs.end()); - std::flat_map fm(values, values); - assert(fm.size() == m.size()); - LIBCPP_ASSERT(std::ranges::equal(fm, m)); - } - { - std::map m(pairs.begin(), pairs.end()); - std::flat_map fm(values, values, Mod256()); - assert(fm.size() == m.size()); - LIBCPP_ASSERT(std::ranges::equal(fm, m)); - } - { - std::map m(pairs.begin(), pairs.end()); - std::flat_map fm(values, values, std::allocator()); - assert(fm.size() == m.size()); - LIBCPP_ASSERT(std::ranges::equal(fm, m)); - } - { - std::map m(pairs.begin(), pairs.end()); - std::flat_map fm(values, values, Mod256(), std::allocator()); - assert(fm.size() == m.size()); - LIBCPP_ASSERT(std::ranges::equal(fm, m)); - } - return 0; -} diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/iter_iter_stability.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/iter_iter_stability.pass.cpp deleted file mode 100644 index 14189840ce660..0000000000000 --- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/iter_iter_stability.pass.cpp +++ /dev/null @@ -1,66 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 - -// - -// template -// flat_map(InputIterator first, InputIterator last, const key_compare& comp = key_compare()) -// -// libc++ uses stable_sort to ensure that flat_map's behavior matches map's, -// in terms of which duplicate items are kept. -// This tests a conforming extension. 
- -#include -#include -#include -#include -#include -#include -#include - -#include "test_macros.h" - -struct Mod256 { - bool operator()(int x, int y) const { return (x % 256) < (y % 256); } -}; - -int main(int, char**) { - std::mt19937 randomness; - std::pair pairs[200]; - for (auto& pair : pairs) { - pair = {uint16_t(randomness()), uint16_t(randomness())}; - } - - { - std::map m(pairs, pairs + 200); - std::flat_map fm(pairs, pairs + 200); - assert(fm.size() == m.size()); - LIBCPP_ASSERT(std::ranges::equal(fm, m)); - } - { - std::map m(pairs, pairs + 200, std::allocator()); - std::flat_map fm(pairs, pairs + 200, std::allocator()); - assert(fm.size() == m.size()); - LIBCPP_ASSERT(std::ranges::equal(fm, m)); - } - { - std::map m(pairs, pairs + 200, Mod256()); - std::flat_map fm(pairs, pairs + 200, Mod256()); - assert(fm.size() == m.size()); - LIBCPP_ASSERT(std::ranges::equal(fm, m)); - } - { - std::map m(pairs, pairs + 200, Mod256(), std::allocator()); - std::flat_map fm(pairs, pairs + 200, Mod256(), std::allocator()); - assert(fm.size() == m.size()); - LIBCPP_ASSERT(std::ranges::equal(fm, m)); - } - return 0; -} diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/insert_range_stability.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/insert_range_stability.pass.cpp deleted file mode 100644 index fabcb1d216a78..0000000000000 --- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.modifiers/insert_range_stability.pass.cpp +++ /dev/null @@ -1,63 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 - -// - -// template R> -// void insert_range(R&& rg); -// -// libc++ uses stable_sort to ensure that flat_map's behavior matches map's, -// in terms of which duplicate items are kept. -// This tests a conforming extension. - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "test_macros.h" - -struct Mod256 { - bool operator()(int x, int y) const { return (x % 256) < (y % 256); } -}; - -int main(int, char**) { - { - std::mt19937 randomness; - std::pair pairs[400]; - for (int i = 0; i < 400; ++i) { - uint16_t r = randomness(); - pairs[i] = {r, r}; - } - - std::map m(pairs, pairs + 200); - std::flat_map fm(std::sorted_unique, m.begin(), m.end()); - assert(std::ranges::equal(fm, m)); - - fm.insert_range(std::views::counted(pairs + 200, 200)); - m.insert(pairs + 200, pairs + 400); - assert(fm.size() == m.size()); - LIBCPP_ASSERT(std::ranges::equal(fm, m)); - } - - { - std::vector> v{{1, 2}, {1, 3}}; - std::flat_map m; - m.insert_range(v); - assert(m.size() == 1); - LIBCPP_ASSERT(m[1] == 2); - } - return 0; -} From f86786e76f28898a8403aa9e633091779b661e57 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Mon, 13 Jan 2025 09:10:36 -0500 Subject: [PATCH 058/102] [libc++] Pass type information down to __libcpp_allocate (#118837) Currently, places where we call __libcpp_allocate must drop type information on the ground even when they actually have such information available. 
That is unfortunate since some toolchains and system allocators are able to provide improved security when they know what type is being allocated. This is the purpose of http://wg21.link/p2719, where we introduce a new variant of `operator new` which takes a type in its interface. A different but related issue is that `std::allocator` does not honor any in-class `T::operator new` since it is specified to call the global `::operator new` instead. This patch closes the gap to make it trivial for implementations that provide typed memory allocators to actually benefit from that information in more contexts, and also makes libc++ forward-compatible with future proposals that would fix the existing defects in `std::allocator`. It also makes the internal allocation API higher level by operating on objects instead of operating on bytes of memory. Since this is a widely-used function and making this a template could have an impact on debug info sizes, I tried minimizing the number of templated layers by removing `__do_deallocate_handle_size`, which was easy to replace with a macro (and IMO this leads to cleaner code). --- libcxx/include/CMakeLists.txt | 2 +- libcxx/include/__functional/function.h | 19 ++++-- libcxx/include/__memory/allocator.h | 4 +- .../include/__memory/builtin_new_allocator.h | 67 ------------------- .../__memory/unique_temporary_buffer.h | 2 +- libcxx/include/__new/allocate.h | 44 +++++++----- .../include/__string/constexpr_c_functions.h | 5 +- libcxx/include/__utility/element_count.h | 27 ++++++++ libcxx/include/__utility/small_buffer.h | 4 +- libcxx/include/module.modulemap | 19 ++++-- libcxx/src/memory_resource.cpp | 10 +-- .../support.dynamic/libcpp_deallocate.sh.cpp | 37 +++++----- 12 files changed, 112 insertions(+), 128 deletions(-) delete mode 100644 libcxx/include/__memory/builtin_new_allocator.h create mode 100644 libcxx/include/__utility/element_count.h diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt index f7721b1047b81..e152383a329fe 100644 --- a/libcxx/include/CMakeLists.txt +++ b/libcxx/include/CMakeLists.txt @@ -547,7 +547,6 @@ set(files __memory/array_cookie.h __memory/assume_aligned.h __memory/auto_ptr.h - __memory/builtin_new_allocator.h __memory/compressed_pair.h __memory/concepts.h __memory/construct_at.h @@ -880,6 +879,7 @@ set(files __utility/cmp.h __utility/convert_to_integral.h __utility/declval.h + __utility/element_count.h __utility/empty.h __utility/exception_guard.h __utility/exchange.h diff --git a/libcxx/include/__functional/function.h b/libcxx/include/__functional/function.h index 2924f6cad6578..08cb731be9725 100644 --- a/libcxx/include/__functional/function.h +++ b/libcxx/include/__functional/function.h @@ -22,7 +22,6 @@ #include <__memory/allocator.h> #include <__memory/allocator_destructor.h> #include <__memory/allocator_traits.h> -#include <__memory/builtin_new_allocator.h> #include <__memory/compressed_pair.h> #include <__memory/unique_ptr.h> #include <__type_traits/aligned_storage.h> @@ -193,6 +192,13 @@ class __alloc_func<_Fp, _Ap, _Rp(_ArgTypes...)> { } }; +template +struct __deallocating_deleter { + _LIBCPP_HIDE_FROM_ABI void operator()(void* __p) const { + std::__libcpp_deallocate<_Tp>(static_cast<_Tp*>(__p), __element_count(1)); + } +}; + template class __default_alloc_func<_Fp, _Rp(_ArgTypes...)> { _Fp __f_; @@ -212,8 +218,9 @@ class __default_alloc_func<_Fp, _Rp(_ArgTypes...)> { } _LIBCPP_HIDE_FROM_ABI __default_alloc_func* __clone() const { - __builtin_new_allocator::__holder_t __hold = 
__builtin_new_allocator::__allocate_type<__default_alloc_func>(1); - __default_alloc_func* __res = ::new ((void*)__hold.get()) __default_alloc_func(__f_); + using _Self = __default_alloc_func; + unique_ptr<_Self, __deallocating_deleter<_Self>> __hold(std::__libcpp_allocate<_Self>(__element_count(1))); + _Self* __res = ::new ((void*)__hold.get()) _Self(__f_); (void)__hold.release(); return __res; } @@ -222,7 +229,7 @@ class __default_alloc_func<_Fp, _Rp(_ArgTypes...)> { _LIBCPP_HIDE_FROM_ABI static void __destroy_and_delete(__default_alloc_func* __f) { __f->destroy(); - __builtin_new_allocator::__deallocate_type<__default_alloc_func>(__f, 1); + std::__libcpp_deallocate<__default_alloc_func>(__f, __element_count(1)); } }; @@ -668,8 +675,8 @@ class __policy_func<_Rp(_ArgTypes...)> { if (__use_small_storage<_Fun>()) { ::new ((void*)&__buf_.__small) _Fun(std::move(__f)); } else { - __builtin_new_allocator::__holder_t __hold = __builtin_new_allocator::__allocate_type<_Fun>(1); - __buf_.__large = ::new ((void*)__hold.get()) _Fun(std::move(__f)); + unique_ptr<_Fun, __deallocating_deleter<_Fun>> __hold(std::__libcpp_allocate<_Fun>(__element_count(1))); + __buf_.__large = ::new ((void*)__hold.get()) _Fun(std::move(__f)); (void)__hold.release(); } } diff --git a/libcxx/include/__memory/allocator.h b/libcxx/include/__memory/allocator.h index a7066885a978a..191a59e6614a0 100644 --- a/libcxx/include/__memory/allocator.h +++ b/libcxx/include/__memory/allocator.h @@ -102,7 +102,7 @@ class _LIBCPP_TEMPLATE_VIS allocator : private __non_trivial_if::v if (__libcpp_is_constant_evaluated()) { return static_cast<_Tp*>(::operator new(__n * sizeof(_Tp))); } else { - return static_cast<_Tp*>(std::__libcpp_allocate(__n * sizeof(_Tp), _LIBCPP_ALIGNOF(_Tp))); + return std::__libcpp_allocate<_Tp>(__element_count(__n)); } } @@ -117,7 +117,7 @@ class _LIBCPP_TEMPLATE_VIS allocator : private __non_trivial_if::v if (__libcpp_is_constant_evaluated()) { ::operator delete(__p); } else { - std::__libcpp_deallocate((void*)__p, __n * sizeof(_Tp), _LIBCPP_ALIGNOF(_Tp)); + std::__libcpp_deallocate<_Tp>(__p, __element_count(__n)); } } diff --git a/libcxx/include/__memory/builtin_new_allocator.h b/libcxx/include/__memory/builtin_new_allocator.h deleted file mode 100644 index cde1a6025a9a7..0000000000000 --- a/libcxx/include/__memory/builtin_new_allocator.h +++ /dev/null @@ -1,67 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef _LIBCPP___MEMORY_BUILTIN_NEW_ALLOCATOR_H -#define _LIBCPP___MEMORY_BUILTIN_NEW_ALLOCATOR_H - -#include <__config> -#include <__cstddef/size_t.h> -#include <__memory/unique_ptr.h> -#include <__new/allocate.h> - -#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) -# pragma GCC system_header -#endif - -_LIBCPP_BEGIN_NAMESPACE_STD - -// __builtin_new_allocator -- A non-templated helper for allocating and -// deallocating memory using __builtin_operator_new and -// __builtin_operator_delete. It should be used in preference to -// `std::allocator` to avoid additional instantiations. 
-struct __builtin_new_allocator { - struct __builtin_new_deleter { - typedef void* pointer_type; - - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR explicit __builtin_new_deleter(size_t __size, size_t __align) - : __size_(__size), __align_(__align) {} - - _LIBCPP_HIDE_FROM_ABI void operator()(void* __p) const _NOEXCEPT { - std::__libcpp_deallocate(__p, __size_, __align_); - } - - private: - size_t __size_; - size_t __align_; - }; - - typedef unique_ptr __holder_t; - - _LIBCPP_HIDE_FROM_ABI static __holder_t __allocate_bytes(size_t __s, size_t __align) { - return __holder_t(std::__libcpp_allocate(__s, __align), __builtin_new_deleter(__s, __align)); - } - - _LIBCPP_HIDE_FROM_ABI static void __deallocate_bytes(void* __p, size_t __s, size_t __align) _NOEXCEPT { - std::__libcpp_deallocate(__p, __s, __align); - } - - template - _LIBCPP_NODEBUG _LIBCPP_ALWAYS_INLINE _LIBCPP_HIDE_FROM_ABI static __holder_t __allocate_type(size_t __n) { - return __allocate_bytes(__n * sizeof(_Tp), _LIBCPP_ALIGNOF(_Tp)); - } - - template - _LIBCPP_NODEBUG _LIBCPP_ALWAYS_INLINE _LIBCPP_HIDE_FROM_ABI static void - __deallocate_type(void* __p, size_t __n) _NOEXCEPT { - __deallocate_bytes(__p, __n * sizeof(_Tp), _LIBCPP_ALIGNOF(_Tp)); - } -}; - -_LIBCPP_END_NAMESPACE_STD - -#endif // _LIBCPP___MEMORY_BUILTIN_NEW_ALLOCATOR_H diff --git a/libcxx/include/__memory/unique_temporary_buffer.h b/libcxx/include/__memory/unique_temporary_buffer.h index dea7fa8e18728..32a3f0f081c00 100644 --- a/libcxx/include/__memory/unique_temporary_buffer.h +++ b/libcxx/include/__memory/unique_temporary_buffer.h @@ -40,7 +40,7 @@ struct __temporary_buffer_deleter { return; } - std::__libcpp_deallocate_unsized((void*)__ptr, _LIBCPP_ALIGNOF(_Tp)); + std::__libcpp_deallocate_unsized<_Tp>(__ptr); } }; diff --git a/libcxx/include/__new/allocate.h b/libcxx/include/__new/allocate.h index 71dffc1776eff..a64663c09fa35 100644 --- a/libcxx/include/__new/allocate.h +++ b/libcxx/include/__new/allocate.h @@ -14,6 +14,8 @@ #include <__cstddef/size_t.h> #include <__new/align_val_t.h> #include <__new/global_new_delete.h> // for _LIBCPP_HAS_SIZED_DEALLOCATION +#include <__type_traits/type_identity.h> +#include <__utility/element_count.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header @@ -47,52 +49,58 @@ _LIBCPP_HIDE_FROM_ABI void __libcpp_operator_delete(_Args... __args) _NOEXCEPT { #endif } -inline _LIBCPP_HIDE_FROM_ABI void* __libcpp_allocate(size_t __size, size_t __align) { +template +inline _LIBCPP_HIDE_FROM_ABI _Tp* __libcpp_allocate(__element_count __n, size_t __align = _LIBCPP_ALIGNOF(_Tp)) { + size_t __size = static_cast(__n) * sizeof(_Tp); #if _LIBCPP_HAS_ALIGNED_ALLOCATION if (__is_overaligned_for_new(__align)) { const align_val_t __align_val = static_cast(__align); - return __libcpp_operator_new(__size, __align_val); + return static_cast<_Tp*>(std::__libcpp_operator_new(__size, __align_val)); } #endif (void)__align; - return __libcpp_operator_new(__size); + return static_cast<_Tp*>(std::__libcpp_operator_new(__size)); } -template -_LIBCPP_HIDE_FROM_ABI void __do_deallocate_handle_size(void* __ptr, size_t __size, _Args... __args) _NOEXCEPT { -#if !_LIBCPP_HAS_SIZED_DEALLOCATION - (void)__size; - return std::__libcpp_operator_delete(__ptr, __args...); +#if _LIBCPP_HAS_SIZED_DEALLOCATION +# define _LIBCPP_ONLY_IF_SIZED_DEALLOCATION(...) __VA_ARGS__ #else - return std::__libcpp_operator_delete(__ptr, __size, __args...); +# define _LIBCPP_ONLY_IF_SIZED_DEALLOCATION(...) 
/* nothing */ #endif -} -inline _LIBCPP_HIDE_FROM_ABI void __libcpp_deallocate(void* __ptr, size_t __size, size_t __align) _NOEXCEPT { +template +inline _LIBCPP_HIDE_FROM_ABI void __libcpp_deallocate( + __type_identity_t<_Tp>* __ptr, __element_count __n, size_t __align = _LIBCPP_ALIGNOF(_Tp)) _NOEXCEPT { + size_t __size = static_cast(__n) * sizeof(_Tp); + (void)__size; #if !_LIBCPP_HAS_ALIGNED_ALLOCATION (void)__align; - return __do_deallocate_handle_size(__ptr, __size); + return std::__libcpp_operator_delete(__ptr _LIBCPP_ONLY_IF_SIZED_DEALLOCATION(, __size)); #else if (__is_overaligned_for_new(__align)) { const align_val_t __align_val = static_cast(__align); - return __do_deallocate_handle_size(__ptr, __size, __align_val); + return std::__libcpp_operator_delete(__ptr _LIBCPP_ONLY_IF_SIZED_DEALLOCATION(, __size), __align_val); } else { - return __do_deallocate_handle_size(__ptr, __size); + return std::__libcpp_operator_delete(__ptr _LIBCPP_ONLY_IF_SIZED_DEALLOCATION(, __size)); } #endif } -inline _LIBCPP_HIDE_FROM_ABI void __libcpp_deallocate_unsized(void* __ptr, size_t __align) _NOEXCEPT { +#undef _LIBCPP_ONLY_IF_SIZED_DEALLOCATION + +template +inline _LIBCPP_HIDE_FROM_ABI void +__libcpp_deallocate_unsized(__type_identity_t<_Tp>* __ptr, size_t __align = _LIBCPP_ALIGNOF(_Tp)) _NOEXCEPT { #if !_LIBCPP_HAS_ALIGNED_ALLOCATION (void)__align; - return __libcpp_operator_delete(__ptr); + return std::__libcpp_operator_delete(__ptr); #else if (__is_overaligned_for_new(__align)) { const align_val_t __align_val = static_cast(__align); - return __libcpp_operator_delete(__ptr, __align_val); + return std::__libcpp_operator_delete(__ptr, __align_val); } else { - return __libcpp_operator_delete(__ptr); + return std::__libcpp_operator_delete(__ptr); } #endif } diff --git a/libcxx/include/__string/constexpr_c_functions.h b/libcxx/include/__string/constexpr_c_functions.h index f50eac34a1c05..0bc128b68b579 100644 --- a/libcxx/include/__string/constexpr_c_functions.h +++ b/libcxx/include/__string/constexpr_c_functions.h @@ -25,6 +25,7 @@ #include <__type_traits/is_trivially_copyable.h> #include <__type_traits/is_trivially_lexicographically_comparable.h> #include <__type_traits/remove_cv.h> +#include <__utility/element_count.h> #include <__utility/is_pointer_in_range.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) @@ -33,10 +34,6 @@ _LIBCPP_BEGIN_NAMESPACE_STD -// Type used to encode that a function takes an integer that represents a number -// of elements as opposed to a number of bytes. -enum class __element_count : size_t {}; - template inline const bool __is_char_type = false; diff --git a/libcxx/include/__utility/element_count.h b/libcxx/include/__utility/element_count.h new file mode 100644 index 0000000000000..82b05a7bde483 --- /dev/null +++ b/libcxx/include/__utility/element_count.h @@ -0,0 +1,27 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCPP___UTILITY_ELEMENT_COUNT_H +#define _LIBCPP___UTILITY_ELEMENT_COUNT_H + +#include <__config> +#include <__cstddef/size_t.h> + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +_LIBCPP_BEGIN_NAMESPACE_STD + +// Type used to encode that a function takes an integer that represents a number +// of elements as opposed to a number of bytes. +enum class __element_count : size_t {}; + +_LIBCPP_END_NAMESPACE_STD + +#endif // _LIBCPP___UTILITY_ELEMENT_COUNT_H diff --git a/libcxx/include/__utility/small_buffer.h b/libcxx/include/__utility/small_buffer.h index ff6e7e76f14f5..132a57f0fefab 100644 --- a/libcxx/include/__utility/small_buffer.h +++ b/libcxx/include/__utility/small_buffer.h @@ -68,7 +68,7 @@ class __small_buffer { if constexpr (__fits_in_buffer<_Stored>) { return std::launder(reinterpret_cast<_Stored*>(__buffer_)); } else { - byte* __allocation = static_cast(std::__libcpp_allocate(sizeof(_Stored), alignof(_Stored))); + byte* __allocation = reinterpret_cast(std::__libcpp_allocate<_Stored>(__element_count(1))); std::construct_at(reinterpret_cast(__buffer_), __allocation); return std::launder(reinterpret_cast<_Stored*>(__allocation)); } @@ -77,7 +77,7 @@ class __small_buffer { template _LIBCPP_HIDE_FROM_ABI void __dealloc() noexcept { if constexpr (!__fits_in_buffer<_Stored>) - std::__libcpp_deallocate(*reinterpret_cast(__buffer_), sizeof(_Stored), alignof(_Stored)); + std::__libcpp_deallocate<_Stored>(__get<_Stored>(), __element_count(1)); } template diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap index 6800f8b562650..e3204820b5c25 100644 --- a/libcxx/include/module.modulemap +++ b/libcxx/include/module.modulemap @@ -1527,14 +1527,16 @@ module std [system] { module aligned_alloc { header "__memory/aligned_alloc.h" } module allocate_at_least { header "__memory/allocate_at_least.h" } module allocation_guard { header "__memory/allocation_guard.h" } - module allocator { header "__memory/allocator.h" } + module allocator { + header "__memory/allocator.h" + export * // TODO: Workaround for https://github.com/llvm/llvm-project/issues/120108 + } module allocator_arg_t { header "__memory/allocator_arg_t.h" } module allocator_destructor { header "__memory/allocator_destructor.h" } module allocator_traits { header "__memory/allocator_traits.h" } module array_cookie { header "__memory/array_cookie.h" } module assume_aligned { header "__memory/assume_aligned.h" } module auto_ptr { header "__memory/auto_ptr.h" } - module builtin_new_allocator { header "__memory/builtin_new_allocator.h" } module compressed_pair { header "__memory/compressed_pair.h" } module concepts { header "__memory/concepts.h" } module construct_at { header "__memory/construct_at.h" } @@ -1569,6 +1571,7 @@ module std [system] { header "__memory/unique_temporary_buffer.h" export std.memory.unique_ptr export std_core.type_traits.is_constant_evaluated + export * // TODO: Workaround for https://github.com/llvm/llvm-project/issues/120108 } module uses_allocator { header "__memory/uses_allocator.h" } module uses_allocator_construction { header "__memory/uses_allocator_construction.h" } @@ -1604,7 +1607,11 @@ module std [system] { module new { header "new" module align_val_t { header "__new/align_val_t.h" } - module allocate { header "__new/allocate.h" } + module allocate { + header "__new/allocate.h" + export 
std.utility.element_count // used as part of the API + export * // TODO: Workaround for https://github.com/llvm/llvm-project/issues/120108 + } module destroying_delete_t { header "__new/destroying_delete_t.h" } module exceptions { header "__new/exceptions.h" } module global_new_delete { @@ -1911,7 +1918,10 @@ module std [system] { module string { module char_traits { header "__string/char_traits.h" } - module constexpr_c_functions { header "__string/constexpr_c_functions.h" } + module constexpr_c_functions { + header "__string/constexpr_c_functions.h" + export std.utility.element_count // used as part of the constexpr C function's API + } module extern_template_lists { header "__string/extern_template_lists.h" } module fwd { header "__fwd/string.h" } @@ -2021,6 +2031,7 @@ module std [system] { } module cmp { header "__utility/cmp.h" } module convert_to_integral { header "__utility/convert_to_integral.h" } + module element_count { header "__utility/element_count.h" } module exception_guard { header "__utility/exception_guard.h" } module exchange { header "__utility/exchange.h" } module forward_like { header "__utility/forward_like.h" } diff --git a/libcxx/src/memory_resource.cpp b/libcxx/src/memory_resource.cpp index e182e5aa66ef9..e1a9e1a8fac49 100644 --- a/libcxx/src/memory_resource.cpp +++ b/libcxx/src/memory_resource.cpp @@ -41,20 +41,22 @@ static bool is_aligned_to(void* ptr, size_t align) { class _LIBCPP_EXPORTED_FROM_ABI __new_delete_memory_resource_imp : public memory_resource { void* do_allocate(size_t bytes, size_t align) override { #if _LIBCPP_HAS_ALIGNED_ALLOCATION - return std::__libcpp_allocate(bytes, align); + return std::__libcpp_allocate(__element_count(bytes), align); #else if (bytes == 0) bytes = 1; - void* result = std::__libcpp_allocate(bytes, align); + std::byte* result = std::__libcpp_allocate(__element_count(bytes), align); if (!is_aligned_to(result, align)) { - std::__libcpp_deallocate(result, bytes, align); + std::__libcpp_deallocate(result, __element_count(bytes), align); __throw_bad_alloc(); } return result; #endif } - void do_deallocate(void* p, size_t bytes, size_t align) override { std::__libcpp_deallocate(p, bytes, align); } + void do_deallocate(void* p, size_t bytes, size_t align) override { + std::__libcpp_deallocate(static_cast(p), __element_count(bytes), align); + } bool do_is_equal(const memory_resource& other) const noexcept override { return &other == this; } }; diff --git a/libcxx/test/libcxx/language.support/support.dynamic/libcpp_deallocate.sh.cpp b/libcxx/test/libcxx/language.support/support.dynamic/libcpp_deallocate.sh.cpp index b283c8aa06f0c..7ead65caf9fda 100644 --- a/libcxx/test/libcxx/language.support/support.dynamic/libcpp_deallocate.sh.cpp +++ b/libcxx/test/libcxx/language.support/support.dynamic/libcpp_deallocate.sh.cpp @@ -96,34 +96,34 @@ struct alloc_stats { }; alloc_stats stats; -void operator delete(void* p)TEST_NOEXCEPT { +void operator delete(void* p) TEST_NOEXCEPT { ::free(p); stats.plain_called++; stats.last_size = stats.last_align = -1; } #ifndef NO_SIZE -void operator delete(void* p, std::size_t n)TEST_NOEXCEPT { +void operator delete(void* p, std::size_t n) TEST_NOEXCEPT { ::free(p); stats.sized_called++; - stats.last_size = n; + stats.last_size = n; stats.last_align = -1; } #endif #ifndef NO_ALIGN -void operator delete(void* p, std::align_val_t a)TEST_NOEXCEPT { +void operator delete(void* p, std::align_val_t a) TEST_NOEXCEPT { std::__libcpp_aligned_free(p); stats.aligned_called++; stats.last_align = static_cast(a); - 
stats.last_size = -1;
+ stats.last_size = -1;
 }

-void operator delete(void* p, std::size_t n, std::align_val_t a)TEST_NOEXCEPT {
+void operator delete(void* p, std::size_t n, std::align_val_t a) TEST_NOEXCEPT {
 std::__libcpp_aligned_free(p);
 stats.aligned_sized_called++;
 stats.last_align = static_cast(a);
- stats.last_size = n;
+ stats.last_size = n;
 }
 #endif

@@ -135,45 +135,45 @@ void test_libcpp_dealloc() {
 std::size_t over_align_val = TEST_ALIGNOF(std::max_align_t) * 2;
 #endif
 std::size_t under_align_val = TEST_ALIGNOF(int);
- std::size_t with_size_val = 2;

 {
- std::__libcpp_deallocate_unsized(p, under_align_val);
+ std::__libcpp_deallocate_unsized(static_cast(p), under_align_val);
 assert(stats.expect_plain());
 }
 stats.reset();

#if defined(NO_SIZE) && defined(NO_ALIGN)
 {
- std::__libcpp_deallocate(p, with_size_val, over_align_val);
+ std::__libcpp_deallocate(static_cast(p), std::__element_count(with_size_val), over_align_val);
 assert(stats.expect_plain());
 }
 stats.reset();
#elif defined(NO_SIZE)
 {
- std::__libcpp_deallocate(p, with_size_val, over_align_val);
+ std::__libcpp_deallocate(static_cast(p), std::__element_count(with_size_val), over_align_val);
 assert(stats.expect_align(over_align_val));
 }
 stats.reset();
#elif defined(NO_ALIGN)
 {
- std::__libcpp_deallocate(p, with_size_val, over_align_val);
+ std::__libcpp_deallocate(static_cast(p), std::__element_count(with_size_val), over_align_val);
 assert(stats.expect_size(with_size_val));
 }
 stats.reset();
#else
 {
- std::__libcpp_deallocate(p, with_size_val, over_align_val);
+ std::__libcpp_deallocate(static_cast(p), std::__element_count(with_size_val), over_align_val);
 assert(stats.expect_size_align(with_size_val, over_align_val));
 }
 stats.reset();

 {
- std::__libcpp_deallocate_unsized(p, over_align_val);
+ std::__libcpp_deallocate_unsized(static_cast(p), over_align_val);
 assert(stats.expect_align(over_align_val));
 }
 stats.reset();

 {
- std::__libcpp_deallocate(p, with_size_val, under_align_val);
+ std::__libcpp_deallocate(static_cast(p), std::__element_count(with_size_val), under_align_val);
 assert(stats.expect_size(with_size_val));
 }
 stats.reset();
@@ -202,13 +202,13 @@ void test_allocator_and_new_match() {
 stats.reset();
#elif defined(NO_SIZE)
 stats.reset();
-#if TEST_STD_VER >= 11
+# if TEST_STD_VER >= 11
 {
 int* x = DoNotOptimize(new int(42));
 delete x;
 assert(stats.expect_plain());
 }
-#endif
+# endif
 stats.reset();
 {
 AlignedType* a = DoNotOptimize(new AlignedType());
@@ -241,8 +241,7 @@ void test_allocator_and_new_match() {
 {
 AlignedType* a = DoNotOptimize(new AlignedType());
 delete a;
- assert(stats.expect_size_align(sizeof(AlignedType),
- TEST_ALIGNOF(AlignedType)));
+ assert(stats.expect_size_align(sizeof(AlignedType), TEST_ALIGNOF(AlignedType)));
 }
 stats.reset();
#endif

From 6e66ff35cbc7ce811cd511efe95342cf02dbc015 Mon Sep 17 00:00:00 2001
From: Louis Dionne
Date: Mon, 13 Jan 2025 09:10:52 -0500
Subject: [PATCH 059/102] [libc++][NFC] Use uint32_t instead of __uint32_t on Apple (#122356)

We had a 15-year-old occurrence of __uint32_t, likely from a time when
uint32_t was not available everywhere.
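As a quick illustration of why the rename is NFC (a sketch for this note,
not code from the patch, and assuming a Darwin-style libc that defines
__uint32_t), both spellings name the same underlying type:

```
#include <cstdint>
#include <type_traits>

// Assumption: the platform defines the non-standard __uint32_t (true on
// Darwin and other BSD-derived libcs). If the two spellings ever named
// different types, this change would not be a pure rename.
static_assert(std::is_same<std::uint32_t, __uint32_t>::value,
              "uint32_t can stand in for __uint32_t for the ctype mask");
```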
--- libcxx/include/__locale | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libcxx/include/__locale b/libcxx/include/__locale index 01c3a2e3456ba..e10eb62fb844b 100644 --- a/libcxx/include/__locale +++ b/libcxx/include/__locale @@ -348,7 +348,7 @@ public: # define _LIBCPP_CTYPE_MASK_IS_COMPOSITE_ALPHA #elif defined(__APPLE__) || defined(__FreeBSD__) || defined(__NetBSD__) # ifdef __APPLE__ - typedef __uint32_t mask; + typedef uint32_t mask; # elif defined(__FreeBSD__) typedef unsigned long mask; # elif defined(__NetBSD__) From e681ed69e391aabd815fffdf198f9c9b3da8b8ea Mon Sep 17 00:00:00 2001 From: "A. Jiang" Date: Mon, 13 Jan 2025 22:12:25 +0800 Subject: [PATCH 060/102] [libc++] Deprecate extension `packaged_task::result_type` (#122600) This extension is questionable and non-conforming. Perhaps we should deprecate and then remove it. Towards #112856. --- libcxx/docs/ReleaseNotes/20.rst | 4 +++ libcxx/include/future | 20 ++++++------- .../futures/futures.task/type.depr.verify.cpp | 28 +++++++++++++++++++ .../futures/futures.task/types.pass.cpp | 10 +++---- 4 files changed, 47 insertions(+), 15 deletions(-) create mode 100644 libcxx/test/libcxx/thread/futures/futures.task/type.depr.verify.cpp diff --git a/libcxx/docs/ReleaseNotes/20.rst b/libcxx/docs/ReleaseNotes/20.rst index 228c3f3432c29..15940948655d7 100644 --- a/libcxx/docs/ReleaseNotes/20.rst +++ b/libcxx/docs/ReleaseNotes/20.rst @@ -146,6 +146,8 @@ Deprecations and Removals ``__undeclare_reachable`` have been removed from the library. These functions were never implemented in a non-trivial way, making it very unlikely that any binary depends on them. +- Non-conforming extension ``packaged_task::result_type`` is deprecated. It will be removed in LLVM 21. + Upcoming Deprecations and Removals ---------------------------------- @@ -164,6 +166,8 @@ LLVM 21 - The ``_LIBCPP_VERBOSE_ABORT_NOT_NOEXCEPT`` macro will be removed in LLVM 21, making ``std::__libcpp_verbose_abort`` unconditionally ``noexcept``. +- Non-conforming extension ``packaged_task::result_type`` will be removed in LLVM 21. + ABI Affecting Changes --------------------- diff --git a/libcxx/include/future b/libcxx/include/future index d777ed8d6016f..72f3ed5ca5d27 100644 --- a/libcxx/include/future +++ b/libcxx/include/future @@ -1612,11 +1612,11 @@ inline _Rp __packaged_task_function<_Rp(_ArgTypes...)>::operator()(_ArgTypes... template class _LIBCPP_TEMPLATE_VIS packaged_task<_Rp(_ArgTypes...)> { public: - typedef _Rp result_type; // extension + using result_type _LIBCPP_DEPRECATED = _Rp; // extension private: - __packaged_task_function __f_; - promise __p_; + __packaged_task_function<_Rp(_ArgTypes...)> __f_; + promise<_Rp> __p_; public: // construction and destruction @@ -1653,7 +1653,7 @@ public: _LIBCPP_HIDE_FROM_ABI bool valid() const _NOEXCEPT { return __p_.__state_ != nullptr; } // result retrieval - _LIBCPP_HIDE_FROM_ABI future get_future() { return __p_.get_future(); } + _LIBCPP_HIDE_FROM_ABI future<_Rp> get_future() { return __p_.get_future(); } // execution _LIBCPP_HIDE_FROM_ABI void operator()(_ArgTypes... 
__args); @@ -1700,17 +1700,17 @@ template void packaged_task<_Rp(_ArgTypes...)>::reset() { if (!valid()) __throw_future_error(future_errc::no_state); - __p_ = promise(); + __p_ = promise<_Rp>(); } template class _LIBCPP_TEMPLATE_VIS packaged_task { public: - typedef void result_type; // extension + using result_type _LIBCPP_DEPRECATED = void; // extension private: - __packaged_task_function __f_; - promise __p_; + __packaged_task_function __f_; + promise __p_; public: // construction and destruction @@ -1745,7 +1745,7 @@ public: _LIBCPP_HIDE_FROM_ABI bool valid() const _NOEXCEPT { return __p_.__state_ != nullptr; } // result retrieval - _LIBCPP_HIDE_FROM_ABI future get_future() { return __p_.get_future(); } + _LIBCPP_HIDE_FROM_ABI future get_future() { return __p_.get_future(); } // execution _LIBCPP_HIDE_FROM_ABI void operator()(_ArgTypes... __args); @@ -1804,7 +1804,7 @@ template void packaged_task::reset() { if (!valid()) __throw_future_error(future_errc::no_state); - __p_ = promise(); + __p_ = promise(); } template diff --git a/libcxx/test/libcxx/thread/futures/futures.task/type.depr.verify.cpp b/libcxx/test/libcxx/thread/futures/futures.task/type.depr.verify.cpp new file mode 100644 index 0000000000000..4065637e9eb2a --- /dev/null +++ b/libcxx/test/libcxx/thread/futures/futures.task/type.depr.verify.cpp @@ -0,0 +1,28 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// UNSUPPORTED: no-threads +// UNSUPPORTED: c++03 + +// + +// template +// class packaged_task +// { +// public: +// typedef R result_type; // extension + +// This libc++ extension is deprecated. See https://github.com/llvm/llvm-project/issues/112856. + +#include +#include + +struct A {}; + +using RA = std::packaged_task::result_type; // expected-warning {{'result_type' is deprecated}} +using RV = std::packaged_task::result_type; // expected-warning {{'result_type' is deprecated}} diff --git a/libcxx/test/libcxx/thread/futures/futures.task/types.pass.cpp b/libcxx/test/libcxx/thread/futures/futures.task/types.pass.cpp index 1f17d74513471..659232caa46ec 100644 --- a/libcxx/test/libcxx/thread/futures/futures.task/types.pass.cpp +++ b/libcxx/test/libcxx/thread/futures/futures.task/types.pass.cpp @@ -19,16 +19,16 @@ // This is a libc++ extension. +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + #include #include -#include "test_macros.h" - struct A {}; -int main(int, char**) -{ - static_assert((std::is_same::result_type, A>::value), ""); +int main(int, char**) { + static_assert((std::is_same::result_type, A>::value), ""); + static_assert((std::is_same::result_type, void>::value), ""); return 0; } From 4fc486d9cfddd2f36f56f35cdc7417268fbe568d Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 13 Jan 2025 14:13:55 +0000 Subject: [PATCH 061/102] [X86] Fold VPERMV3(X,M,Y) -> VPERMV(CONCAT(X,Y),WIDEN(M)) iff the CONCAT is free (#122485) This extends the existing fold which concatenates X and Y if they are sequential subvectors extracted from the same source. By using combineConcatVectorOps we can recognise other patterns where X and Y can be concatenated for free (e.g. 
sequential loads, concatenating repeated instructions etc.), which allows
the VPERMV3 fold to be a lot more aggressive.

This required combineConcatVectorOps to be extended to fold the additional
case of "concat(extract_subvector(x,lo), extract_subvector(x,hi)) ->
extract_subvector(x)", similar to the original VPERMV3 fold where "x" was
larger than the concat result type.

This also exposes more cases where we have repeated vector/subvector loads
if they have multiple uses, e.g. where we're loading a ymm and the lo/hi
xmm pairs independently. In the past we've always considered this
relatively benign, but I'm not certain whether we should now do more to
keep these from splitting.
---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 51 +-
 ...d_vector_inreg_of_broadcast_from_memory.ll | 28 +-
 .../X86/avx512-shuffles/partial_permute.ll | 443 ++++++-----
 llvm/test/CodeGen/X86/pr97968.ll | 4 +-
 .../X86/shuffle-strided-with-offset-512.ll | 7 +-
 llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll | 10 +-
 .../vector-interleaved-load-i16-stride-3.ll | 156 ++--
 .../vector-interleaved-load-i16-stride-5.ll | 120 ++-
 .../vector-interleaved-load-i16-stride-6.ll | 550 +++++++-------
 .../vector-interleaved-load-i16-stride-7.ll | 222 +++---
 .../vector-interleaved-load-i16-stride-8.ll | 188 +++--
 .../vector-interleaved-load-i32-stride-2.ll | 28 +-
 .../vector-interleaved-load-i32-stride-3.ll | 152 ++--
 .../vector-interleaved-load-i32-stride-4.ll | 228 +++---
 .../vector-interleaved-load-i32-stride-5.ll | 192 +++--
 .../vector-interleaved-load-i32-stride-6.ll | 716 +++++++++---------
 .../vector-interleaved-load-i32-stride-7.ll | 284 +++----
 .../vector-interleaved-load-i32-stride-8.ll | 116 +--
 .../vector-interleaved-load-i64-stride-2.ll | 52 +-
 .../vector-interleaved-load-i64-stride-6.ll | 164 ++--
 .../vector-interleaved-load-i64-stride-7.ll | 152 ++--
 ...d_vector_inreg_of_broadcast_from_memory.ll | 28 +-
 22 files changed, 1919 insertions(+), 1972 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 596139d084570..add51fac4b9e6 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -41701,6 +41701,11 @@ static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
 return SDValue();
 }

+static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
+ ArrayRef Ops, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget);
+
 /// Try to combine x86 target specific shuffles.
 static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
 SelectionDAG &DAG,
@@ -42401,25 +42406,17 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
 return SDValue();
 }
 case X86ISD::VPERMV3: {
- SDValue V1 = peekThroughBitcasts(N.getOperand(0));
- SDValue V2 = peekThroughBitcasts(N.getOperand(2));
- MVT SVT = V1.getSimpleValueType();
- // Combine VPERMV3 to widened VPERMV if the two source operands are split
- // from the same vector.
- if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR && - V1.getConstantOperandVal(1) == 0 && - V2.getOpcode() == ISD::EXTRACT_SUBVECTOR && - V2.getConstantOperandVal(1) == SVT.getVectorNumElements() && - V1.getOperand(0) == V2.getOperand(0)) { - EVT NVT = V1.getOperand(0).getValueType(); - if (NVT.is256BitVector() || - (NVT.is512BitVector() && Subtarget.hasEVEX512())) { - MVT WideVT = MVT::getVectorVT( - VT.getScalarType(), NVT.getSizeInBits() / VT.getScalarSizeInBits()); + // Combine VPERMV3 to widened VPERMV if the two source operands can be + // freely concatenated. + if (VT.is128BitVector() || + (VT.is256BitVector() && Subtarget.useAVX512Regs())) { + SDValue Ops[] = {N.getOperand(0), N.getOperand(2)}; + MVT WideVT = VT.getDoubleNumVectorElementsVT(); + if (SDValue ConcatSrc = + combineConcatVectorOps(DL, WideVT, Ops, DAG, DCI, Subtarget)) { SDValue Mask = widenSubVector(N.getOperand(1), false, Subtarget, DAG, DL, WideVT.getSizeInBits()); - SDValue Perm = DAG.getNode(X86ISD::VPERMV, DL, WideVT, Mask, - DAG.getBitcast(WideVT, V1.getOperand(0))); + SDValue Perm = DAG.getNode(X86ISD::VPERMV, DL, WideVT, Mask, ConcatSrc); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Perm, DAG.getIntPtrConstant(0, DL)); } @@ -42427,6 +42424,9 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, SmallVector Ops; SmallVector Mask; if (getTargetShuffleMask(N, /*AllowSentinelZero=*/false, Ops, Mask)) { + assert(Mask.size() == NumElts && "Unexpected shuffle mask size"); + SDValue V1 = peekThroughBitcasts(N.getOperand(0)); + SDValue V2 = peekThroughBitcasts(N.getOperand(2)); MVT MaskVT = N.getOperand(1).getSimpleValueType(); // Canonicalize to VPERMV if both sources are the same. if (V1 == V2) { @@ -57369,10 +57369,8 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, Op0.getOperand(1)); } - // concat(extract_subvector(v0,c0), extract_subvector(v1,c1)) -> vperm2x128. - // Only concat of subvector high halves which vperm2x128 is best at. // TODO: This should go in combineX86ShufflesRecursively eventually. - if (VT.is256BitVector() && NumOps == 2) { + if (NumOps == 2) { SDValue Src0 = peekThroughBitcasts(Ops[0]); SDValue Src1 = peekThroughBitcasts(Ops[1]); if (Src0.getOpcode() == ISD::EXTRACT_SUBVECTOR && @@ -57381,7 +57379,10 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, EVT SrcVT1 = Src1.getOperand(0).getValueType(); unsigned NumSrcElts0 = SrcVT0.getVectorNumElements(); unsigned NumSrcElts1 = SrcVT1.getVectorNumElements(); - if (SrcVT0.is256BitVector() && SrcVT1.is256BitVector() && + // concat(extract_subvector(v0), extract_subvector(v1)) -> vperm2x128. + // Only concat of subvector high halves which vperm2x128 is best at. + if (VT.is256BitVector() && SrcVT0.is256BitVector() && + SrcVT1.is256BitVector() && Src0.getConstantOperandAPInt(1) == (NumSrcElts0 / 2) && Src1.getConstantOperandAPInt(1) == (NumSrcElts1 / 2)) { return DAG.getNode(X86ISD::VPERM2X128, DL, VT, @@ -57389,6 +57390,14 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, DAG.getBitcast(VT, Src1.getOperand(0)), DAG.getTargetConstant(0x31, DL, MVT::i8)); } + // concat(extract_subvector(x,lo), extract_subvector(x,hi)) -> x. 
+ if (Src0.getOperand(0) == Src1.getOperand(0) && + Src0.getConstantOperandAPInt(1) == 0 && + Src1.getConstantOperandAPInt(1) == + Src0.getValueType().getVectorNumElements()) { + return DAG.getBitcast(VT, extractSubVector(Src0.getOperand(0), 0, DAG, + DL, VT.getSizeInBits())); + } } } diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll index 1305559bc04e0..3d72319f59ca9 100644 --- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll +++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll @@ -1337,10 +1337,9 @@ define void @vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8(ptr %in. ; ; AVX512BW-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31] -; AVX512BW-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1 -; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31] +; AVX512BW-NEXT: vpermw (%rdi), %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -1789,10 +1788,9 @@ define void @vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2(ptr %i ; ; AVX512F-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,5,0,7] -; AVX512F-FAST-NEXT: vpermi2q 32(%rdi), %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm0 +; AVX512F-FAST-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,5,0,7] +; AVX512F-FAST-NEXT: vpermq (%rdi), %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq @@ -1808,10 +1806,9 @@ define void @vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2(ptr %i ; ; AVX512DQ-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,5,0,7] -; AVX512DQ-FAST-NEXT: vpermi2q 32(%rdi), %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm0 +; AVX512DQ-FAST-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,5,0,7] +; AVX512DQ-FAST-NEXT: vpermq (%rdi), %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq @@ -1827,10 +1824,9 @@ define void @vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2(ptr %i ; ; AVX512BW-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: ; AVX512BW-FAST: # %bb.0: -; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,5,0,7] -; AVX512BW-FAST-NEXT: vpermi2q 32(%rdi), %ymm0, %ymm1 -; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm1, %zmm0 +; AVX512BW-FAST-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,5,0,7] +; AVX512BW-FAST-NEXT: vpermq (%rdi), %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-FAST-NEXT: vzeroupper ; AVX512BW-FAST-NEXT: retq diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll index 
5d901a8a380a9..aac5847061cbe 100644 --- a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll +++ b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll @@ -149,9 +149,10 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec, <8 x define <8 x i16> @test_16xi16_to_8xi16_perm_mem_mask0(ptr %vp) { ; CHECK-LABEL: test_16xi16_to_8xi16_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %xmm1 ; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,7,13,3,5,13,3,9] -; CHECK-NEXT: vpermi2w 16(%rdi), %xmm1, %xmm0 +; CHECK-NEXT: vpermw (%rdi), %ymm0, %ymm0 +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x i16>, ptr %vp %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> @@ -160,11 +161,12 @@ define <8 x i16> @test_16xi16_to_8xi16_perm_mem_mask0(ptr %vp) { define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask0(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %xmm2 -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [0,7,13,3,5,13,3,9] -; CHECK-NEXT: vpermi2w 16(%rdi), %xmm2, %xmm3 +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,7,13,3,5,13,3,9] ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 -; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} +; CHECK-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x i16>, ptr %vp %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> @@ -176,11 +178,11 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask0(ptr %vp, <8 x i16> define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask0(ptr %vp, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %xmm2 ; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,7,13,3,5,13,3,9] ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 -; CHECK-NEXT: vpermi2w 16(%rdi), %xmm2, %xmm1 {%k1} {z} -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x i16>, ptr %vp %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> @@ -192,11 +194,12 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask0(ptr %vp, <8 x i16 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask1(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %xmm2 -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [3,15,12,7,1,5,8,14] -; CHECK-NEXT: vpermi2w 16(%rdi), %xmm2, %xmm3 +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [3,15,12,7,1,5,8,14] ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 -; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} +; CHECK-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x i16>, ptr %vp %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> @@ -208,11 +211,11 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask1(ptr %vp, <8 x i16> define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask1(ptr %vp, <8 x i16> %mask) { ; CHECK-LABEL: 
test_masked_z_16xi16_to_8xi16_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %xmm2 ; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [3,15,12,7,1,5,8,14] ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 -; CHECK-NEXT: vpermi2w 16(%rdi), %xmm2, %xmm1 {%k1} {z} -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x i16>, ptr %vp %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> @@ -256,9 +259,10 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16 define <8 x i16> @test_16xi16_to_8xi16_perm_mem_mask3(ptr %vp) { ; CHECK-LABEL: test_16xi16_to_8xi16_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %xmm1 ; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm0 = [9,7,9,6,9,4,3,2] -; CHECK-NEXT: vpermi2w 16(%rdi), %xmm1, %xmm0 +; CHECK-NEXT: vpermw (%rdi), %ymm0, %ymm0 +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x i16>, ptr %vp %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> @@ -267,11 +271,12 @@ define <8 x i16> @test_16xi16_to_8xi16_perm_mem_mask3(ptr %vp) { define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask3(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %xmm2 -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [9,7,9,6,9,4,3,2] -; CHECK-NEXT: vpermi2w 16(%rdi), %xmm2, %xmm3 +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [9,7,9,6,9,4,3,2] ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 -; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} +; CHECK-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x i16>, ptr %vp %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> @@ -283,11 +288,11 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask3(ptr %vp, <8 x i16> define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask3(ptr %vp, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %xmm2 ; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [9,7,9,6,9,4,3,2] ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 -; CHECK-NEXT: vpermi2w 16(%rdi), %xmm2, %xmm1 {%k1} {z} -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x i16>, ptr %vp %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> @@ -579,9 +584,9 @@ define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec, <8 x define <16 x i16> @test_32xi16_to_16xi16_perm_mem_mask0(ptr %vp) { ; CHECK-LABEL: test_32xi16_to_16xi16_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %ymm1 ; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm0 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12] -; CHECK-NEXT: vpermi2w 32(%rdi), %ymm1, %ymm0 +; CHECK-NEXT: vpermw (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %vec = load <32 x i16>, ptr %vp %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> @@ -590,11 +595,11 @@ define <16 x i16> @test_32xi16_to_16xi16_perm_mem_mask0(ptr %vp) { define <16 x i16> 
@test_masked_32xi16_to_16xi16_perm_mem_mask0(ptr %vp, <16 x i16> %vec2, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %ymm2 -; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm3 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12] -; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm3 +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12] ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 -; CHECK-NEXT: vmovdqu16 %ymm3, %ymm0 {%k1} +; CHECK-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %vec = load <32 x i16>, ptr %vp %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> @@ -606,11 +611,10 @@ define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask0(ptr %vp, <16 x i1 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask0(ptr %vp, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %ymm2 ; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm1 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12] ; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1 -; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm1 {%k1} {z} -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %vec = load <32 x i16>, ptr %vp %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> @@ -622,11 +626,11 @@ define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask0(ptr %vp, <16 x define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask1(ptr %vp, <16 x i16> %vec2, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %ymm2 -; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm3 = [22,13,21,1,14,8,5,16,15,17,24,28,15,9,14,25] -; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm3 +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = [22,13,21,1,14,8,5,16,15,17,24,28,15,9,14,25] ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 -; CHECK-NEXT: vmovdqu16 %ymm3, %ymm0 {%k1} +; CHECK-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %vec = load <32 x i16>, ptr %vp %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> @@ -638,11 +642,10 @@ define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask1(ptr %vp, <16 x i1 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask1(ptr %vp, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %ymm2 ; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm1 = [22,13,21,1,14,8,5,16,15,17,24,28,15,9,14,25] ; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1 -; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm1 {%k1} {z} -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %vec = load <32 x i16>, ptr %vp %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> @@ -686,9 +689,9 @@ define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask2(ptr %vp, <16 x define <16 x i16> @test_32xi16_to_16xi16_perm_mem_mask3(ptr %vp) { ; CHECK-LABEL: test_32xi16_to_16xi16_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %ymm1 ; CHECK-NEXT: vpmovsxbw 
{{.*#+}} ymm0 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16] -; CHECK-NEXT: vpermi2w 32(%rdi), %ymm1, %ymm0 +; CHECK-NEXT: vpermw (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %vec = load <32 x i16>, ptr %vp %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> @@ -697,11 +700,11 @@ define <16 x i16> @test_32xi16_to_16xi16_perm_mem_mask3(ptr %vp) { define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask3(ptr %vp, <16 x i16> %vec2, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %ymm2 -; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm3 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16] -; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm3 +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16] ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 -; CHECK-NEXT: vmovdqu16 %ymm3, %ymm0 {%k1} +; CHECK-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %vec = load <32 x i16>, ptr %vp %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> @@ -713,11 +716,10 @@ define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask3(ptr %vp, <16 x i1 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask3(ptr %vp, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %ymm2 ; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm1 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16] ; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1 -; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm1 {%k1} {z} -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %vec = load <32 x i16>, ptr %vp %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> @@ -810,11 +812,11 @@ define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask1(ptr %vp, <8 x i16 define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask2: ; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [6,18,0,4,10,25,22,10] -; CHECK-NEXT: vmovdqa (%rdi), %ymm3 -; CHECK-NEXT: vpermt2w 32(%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 -; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} +; CHECK-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <32 x i16>, ptr %vp @@ -827,11 +829,10 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16> define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [6,18,0,4,10,25,22,10] -; CHECK-NEXT: vmovdqa (%rdi), %ymm1 +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [6,18,0,4,10,25,22,10] ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 -; CHECK-NEXT: vpermt2w 32(%rdi), %ymm2, %ymm1 {%k1} {z} -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <32 x i16>, ptr %vp @@ -844,10 +845,9 @@ define <8 x i16> 
@test_masked_z_32xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16 define <8 x i16> @test_32xi16_to_8xi16_perm_mem_mask3(ptr %vp) { ; CHECK-LABEL: test_32xi16_to_8xi16_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [19,1,5,31,9,12,17,9] -; CHECK-NEXT: vmovdqa (%rdi), %ymm0 -; CHECK-NEXT: vpermt2w 32(%rdi), %ymm1, %ymm0 -; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm0 = [19,1,5,31,9,12,17,9] +; CHECK-NEXT: vpermw (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <32 x i16>, ptr %vp @@ -857,11 +857,11 @@ define <8 x i16> @test_32xi16_to_8xi16_perm_mem_mask3(ptr %vp) { define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask3(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask3: ; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [19,1,5,31,9,12,17,9] -; CHECK-NEXT: vmovdqa (%rdi), %ymm3 -; CHECK-NEXT: vpermt2w 32(%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 -; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} +; CHECK-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <32 x i16>, ptr %vp @@ -874,11 +874,10 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask3(ptr %vp, <8 x i16> define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask3(ptr %vp, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [19,1,5,31,9,12,17,9] -; CHECK-NEXT: vmovdqa (%rdi), %ymm1 +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [19,1,5,31,9,12,17,9] ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 -; CHECK-NEXT: vpermt2w 32(%rdi), %ymm2, %ymm1 {%k1} {z} -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <32 x i16>, ptr %vp @@ -1082,11 +1081,12 @@ define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask0(ptr %vp, <4 x i32> define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %xmm2 -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [5,0,0,3] -; CHECK-NEXT: vpermi2d 16(%rdi), %xmm2, %xmm3 +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [5,0,0,3] ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 -; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} +; CHECK-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <8 x i32>, ptr %vp %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> @@ -1098,11 +1098,11 @@ define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32> % define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %xmm2 ; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [5,0,0,3] ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 -; CHECK-NEXT: vpermi2d 16(%rdi), %xmm2, %xmm1 {%k1} {z} -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} 
{z} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <8 x i32>, ptr %vp %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> @@ -1567,9 +1567,9 @@ define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask2(ptr %vp, <8 x i32 define <8 x i32> @test_16xi32_to_8xi32_perm_mem_mask3(ptr %vp) { ; CHECK-LABEL: test_16xi32_to_8xi32_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %ymm1 ; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm0 = [8,4,1,13,15,4,6,12] -; CHECK-NEXT: vpermi2d 32(%rdi), %ymm1, %ymm0 +; CHECK-NEXT: vpermps (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %vec = load <16 x i32>, ptr %vp %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> @@ -1578,11 +1578,11 @@ define <8 x i32> @test_16xi32_to_8xi32_perm_mem_mask3(ptr %vp) { define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask3(ptr %vp, <8 x i32> %vec2, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %ymm2 -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [8,4,1,13,15,4,6,12] -; CHECK-NEXT: vpermi2d 32(%rdi), %ymm2, %ymm3 +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [8,4,1,13,15,4,6,12] ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 -; CHECK-NEXT: vmovdqa32 %ymm3, %ymm0 {%k1} +; CHECK-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %vec = load <16 x i32>, ptr %vp %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> @@ -1594,11 +1594,10 @@ define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask3(ptr %vp, <8 x i32> define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask3(ptr %vp, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %ymm2 ; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [8,4,1,13,15,4,6,12] ; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1 -; CHECK-NEXT: vpermi2d 32(%rdi), %ymm2, %ymm1 {%k1} {z} -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %vec = load <16 x i32>, ptr %vp %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> @@ -1610,10 +1609,9 @@ define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask3(ptr %vp, <8 x i32 define <4 x i32> @test_16xi32_to_4xi32_perm_mem_mask0(ptr %vp) { ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [13,0,0,6] -; CHECK-NEXT: vmovdqa (%rdi), %ymm0 -; CHECK-NEXT: vpermt2d 32(%rdi), %ymm1, %ymm0 -; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm0 = [13,0,0,6] +; CHECK-NEXT: vpermps (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x i32>, ptr %vp @@ -1623,11 +1621,11 @@ define <4 x i32> @test_16xi32_to_4xi32_perm_mem_mask0(ptr %vp) { define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask0(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask0: ; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [13,0,0,6] -; CHECK-NEXT: vmovdqa (%rdi), %ymm3 -; CHECK-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vptestnmd %xmm1, 
%xmm1, %k1 -; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} +; CHECK-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x i32>, ptr %vp @@ -1640,11 +1638,10 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask0(ptr %vp, <4 x i32> define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask0(ptr %vp, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [13,0,0,6] -; CHECK-NEXT: vmovdqa (%rdi), %ymm1 +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [13,0,0,6] ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 -; CHECK-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm1 {%k1} {z} -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x i32>, ptr %vp @@ -1691,11 +1688,11 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32 define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask2(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask2: ; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [2,15,6,9] -; CHECK-NEXT: vmovdqa (%rdi), %ymm3 -; CHECK-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 -; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} +; CHECK-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x i32>, ptr %vp @@ -1708,11 +1705,10 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask2(ptr %vp, <4 x i32> define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask2(ptr %vp, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [2,15,6,9] -; CHECK-NEXT: vmovdqa (%rdi), %ymm1 +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [2,15,6,9] ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 -; CHECK-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm1 {%k1} {z} -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x i32>, ptr %vp @@ -2474,9 +2470,9 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask2(ptr %vp, <4 x i64> define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask3(ptr %vp) { ; CHECK-FAST-LABEL: test_8xi64_to_4xi64_perm_mem_mask3: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vmovdqa (%rdi), %ymm1 ; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm0 = [7,0,0,2] -; CHECK-FAST-NEXT: vpermi2q 32(%rdi), %ymm1, %ymm0 +; CHECK-FAST-NEXT: vpermpd (%rdi), %zmm0, %zmm0 +; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-FAST-NEXT: retq ; ; CHECK-FAST-PERLANE-LABEL: test_8xi64_to_4xi64_perm_mem_mask3: @@ -2492,11 +2488,11 @@ define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask3(ptr %vp) { define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask3(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask3: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vmovdqa (%rdi), %ymm2 -; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [7,0,0,2] -; CHECK-FAST-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm3 +; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; 
CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [7,0,0,2] ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1 -; CHECK-FAST-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1} +; CHECK-FAST-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} +; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-FAST-NEXT: retq ; ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask3: @@ -2516,11 +2512,10 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask3(ptr %vp, <4 x i64> % define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask3(ptr %vp, <4 x i64> %mask) { ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask3: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vmovdqa (%rdi), %ymm2 ; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [7,0,0,2] ; CHECK-FAST-NEXT: vptestnmq %ymm0, %ymm0, %k1 -; CHECK-FAST-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm1 {%k1} {z} -; CHECK-FAST-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-FAST-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} +; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-FAST-NEXT: retq ; ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask3: @@ -2572,11 +2567,11 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask4(ptr %vp, <4 x i64> define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask5(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask5: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vmovdqa (%rdi), %ymm2 -; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,2,7,1] -; CHECK-FAST-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm3 +; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,2,7,1] ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1 -; CHECK-FAST-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1} +; CHECK-FAST-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} +; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-FAST-NEXT: retq ; ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask5: @@ -2596,11 +2591,10 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask5(ptr %vp, <4 x i64> % define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask5(ptr %vp, <4 x i64> %mask) { ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask5: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vmovdqa (%rdi), %ymm2 ; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,7,1] ; CHECK-FAST-NEXT: vptestnmq %ymm0, %ymm0, %k1 -; CHECK-FAST-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm1 {%k1} {z} -; CHECK-FAST-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-FAST-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} +; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-FAST-NEXT: retq ; ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask5: @@ -2620,9 +2614,9 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask5(ptr %vp, <4 x i64> define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask6(ptr %vp) { ; CHECK-LABEL: test_8xi64_to_4xi64_perm_mem_mask6: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %ymm1 ; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm0 = [7,2,3,2] -; CHECK-NEXT: vpermi2q 32(%rdi), %ymm1, %ymm0 +; CHECK-NEXT: vpermpd (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %vec = load <8 x i64>, ptr %vp %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> @@ -2631,11 +2625,11 @@ define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask6(ptr %vp) { define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask6(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-LABEL: 
test_masked_8xi64_to_4xi64_perm_mem_mask6: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %ymm2 -; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm3 = [7,2,3,2] -; CHECK-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm3 +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm2 = [7,2,3,2] ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 -; CHECK-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1} +; CHECK-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %vec = load <8 x i64>, ptr %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> @@ -2647,11 +2641,10 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask6(ptr %vp, <4 x i64> % define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask6(ptr %vp, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask6: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %ymm2 ; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm1 = [7,2,3,2] ; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1 -; CHECK-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm1 {%k1} {z} -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %vec = load <8 x i64>, ptr %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> @@ -3032,12 +3025,13 @@ define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 x float> %vec2, <4 x float> %mask) { ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %xmm2 -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [3,1,3,7] -; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm3 -; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 -; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [3,1,3,7] +; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1 +; CHECK-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <8 x float>, ptr %vp %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> @@ -3049,12 +3043,12 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 x define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 x float> %mask) { ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %xmm2 ; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [3,1,3,7] -; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1 -; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm1 {%k1} {z} -; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpeqps %xmm2, %xmm0, %k1 +; CHECK-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <8 x float>, ptr %vp %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> @@ -3066,9 +3060,10 @@ define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 define <4 x float> @test_8xfloat_to_4xfloat_perm_mem_mask3(ptr %vp) { ; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %xmm1 ; CHECK-NEXT: vpmovsxbd {{.*#+}} 
xmm0 = [1,3,5,3] -; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm1, %xmm0 +; CHECK-NEXT: vpermps (%rdi), %ymm0, %ymm0 +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <8 x float>, ptr %vp %res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> @@ -3077,12 +3072,13 @@ define <4 x float> @test_8xfloat_to_4xfloat_perm_mem_mask3(ptr %vp) { define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask3(ptr %vp, <4 x float> %vec2, <4 x float> %mask) { ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %xmm2 -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,3,5,3] -; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm3 -; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 -; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,3,5,3] +; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1 +; CHECK-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <8 x float>, ptr %vp %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> @@ -3094,12 +3090,12 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask3(ptr %vp, <4 x define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask3(ptr %vp, <4 x float> %mask) { ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %xmm2 ; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [1,3,5,3] -; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1 -; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm1 {%k1} {z} -; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpeqps %xmm2, %xmm0, %k1 +; CHECK-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <8 x float>, ptr %vp %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> @@ -3424,9 +3420,9 @@ define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask3(<16 x float> %v define <8 x float> @test_16xfloat_to_8xfloat_perm_mem_mask0(ptr %vp) { ; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %ymm1 ; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm0 = [7,6,7,11,5,10,0,4] -; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm1, %ymm0 +; CHECK-NEXT: vpermps (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %vec = load <16 x float>, ptr %vp %res = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> @@ -3435,12 +3431,12 @@ define <8 x float> @test_16xfloat_to_8xfloat_perm_mem_mask0(ptr %vp) { define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask0(ptr %vp, <8 x float> %vec2, <8 x float> %mask) { ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %ymm2 -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [7,6,7,11,5,10,0,4] -; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm2, %ymm3 -; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1 -; CHECK-NEXT: vmovaps %ymm3, %ymm0 {%k1} +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [7,6,7,11,5,10,0,4] +; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; 
CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1 +; CHECK-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %vec = load <16 x float>, ptr %vp %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> @@ -3452,12 +3448,11 @@ define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask0(ptr %vp, <8 x define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask0(ptr %vp, <8 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %ymm2 ; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [7,6,7,11,5,10,0,4] -; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vcmpeqps %ymm3, %ymm0, %k1 -; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm2, %ymm1 {%k1} {z} -; CHECK-NEXT: vmovaps %ymm1, %ymm0 +; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpeqps %ymm2, %ymm0, %k1 +; CHECK-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %vec = load <16 x float>, ptr %vp %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> @@ -3469,12 +3464,12 @@ define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask0(ptr %vp, <8 define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask1(ptr %vp, <8 x float> %vec2, <8 x float> %mask) { ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %ymm2 -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [11,0,9,0,7,14,0,8] -; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm2, %ymm3 -; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1 -; CHECK-NEXT: vmovaps %ymm3, %ymm0 {%k1} +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [11,0,9,0,7,14,0,8] +; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1 +; CHECK-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %vec = load <16 x float>, ptr %vp %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> @@ -3486,12 +3481,11 @@ define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask1(ptr %vp, <8 x define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask1(ptr %vp, <8 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %ymm2 ; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [11,0,9,0,7,14,0,8] -; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vcmpeqps %ymm3, %ymm0, %k1 -; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm2, %ymm1 {%k1} {z} -; CHECK-NEXT: vmovaps %ymm1, %ymm0 +; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpeqps %ymm2, %ymm0, %k1 +; CHECK-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %vec = load <16 x float>, ptr %vp %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> @@ -3724,10 +3718,9 @@ define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 define <4 x float> @test_16xfloat_to_4xfloat_perm_mem_mask3(ptr %vp) { ; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [3,3,15,9] -; CHECK-NEXT: vmovaps (%rdi), %ymm0 -; CHECK-NEXT: vpermt2ps 32(%rdi), %ymm1, %ymm0 -; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm0 = [3,3,15,9] +; 
CHECK-NEXT: vpermps (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x float>, ptr %vp @@ -3737,12 +3730,12 @@ define <4 x float> @test_16xfloat_to_4xfloat_perm_mem_mask3(ptr %vp) { define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask3(ptr %vp, <4 x float> %vec2, <4 x float> %mask) { ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask3: ; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [3,3,15,9] -; CHECK-NEXT: vmovaps (%rdi), %ymm3 -; CHECK-NEXT: vpermt2ps 32(%rdi), %ymm2, %ymm3 -; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 -; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1} +; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1 +; CHECK-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x float>, ptr %vp @@ -3755,12 +3748,11 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask3(ptr %vp, <4 x define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask3(ptr %vp, <4 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [3,3,15,9] -; CHECK-NEXT: vmovaps (%rdi), %ymm1 -; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1 -; CHECK-NEXT: vpermt2ps 32(%rdi), %ymm2, %ymm1 {%k1} {z} -; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [3,3,15,9] +; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpeqps %xmm2, %xmm0, %k1 +; CHECK-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x float>, ptr %vp @@ -4346,9 +4338,9 @@ define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mask1(<8 x double> define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask0(ptr %vp) { ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovapd (%rdi), %ymm1 ; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,6,7,2] -; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm1, %ymm0 +; CHECK-NEXT: vpermpd (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %vec = load <8 x double>, ptr %vp %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> @@ -4357,12 +4349,12 @@ define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask0(ptr %vp) { define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask0(ptr %vp, <4 x double> %vec2, <4 x double> %mask) { ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovapd (%rdi), %ymm2 -; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,6,7,2] -; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm3 -; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1 -; CHECK-NEXT: vmovapd %ymm3, %ymm0 {%k1} +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,6,7,2] +; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1 +; CHECK-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %vec = load <8 x double>, ptr %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> @@ -4374,12 
+4366,11 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask0(ptr %vp, <4 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask0(ptr %vp, <4 x double> %mask) { ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovapd (%rdi), %ymm2 ; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm1 = [1,6,7,2] -; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vcmpeqpd %ymm3, %ymm0, %k1 -; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm1 {%k1} {z} -; CHECK-NEXT: vmovapd %ymm1, %ymm0 +; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpeqpd %ymm2, %ymm0, %k1 +; CHECK-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %vec = load <8 x double>, ptr %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> @@ -4441,12 +4432,12 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1(ptr %vp, define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask2(ptr %vp, <4 x double> %vec2, <4 x double> %mask) { ; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask2: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vmovapd (%rdi), %ymm2 -; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,2,3,4] -; CHECK-FAST-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm3 -; CHECK-FAST-NEXT: vxorpd %xmm2, %xmm2, %xmm2 -; CHECK-FAST-NEXT: vcmpeqpd %ymm2, %ymm1, %k1 -; CHECK-FAST-NEXT: vmovapd %ymm3, %ymm0 {%k1} +; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,2,3,4] +; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3 +; CHECK-FAST-NEXT: vcmpeqpd %ymm3, %ymm1, %k1 +; CHECK-FAST-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} +; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-FAST-NEXT: retq ; ; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask2: @@ -4467,12 +4458,11 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask2(ptr %vp, <4 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2(ptr %vp, <4 x double> %mask) { ; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vmovapd (%rdi), %ymm2 ; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [1,2,3,4] -; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3 -; CHECK-FAST-NEXT: vcmpeqpd %ymm3, %ymm0, %k1 -; CHECK-FAST-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm1 {%k1} {z} -; CHECK-FAST-NEXT: vmovapd %ymm1, %ymm0 +; CHECK-FAST-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; CHECK-FAST-NEXT: vcmpeqpd %ymm2, %ymm0, %k1 +; CHECK-FAST-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} +; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-FAST-NEXT: retq ; ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2: @@ -4493,9 +4483,9 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2(ptr %vp, define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask3(ptr %vp) { ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovapd (%rdi), %ymm1 ; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm0 = [4,2,1,0] -; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm1, %ymm0 +; CHECK-NEXT: vpermpd (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %vec = load <8 x double>, ptr %vp %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> @@ -4504,12 +4494,12 @@ define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask3(ptr %vp) { define <4 x double> 
@test_masked_8xdouble_to_4xdouble_perm_mem_mask3(ptr %vp, <4 x double> %vec2, <4 x double> %mask) { ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovapd (%rdi), %ymm2 -; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm3 = [4,2,1,0] -; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm3 -; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1 -; CHECK-NEXT: vmovapd %ymm3, %ymm0 {%k1} +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm2 = [4,2,1,0] +; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1 +; CHECK-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %vec = load <8 x double>, ptr %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> @@ -4521,12 +4511,11 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask3(ptr %vp, <4 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask3(ptr %vp, <4 x double> %mask) { ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovapd (%rdi), %ymm2 ; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm1 = [4,2,1,0] -; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vcmpeqpd %ymm3, %ymm0, %k1 -; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm1 {%k1} {z} -; CHECK-NEXT: vmovapd %ymm1, %ymm0 +; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpeqpd %ymm2, %ymm0, %k1 +; CHECK-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %vec = load <8 x double>, ptr %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> diff --git a/llvm/test/CodeGen/X86/pr97968.ll b/llvm/test/CodeGen/X86/pr97968.ll index ca5c63cdc1c2e..a539a33e9a281 100644 --- a/llvm/test/CodeGen/X86/pr97968.ll +++ b/llvm/test/CodeGen/X86/pr97968.ll @@ -5,8 +5,8 @@ define <2 x i32> @PR97968(<16 x i32> %a0) { ; CHECK-LABEL: PR97968: ; CHECK: # %bb.0: ; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [2,7,2,7] -; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %sub0 = shufflevector <16 x i32> %a0, <16 x i32> poison, <4 x i32> diff --git a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll index 45842d4148a8b..82c460fc55938 100644 --- a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll +++ b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll @@ -65,10 +65,9 @@ define void @shuffle_v16i32_to_v8i32_1(ptr %L, ptr %S) nounwind { ; ; AVX512BWVL-FAST-ALL-LABEL: shuffle_v16i32_to_v8i32_1: ; AVX512BWVL-FAST-ALL: # %bb.0: -; AVX512BWVL-FAST-ALL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BWVL-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,3,5,7,9,11,13,15] -; AVX512BWVL-FAST-ALL-NEXT: vpermi2d 32(%rdi), %ymm0, %ymm1 -; AVX512BWVL-FAST-ALL-NEXT: vmovdqa %ymm1, (%rsi) +; AVX512BWVL-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15] +; AVX512BWVL-FAST-ALL-NEXT: vpermps (%rdi), %zmm0, %zmm0 +; AVX512BWVL-FAST-ALL-NEXT: vmovaps %ymm0, (%rsi) ; AVX512BWVL-FAST-ALL-NEXT: vzeroupper ; AVX512BWVL-FAST-ALL-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll index e7557134b1486..1d82d57e5552f 100644 --- 
a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll +++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll @@ -453,9 +453,8 @@ define <4 x double> @PR34175(ptr %p) { ; AVX512BWVL-LABEL: PR34175: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0] -; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm1 -; AVX512BWVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1 -; AVX512BWVL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX512BWVL-NEXT: vpermw (%rdi), %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX512BWVL-NEXT: vcvtdq2pd %xmm0, %ymm0 ; AVX512BWVL-NEXT: retq ; @@ -472,9 +471,8 @@ define <4 x double> @PR34175(ptr %p) { ; AVX512VBMIVL-LABEL: PR34175: ; AVX512VBMIVL: # %bb.0: ; AVX512VBMIVL-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0] -; AVX512VBMIVL-NEXT: vmovdqu (%rdi), %ymm1 -; AVX512VBMIVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1 -; AVX512VBMIVL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX512VBMIVL-NEXT: vpermw (%rdi), %zmm0, %zmm0 +; AVX512VBMIVL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX512VBMIVL-NEXT: vcvtdq2pd %xmm0, %ymm0 ; AVX512VBMIVL-NEXT: retq %v = load <32 x i16>, ptr %p, align 2 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll index 0cefc1c32d71b..a39bc6b668669 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll @@ -345,66 +345,66 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512BW-LABEL: load_i16_stride3_vf4: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [0,3,6,9,0,0,0,0] -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX512BW-NEXT: vpermi2w %xmm2, %xmm1, %xmm0 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [1,4,7,10,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %xmm2, %xmm1, %xmm3 -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,3,6,9,6,3,6,7] +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512BW-NEXT: vpermw %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,4,7,10,4,7,6,7] +; AVX512BW-NEXT: vpermw %ymm1, %ymm2, %ymm1 +; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = mem[0,3,2,3,4,5,6,7] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm3 = mem[2,1,2,3] +; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] +; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-NEXT: vmovq %xmm3, (%rdx) -; AVX512BW-NEXT: vmovq %xmm1, (%rcx) +; AVX512BW-NEXT: vmovq %xmm1, (%rdx) +; AVX512BW-NEXT: vmovq %xmm2, (%rcx) +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: load_i16_stride3_vf4: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [0,3,6,9,0,0,0,0] -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX512BW-FCP-NEXT: vpermi2w %xmm2, %xmm1, %xmm0 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [1,4,7,10,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %xmm2, %xmm1, %xmm3 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = 
[2,5,8,11,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %xmm2, %xmm1, %xmm4 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,3,6,9,6,3,6,7] +; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,4,7,10,4,7,6,7] +; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm2, %ymm2 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [2,5,8,11,2,3,10,11] +; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm3, %ymm1 ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rdx) -; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rcx) +; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rdx) +; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rcx) +; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i16_stride3_vf4: ; AVX512DQ-BW: # %bb.0: -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [0,3,6,9,0,0,0,0] -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX512DQ-BW-NEXT: vpermi2w %xmm2, %xmm1, %xmm0 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = [1,4,7,10,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %xmm2, %xmm1, %xmm3 -; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,3,6,9,6,3,6,7] +; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512DQ-BW-NEXT: vpermw %ymm1, %ymm0, %ymm0 +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,4,7,10,4,7,6,7] +; AVX512DQ-BW-NEXT: vpermw %ymm1, %ymm2, %ymm1 +; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm2 = mem[0,3,2,3,4,5,6,7] +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm3 = mem[2,1,2,3] +; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] +; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rdx) -; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rcx) +; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rdx) +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rcx) +; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: load_i16_stride3_vf4: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [0,3,6,9,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX512DQ-BW-FCP-NEXT: vpermi2w %xmm2, %xmm1, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [1,4,7,10,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %xmm2, %xmm1, %xmm3 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [2,5,8,11,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %xmm2, %xmm1, %xmm4 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,3,6,9,6,3,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,4,7,10,4,7,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm2, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [2,5,8,11,2,3,10,11] +; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm3, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <12 x i16>, ptr %in.vec, align 64 %strided.vec0 = shufflevector <12 x i16> %wide.vec, <12 x i16> poison, <4 x 
i32> @@ -629,64 +629,60 @@ define void @load_i16_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-LABEL: load_i16_stride3_vf8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,3,6,9,12,15,18,21] -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,4,7,10,13,16,19,22] -; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm3 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,5,8,11,14,17,20,23] -; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm4 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,4,7,10,13,16,19,22] +; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm2 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm3 = [2,5,8,11,14,17,20,23] +; AVX512BW-NEXT: vpermw %zmm1, %zmm3, %zmm1 ; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512BW-NEXT: vmovdqa %xmm3, (%rdx) -; AVX512BW-NEXT: vmovdqa %xmm4, (%rcx) +; AVX512BW-NEXT: vmovdqa %xmm2, (%rdx) +; AVX512BW-NEXT: vmovdqa %xmm1, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: load_i16_stride3_vf8: ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,3,6,9,12,15,18,21] -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm0 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,4,7,10,13,16,19,22] -; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm3 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,5,8,11,14,17,20,23] -; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm4 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,4,7,10,13,16,19,22] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [2,5,8,11,14,17,20,23] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512BW-FCP-NEXT: vmovdqa %xmm3, (%rdx) -; AVX512BW-FCP-NEXT: vmovdqa %xmm4, (%rcx) +; AVX512BW-FCP-NEXT: vmovdqa %xmm2, (%rdx) +; AVX512BW-FCP-NEXT: vmovdqa %xmm1, (%rcx) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i16_stride3_vf8: ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,3,6,9,12,15,18,21] -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm0 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,4,7,10,13,16,19,22] -; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm3 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,5,8,11,14,17,20,23] -; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm4 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,4,7,10,13,16,19,22] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm2 +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm3 = [2,5,8,11,14,17,20,23] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm3, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512DQ-BW-NEXT: vmovdqa %xmm3, (%rdx) -; AVX512DQ-BW-NEXT: vmovdqa %xmm4, (%rcx) +; AVX512DQ-BW-NEXT: vmovdqa %xmm2, (%rdx) +; AVX512DQ-BW-NEXT: vmovdqa %xmm1, (%rcx) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: load_i16_stride3_vf8: ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,3,6,9,12,15,18,21] -; 
AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,4,7,10,13,16,19,22] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,5,8,11,14,17,20,23] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,4,7,10,13,16,19,22] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [2,5,8,11,14,17,20,23] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm4, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm1, (%rcx) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <24 x i16>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll index 68e92d7cf773f..739e6e2369e36 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll @@ -596,24 +596,22 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX512BW-NEXT: vpextrw $7, %xmm2, %eax -; AVX512BW-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm2 +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX512BW-NEXT: vpextrw $7, %xmm3, %eax +; AVX512BW-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 ; AVX512BW-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = [2,7,12,17,0,0,0,0] -; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512BW-NEXT: vpermi2w %ymm3, %ymm4, %ymm2 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [3,8,13,18,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %ymm3, %ymm4, %ymm5 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm6 = [4,9,14,19,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %ymm3, %ymm4, %ymm6 -; AVX512BW-NEXT: vmovq %xmm1, (%rsi) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm3, %zmm3 +; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm4, %zmm4 +; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm5, %zmm1 +; AVX512BW-NEXT: vmovq %xmm2, (%rsi) ; AVX512BW-NEXT: vmovq %xmm0, (%rdx) -; AVX512BW-NEXT: vmovq %xmm2, (%rcx) -; AVX512BW-NEXT: vmovq %xmm5, (%r8) -; AVX512BW-NEXT: vmovq %xmm6, (%r9) +; AVX512BW-NEXT: vmovq %xmm3, (%rcx) +; AVX512BW-NEXT: vmovq %xmm4, (%r8) +; AVX512BW-NEXT: vmovq %xmm1, (%r9) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -623,24 +621,22 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 -; 
AVX512BW-FCP-NEXT: vpextrw $7, %xmm2, %eax -; AVX512BW-FCP-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX512BW-FCP-NEXT: vpextrw $7, %xmm3, %eax +; AVX512BW-FCP-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 ; AVX512BW-FCP-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [2,7,12,17,0,0,0,0] -; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512BW-FCP-NEXT: vpermi2w %ymm3, %ymm4, %ymm2 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [3,8,13,18,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %ymm3, %ymm4, %ymm5 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [4,9,14,19,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %ymm3, %ymm4, %ymm6 -; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rsi) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3 +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4 +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm1 +; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi) ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rdx) -; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rcx) -; AVX512BW-FCP-NEXT: vmovq %xmm5, (%r8) -; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r9) +; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rcx) +; AVX512BW-FCP-NEXT: vmovq %xmm4, (%r8) +; AVX512BW-FCP-NEXT: vmovq %xmm1, (%r9) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -650,24 +646,22 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX512DQ-BW-NEXT: vpextrw $7, %xmm2, %eax -; AVX512DQ-BW-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX512DQ-BW-NEXT: vpextrw $7, %xmm3, %eax +; AVX512DQ-BW-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 ; AVX512DQ-BW-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = [2,7,12,17,0,0,0,0] -; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512DQ-BW-NEXT: vpermi2w %ymm3, %ymm4, %ymm2 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = [3,8,13,18,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %ymm3, %ymm4, %ymm5 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm6 = [4,9,14,19,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %ymm3, %ymm4, %ymm6 -; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rsi) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm3, %zmm3 +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm4, %zmm4 +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm5, %zmm1 +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rsi) ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rdx) -; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rcx) -; AVX512DQ-BW-NEXT: vmovq %xmm5, (%r8) -; AVX512DQ-BW-NEXT: vmovq %xmm6, (%r9) +; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rcx) +; AVX512DQ-BW-NEXT: vmovq %xmm4, (%r8) +; AVX512DQ-BW-NEXT: vmovq %xmm1, (%r9) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -677,24 +671,22 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: 
vmovdqa64 (%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX512DQ-BW-FCP-NEXT: vpextrw $7, %xmm2, %eax -; AVX512DQ-BW-FCP-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX512DQ-BW-FCP-NEXT: vpextrw $7, %xmm3, %eax +; AVX512DQ-BW-FCP-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 ; AVX512DQ-BW-FCP-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [2,7,12,17,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm3, %ymm4, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [3,8,13,18,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm3, %ymm4, %ymm5 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [4,9,14,19,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm3, %ymm4, %ymm6 -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%r9) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <20 x i16>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll index 751412c77a59a..c3b53211978ae 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll @@ -293,8 +293,8 @@ define void @load_i16_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] ; AVX512BW-FCP-NEXT: vpbroadcastw 4(%rdi), %xmm4 ; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX512BW-FCP-NEXT: vmovd {{.*#+}} xmm5 = [3,9,0,0,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %xmm1, %xmm0, %xmm5 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm5 = [3,9,1,9,2,10,3,11] +; AVX512BW-FCP-NEXT: vpermw (%rdi), %ymm5, %ymm5 ; AVX512BW-FCP-NEXT: vpbroadcastw 20(%rdi), %xmm6 ; AVX512BW-FCP-NEXT: vpbroadcastw 8(%rdi), %xmm7 ; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] @@ -307,6 +307,7 @@ define void @load_i16_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovd %xmm5, (%r8) ; AVX512BW-FCP-NEXT: vmovd %xmm6, (%r9) ; AVX512BW-FCP-NEXT: vmovd %xmm0, (%rax) +; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i16_stride6_vf2: @@ -346,8 +347,8 @@ define void @load_i16_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: 
vpbroadcastw 4(%rdi), %xmm4 ; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX512DQ-BW-FCP-NEXT: vmovd {{.*#+}} xmm5 = [3,9,0,0,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %xmm1, %xmm0, %xmm5 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm5 = [3,9,1,9,2,10,3,11] +; AVX512DQ-BW-FCP-NEXT: vpermw (%rdi), %ymm5, %ymm5 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastw 20(%rdi), %xmm6 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastw 8(%rdi), %xmm7 ; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] @@ -360,6 +361,7 @@ define void @load_i16_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovd %xmm5, (%r8) ; AVX512DQ-BW-FCP-NEXT: vmovd %xmm6, (%r9) ; AVX512DQ-BW-FCP-NEXT: vmovd %xmm0, (%rax) +; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <12 x i16>, ptr %in.vec, align 64 %strided.vec0 = shufflevector <12 x i16> %wide.vec, <12 x i16> poison, <2 x i32> @@ -580,21 +582,20 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,1,10,7] -; AVX512-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512-NEXT: vpermi2d %ymm2, %ymm4, %ymm1 -; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpermd (%rdi), %zmm1, %zmm1 +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm6 = [0,13,10,3] -; AVX512-NEXT: vpermi2d %ymm4, %ymm2, %ymm6 -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm4 = [0,13,10,3] +; AVX512-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX512-NEXT: vpermt2d (%rdi), %ymm4, %ymm5 +; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vmovq %xmm3, (%rsi) ; AVX512-NEXT: vmovq %xmm0, (%rdx) -; AVX512-NEXT: vmovq %xmm5, (%rcx) +; AVX512-NEXT: vmovq %xmm2, (%rcx) ; AVX512-NEXT: vmovq %xmm1, (%r8) -; AVX512-NEXT: vmovq %xmm2, (%r9) -; AVX512-NEXT: vmovq %xmm4, (%rax) +; AVX512-NEXT: vmovq %xmm4, (%r9) +; AVX512-NEXT: vmovq %xmm5, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -612,21 +613,20 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,1,10,7] -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm4, %ymm1 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpermd (%rdi), %zmm1, %zmm1 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [0,13,10,3] -; AVX512-FCP-NEXT: vpermi2d %ymm4, %ymm2, %ymm6 -; AVX512-FCP-NEXT: 
vpshufb {{.*#+}} xmm2 = xmm6[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [0,13,10,3] +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX512-FCP-NEXT: vpermt2d (%rdi), %ymm4, %ymm5 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vmovq %xmm3, (%rsi) ; AVX512-FCP-NEXT: vmovq %xmm0, (%rdx) -; AVX512-FCP-NEXT: vmovq %xmm5, (%rcx) +; AVX512-FCP-NEXT: vmovq %xmm2, (%rcx) ; AVX512-FCP-NEXT: vmovq %xmm1, (%r8) -; AVX512-FCP-NEXT: vmovq %xmm2, (%r9) -; AVX512-FCP-NEXT: vmovq %xmm4, (%rax) +; AVX512-FCP-NEXT: vmovq %xmm4, (%r9) +; AVX512-FCP-NEXT: vmovq %xmm5, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -645,21 +645,20 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,1,10,7] -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512DQ-NEXT: vpermi2d %ymm2, %ymm4, %ymm1 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpermd (%rdi), %zmm1, %zmm1 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm6 = [0,13,10,3] -; AVX512DQ-NEXT: vpermi2d %ymm4, %ymm2, %ymm6 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm4 = [0,13,10,3] +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX512DQ-NEXT: vpermt2d (%rdi), %ymm4, %ymm5 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vmovq %xmm3, (%rsi) ; AVX512DQ-NEXT: vmovq %xmm0, (%rdx) -; AVX512DQ-NEXT: vmovq %xmm5, (%rcx) +; AVX512DQ-NEXT: vmovq %xmm2, (%rcx) ; AVX512DQ-NEXT: vmovq %xmm1, (%r8) -; AVX512DQ-NEXT: vmovq %xmm2, (%r9) -; AVX512DQ-NEXT: vmovq %xmm4, (%rax) +; AVX512DQ-NEXT: vmovq %xmm4, (%r9) +; AVX512DQ-NEXT: vmovq %xmm5, (%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -677,21 +676,20 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,1,10,7] -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm4, %ymm1 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpermd (%rdi), %zmm1, %zmm1 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [0,13,10,3] -; AVX512DQ-FCP-NEXT: vpermi2d %ymm4, %ymm2, %ymm6 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] -; 
AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [0,13,10,3] +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX512DQ-FCP-NEXT: vpermt2d (%rdi), %ymm4, %ymm5 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rsi) ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rdx) -; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%rcx) +; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rcx) ; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%r8) -; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%r9) -; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rax) +; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%r9) +; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -699,25 +697,24 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [0,6,12,18,0,0,0,0] -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm0 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [1,7,13,19,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm3 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [2,8,14,20,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm4 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [3,9,15,21,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm5 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm6 = [4,10,16,22,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm6 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm7 = [5,11,17,23,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm7 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = [1,7,13,19,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm2 +; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [2,8,14,20,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm3, %zmm3 +; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [3,9,15,21,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm4, %zmm4 +; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [4,10,16,22,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm5, %zmm5 +; AVX512BW-NEXT: vmovq {{.*#+}} xmm6 = [5,11,17,23,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm6, %zmm1 ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-NEXT: vmovq %xmm3, (%rdx) -; AVX512BW-NEXT: vmovq %xmm4, (%rcx) -; AVX512BW-NEXT: vmovq %xmm5, (%r8) -; AVX512BW-NEXT: vmovq %xmm6, (%r9) -; AVX512BW-NEXT: vmovq %xmm7, (%rax) +; AVX512BW-NEXT: vmovq %xmm2, (%rdx) +; AVX512BW-NEXT: vmovq %xmm3, (%rcx) +; AVX512BW-NEXT: vmovq %xmm4, (%r8) +; AVX512BW-NEXT: vmovq %xmm5, (%r9) +; AVX512BW-NEXT: vmovq %xmm1, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -725,25 +722,24 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [0,6,12,18,0,0,0,0] -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm0 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [1,7,13,19,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm3 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [2,8,14,20,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm4 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [3,9,15,21,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm5 -; 
AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [4,10,16,22,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm6 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm7 = [5,11,17,23,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm7 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [1,7,13,19,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2 +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,8,14,20,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3 +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,9,15,21,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4 +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,10,16,22,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm5 +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [5,11,17,23,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm6, %zmm1 ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rdx) -; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rcx) -; AVX512BW-FCP-NEXT: vmovq %xmm5, (%r8) -; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r9) -; AVX512BW-FCP-NEXT: vmovq %xmm7, (%rax) +; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rdx) +; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rcx) +; AVX512BW-FCP-NEXT: vmovq %xmm4, (%r8) +; AVX512BW-FCP-NEXT: vmovq %xmm5, (%r9) +; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -751,25 +747,24 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [0,6,12,18,0,0,0,0] -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm0 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = [1,7,13,19,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm3 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = [2,8,14,20,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm4 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = [3,9,15,21,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm5 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm6 = [4,10,16,22,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm6 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm7 = [5,11,17,23,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm7 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = [1,7,13,19,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm2 +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = [2,8,14,20,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm3, %zmm3 +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = [3,9,15,21,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm4, %zmm4 +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = [4,10,16,22,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm5, %zmm5 +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm6 = [5,11,17,23,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm6, %zmm1 ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rdx) -; AVX512DQ-BW-NEXT: vmovq %xmm4, (%rcx) -; AVX512DQ-BW-NEXT: vmovq %xmm5, (%r8) -; AVX512DQ-BW-NEXT: vmovq %xmm6, (%r9) -; AVX512DQ-BW-NEXT: vmovq %xmm7, (%rax) +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rdx) +; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rcx) +; AVX512DQ-BW-NEXT: vmovq %xmm4, (%r8) +; AVX512DQ-BW-NEXT: vmovq %xmm5, (%r9) +; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -777,25 +772,24 @@ define void 
@load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [0,6,12,18,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [1,7,13,19,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [2,8,14,20,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm4 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [3,9,15,21,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm5 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [4,10,16,22,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm6 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm7 = [5,11,17,23,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [1,7,13,19,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,8,14,20,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,9,15,21,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,10,16,22,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [5,11,17,23,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm6, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm7, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <24 x i16>, ptr %in.vec, align 64 @@ -2865,224 +2859,228 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-LABEL: load_i16_stride6_vf16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] ; AVX512BW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqa 160(%rdi), %ymm4 -; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm5 -; AVX512BW-NEXT: vpermi2w %ymm4, %ymm5, %ymm0 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm5 +; AVX512BW-NEXT: vpermw %zmm5, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm1 +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm1 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] ; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2w %ymm4, %ymm5, %ymm1 -; 
AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 -; AVX512BW-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7],ymm6[8,9,10],ymm1[11,12,13,14,15] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] -; AVX512BW-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2w %ymm5, %ymm4, %ymm6 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm7 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm7 -; AVX512BW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] +; AVX512BW-NEXT: vpermw %zmm5, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0] +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm2 +; AVX512BW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] +; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm6 +; AVX512BW-NEXT: vmovdqa 160(%rdi), %ymm7 +; AVX512BW-NEXT: vpermi2w %ymm6, %ymm7, %ymm2 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0] +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm8 +; AVX512BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3,4,5,6,7],ymm8[8,9,10],ymm2[11,12,13,14,15] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] +; AVX512BW-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermi2w %ymm6, %ymm7, %ymm8 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0] +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm6 +; AVX512BW-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0,1,2],ymm8[3,4,5,6,7],ymm6[8,9,10],ymm8[11,12,13,14,15] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] ; AVX512BW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2w %ymm5, %ymm4, %ymm7 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm8 -; AVX512BW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7],ymm8[8,9,10],ymm7[11,12,13,14,15] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] +; AVX512BW-NEXT: vpermw %zmm5, %zmm7, %zmm7 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0] +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm8 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7] +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] ; AVX512BW-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2w %ymm4, %ymm5, %ymm8 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm9 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm9 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6,7] -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = 
[0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] -; AVX512BW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2w %ymm4, %ymm5, %ymm9 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm4 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm9[5,6,7] +; AVX512BW-NEXT: vpermw %zmm5, %zmm8, %zmm5 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0] +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm8 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4],ymm5[5,6,7] ; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512BW-NEXT: vmovdqa %ymm1, (%rdx) -; AVX512BW-NEXT: vmovdqa %ymm6, (%rcx) -; AVX512BW-NEXT: vmovdqa %ymm7, (%r8) -; AVX512BW-NEXT: vmovdqa %ymm8, (%r9) -; AVX512BW-NEXT: vmovdqa %ymm2, (%rax) +; AVX512BW-NEXT: vmovdqa %ymm2, (%rcx) +; AVX512BW-NEXT: vmovdqa %ymm6, (%r8) +; AVX512BW-NEXT: vmovdqa %ymm7, (%r9) +; AVX512BW-NEXT: vmovdqa %ymm3, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: load_i16_stride6_vf16: ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] ; AVX512BW-FCP-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm4 -; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm5 -; AVX512BW-FCP-NEXT: vpermi2w %ymm4, %ymm5, %ymm0 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm5 +; AVX512BW-FCP-NEXT: vpermw %zmm5, %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm1 +; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm1 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] ; AVX512BW-FCP-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-FCP-NEXT: vpermi2w %ymm4, %ymm5, %ymm1 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 -; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7],ymm6[8,9,10],ymm1[11,12,13,14,15] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] -; AVX512BW-FCP-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512BW-FCP-NEXT: vpermi2w %ymm5, %ymm4, %ymm6 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm7 -; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] +; AVX512BW-FCP-NEXT: vpermw %zmm5, %zmm1, %zmm1 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm2 +; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] +; AVX512BW-FCP-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm6 +; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm7 +; AVX512BW-FCP-NEXT: vpermi2w %ymm6, %ymm7, %ymm2 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm4, %zmm8 +; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3,4,5,6,7],ymm8[8,9,10],ymm2[11,12,13,14,15] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] +; AVX512BW-FCP-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vpermi2w %ymm6, %ymm7, %ymm8 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm4, %zmm6 +; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0,1,2],ymm8[3,4,5,6,7],ymm6[8,9,10],ymm8[11,12,13,14,15] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] ; AVX512BW-FCP-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512BW-FCP-NEXT: vpermi2w %ymm5, %ymm4, %ymm7 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm8 -; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7],ymm8[8,9,10],ymm7[11,12,13,14,15] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] +; AVX512BW-FCP-NEXT: vpermw %zmm5, %zmm7, %zmm7 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm8 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7] +; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] ; AVX512BW-FCP-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512BW-FCP-NEXT: vpermi2w %ymm4, %ymm5, %ymm8 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm9 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] -; AVX512BW-FCP-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512BW-FCP-NEXT: vpermi2w %ymm4, %ymm5, %ymm9 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm9[5,6,7] +; AVX512BW-FCP-NEXT: vpermw %zmm5, %zmm8, %zmm5 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm8 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4],ymm5[5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa %ymm1, (%rdx) -; AVX512BW-FCP-NEXT: vmovdqa %ymm6, (%rcx) -; AVX512BW-FCP-NEXT: vmovdqa %ymm7, (%r8) -; AVX512BW-FCP-NEXT: vmovdqa %ymm8, (%r9) -; AVX512BW-FCP-NEXT: vmovdqa %ymm2, (%rax) +; AVX512BW-FCP-NEXT: vmovdqa %ymm2, (%rcx) +; AVX512BW-FCP-NEXT: vmovdqa %ymm6, (%r8) +; AVX512BW-FCP-NEXT: vmovdqa %ymm7, (%r9) +; AVX512BW-FCP-NEXT: vmovdqa %ymm3, (%rax) ; 
AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i16_stride6_vf16: ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] ; AVX512DQ-BW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512DQ-BW-NEXT: vmovdqa 160(%rdi), %ymm4 -; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm5 -; AVX512DQ-BW-NEXT: vpermi2w %ymm4, %ymm5, %ymm0 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm5 +; AVX512DQ-BW-NEXT: vpermw %zmm5, %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm1 +; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm1 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] ; AVX512DQ-BW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-BW-NEXT: vpermi2w %ymm4, %ymm5, %ymm1 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7],ymm6[8,9,10],ymm1[11,12,13,14,15] -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] -; AVX512DQ-BW-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512DQ-BW-NEXT: vpermi2w %ymm5, %ymm4, %ymm6 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm7 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm7 -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15] -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] +; AVX512DQ-BW-NEXT: vpermw %zmm5, %zmm1, %zmm1 +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm2 +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] +; AVX512DQ-BW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm6 +; AVX512DQ-BW-NEXT: vmovdqa 160(%rdi), %ymm7 +; AVX512DQ-BW-NEXT: vpermi2w %ymm6, %ymm7, %ymm2 +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm8 +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3,4,5,6,7],ymm8[8,9,10],ymm2[11,12,13,14,15] +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] +; AVX512DQ-BW-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vpermi2w %ymm6, %ymm7, %ymm8 +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm6 +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm7 = 
ymm6[0,1,2],ymm8[3,4,5,6,7],ymm6[8,9,10],ymm8[11,12,13,14,15] +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] ; AVX512DQ-BW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512DQ-BW-NEXT: vpermi2w %ymm5, %ymm4, %ymm7 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm8 -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7],ymm8[8,9,10],ymm7[11,12,13,14,15] -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] +; AVX512DQ-BW-NEXT: vpermw %zmm5, %zmm7, %zmm7 +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm8 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7] +; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] ; AVX512DQ-BW-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512DQ-BW-NEXT: vpermi2w %ymm4, %ymm5, %ymm8 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm9 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm9 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] -; AVX512DQ-BW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512DQ-BW-NEXT: vpermi2w %ymm4, %ymm5, %ymm9 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm4 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm9[5,6,7] +; AVX512DQ-BW-NEXT: vpermw %zmm5, %zmm8, %zmm5 +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm8 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4],ymm5[5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa %ymm1, (%rdx) -; AVX512DQ-BW-NEXT: vmovdqa %ymm6, (%rcx) -; AVX512DQ-BW-NEXT: vmovdqa %ymm7, (%r8) -; AVX512DQ-BW-NEXT: vmovdqa %ymm8, (%r9) -; AVX512DQ-BW-NEXT: vmovdqa %ymm2, (%rax) +; AVX512DQ-BW-NEXT: vmovdqa %ymm2, (%rcx) +; AVX512DQ-BW-NEXT: vmovdqa %ymm6, (%r8) +; AVX512DQ-BW-NEXT: vmovdqa %ymm7, (%r9) +; AVX512DQ-BW-NEXT: vmovdqa %ymm3, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: load_i16_stride6_vf16: ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] ; AVX512DQ-BW-FCP-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm5 -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm4, %ymm5, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm5, %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] ; AVX512DQ-BW-FCP-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm4, %ymm5, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7],ymm6[8,9,10],ymm1[11,12,13,14,15] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] -; AVX512DQ-BW-FCP-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm5, %ymm4, %ymm6 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm5, %zmm1, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] +; AVX512DQ-BW-FCP-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm7 +; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm6, %ymm7, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm4, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3,4,5,6,7],ymm8[8,9,10],ymm2[11,12,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] +; AVX512DQ-BW-FCP-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm6, %ymm7, %ymm8 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm4, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0,1,2],ymm8[3,4,5,6,7],ymm6[8,9,10],ymm8[11,12,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] ; AVX512DQ-BW-FCP-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm5, %ymm4, %ymm7 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7],ymm8[8,9,10],ymm7[11,12,13,14,15] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm5, %zmm7, %zmm7 +; 
AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] ; AVX512DQ-BW-FCP-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm4, %ymm5, %ymm8 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] -; AVX512DQ-BW-FCP-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm4, %ymm5, %ymm9 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm9[5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm5, %zmm8, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4],ymm5[5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm1, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm6, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm7, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm8, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm6, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm7, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm3, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <96 x i16>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll index 713bd757a7b99..95b5ffde48564 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll @@ -321,22 +321,23 @@ define void @load_i16_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,14,15,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpsrld $16, %xmm0, %xmm3 ; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[8,9,6,7,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpbroadcastw 8(%rdi), %xmm7 -; AVX512BW-FCP-NEXT: vpsrlq $48, %xmm1, %xmm8 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX512BW-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX512BW-FCP-NEXT: vmovd {{.*#+}} xmm8 = [6,13,0,0,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %xmm1, %xmm0, %xmm8 +; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[8,9,6,7,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 
= xmm0[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpbroadcastw 8(%rdi), %xmm6 +; AVX512BW-FCP-NEXT: vpsrlq $48, %xmm1, %xmm7 +; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX512BW-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [6,13,5,13,6,14,7,15] +; AVX512BW-FCP-NEXT: vpermw (%rdi), %ymm3, %ymm3 ; AVX512BW-FCP-NEXT: vmovd %xmm2, (%rsi) ; AVX512BW-FCP-NEXT: vmovd %xmm4, (%rdx) -; AVX512BW-FCP-NEXT: vmovd %xmm6, (%rcx) -; AVX512BW-FCP-NEXT: vmovd %xmm5, (%r8) -; AVX512BW-FCP-NEXT: vmovd %xmm7, (%r9) -; AVX512BW-FCP-NEXT: vmovd %xmm3, (%r10) -; AVX512BW-FCP-NEXT: vmovd %xmm8, (%rax) +; AVX512BW-FCP-NEXT: vmovd %xmm5, (%rcx) +; AVX512BW-FCP-NEXT: vmovd %xmm0, (%r8) +; AVX512BW-FCP-NEXT: vmovd %xmm6, (%r9) +; AVX512BW-FCP-NEXT: vmovd %xmm1, (%r10) +; AVX512BW-FCP-NEXT: vmovd %xmm3, (%rax) +; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i16_stride7_vf2: @@ -378,22 +379,23 @@ define void @load_i16_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,14,15,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpsrld $16, %xmm0, %xmm3 ; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[8,9,6,7,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw 8(%rdi), %xmm7 -; AVX512DQ-BW-FCP-NEXT: vpsrlq $48, %xmm1, %xmm8 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX512DQ-BW-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX512DQ-BW-FCP-NEXT: vmovd {{.*#+}} xmm8 = [6,13,0,0,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %xmm1, %xmm0, %xmm8 +; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[8,9,6,7,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpbroadcastw 8(%rdi), %xmm6 +; AVX512DQ-BW-FCP-NEXT: vpsrlq $48, %xmm1, %xmm7 +; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX512DQ-BW-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [6,13,5,13,6,14,7,15] +; AVX512DQ-BW-FCP-NEXT: vpermw (%rdi), %ymm3, %ymm3 ; AVX512DQ-BW-FCP-NEXT: vmovd %xmm2, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovd %xmm4, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovd %xmm6, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovd %xmm5, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovd %xmm7, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovd %xmm3, (%r10) -; AVX512DQ-BW-FCP-NEXT: vmovd %xmm8, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovd %xmm5, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovd %xmm0, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovd %xmm6, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovd %xmm1, (%r10) +; AVX512DQ-BW-FCP-NEXT: vmovd %xmm3, (%rax) +; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq 
%wide.vec = load <14 x i16>, ptr %in.vec, align 64 %strided.vec0 = shufflevector <14 x i16> %wide.vec, <14 x i16> poison, <2 x i32> @@ -906,28 +908,27 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [0,7,14,21,0,0,0,0] -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm0 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [1,8,15,22,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm3 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [2,9,16,23,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm4 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [3,10,17,24,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm5 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm6 = [4,11,18,25,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm6 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm7 = [5,12,19,26,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm7 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm8 = [6,13,20,27,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm8 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = [1,8,15,22,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm2 +; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [2,9,16,23,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm3, %zmm3 +; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [3,10,17,24,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm4, %zmm4 +; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [4,11,18,25,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm5, %zmm5 +; AVX512BW-NEXT: vmovq {{.*#+}} xmm6 = [5,12,19,26,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm6, %zmm6 +; AVX512BW-NEXT: vmovq {{.*#+}} xmm7 = [6,13,20,27,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm7, %zmm1 ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-NEXT: vmovq %xmm3, (%rdx) -; AVX512BW-NEXT: vmovq %xmm4, (%rcx) -; AVX512BW-NEXT: vmovq %xmm5, (%r8) -; AVX512BW-NEXT: vmovq %xmm6, (%r9) -; AVX512BW-NEXT: vmovq %xmm7, (%r10) -; AVX512BW-NEXT: vmovq %xmm8, (%rax) +; AVX512BW-NEXT: vmovq %xmm2, (%rdx) +; AVX512BW-NEXT: vmovq %xmm3, (%rcx) +; AVX512BW-NEXT: vmovq %xmm4, (%r8) +; AVX512BW-NEXT: vmovq %xmm5, (%r9) +; AVX512BW-NEXT: vmovq %xmm6, (%r10) +; AVX512BW-NEXT: vmovq %xmm1, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -936,28 +937,27 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [0,7,14,21,0,0,0,0] -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm0 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [1,8,15,22,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm3 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [2,9,16,23,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm4 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [3,10,17,24,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm5 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [4,11,18,25,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm6 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm7 = [5,12,19,26,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm7 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm8 = [6,13,20,27,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm8 +; AVX512BW-FCP-NEXT: vmovdqa64 
(%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [1,8,15,22,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2 +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,9,16,23,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3 +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,10,17,24,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4 +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,11,18,25,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm5 +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [5,12,19,26,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm6, %zmm6 +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm7 = [6,13,20,27,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm7, %zmm1 ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rdx) -; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rcx) -; AVX512BW-FCP-NEXT: vmovq %xmm5, (%r8) -; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r9) -; AVX512BW-FCP-NEXT: vmovq %xmm7, (%r10) -; AVX512BW-FCP-NEXT: vmovq %xmm8, (%rax) +; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rdx) +; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rcx) +; AVX512BW-FCP-NEXT: vmovq %xmm4, (%r8) +; AVX512BW-FCP-NEXT: vmovq %xmm5, (%r9) +; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r10) +; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -966,28 +966,27 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [0,7,14,21,0,0,0,0] -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm0 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = [1,8,15,22,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm3 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = [2,9,16,23,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm4 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = [3,10,17,24,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm5 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm6 = [4,11,18,25,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm6 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm7 = [5,12,19,26,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm7 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm8 = [6,13,20,27,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm8 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = [1,8,15,22,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm2 +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = [2,9,16,23,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm3, %zmm3 +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = [3,10,17,24,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm4, %zmm4 +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = [4,11,18,25,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm5, %zmm5 +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm6 = [5,12,19,26,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm6, %zmm6 +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm7 = [6,13,20,27,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm7, %zmm1 ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rdx) -; AVX512DQ-BW-NEXT: vmovq %xmm4, (%rcx) -; AVX512DQ-BW-NEXT: vmovq %xmm5, (%r8) -; AVX512DQ-BW-NEXT: vmovq %xmm6, (%r9) -; AVX512DQ-BW-NEXT: vmovq %xmm7, (%r10) -; AVX512DQ-BW-NEXT: vmovq %xmm8, (%rax) +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rdx) +; AVX512DQ-BW-NEXT: vmovq %xmm3, 
(%rcx) +; AVX512DQ-BW-NEXT: vmovq %xmm4, (%r8) +; AVX512DQ-BW-NEXT: vmovq %xmm5, (%r9) +; AVX512DQ-BW-NEXT: vmovq %xmm6, (%r10) +; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -996,28 +995,27 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [0,7,14,21,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [1,8,15,22,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [2,9,16,23,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm4 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [3,10,17,24,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm5 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [4,11,18,25,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm6 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm7 = [5,12,19,26,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm7 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm8 = [6,13,20,27,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [1,8,15,22,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,9,16,23,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,10,17,24,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,11,18,25,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [5,12,19,26,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm6, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm7 = [6,13,20,27,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm7, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm7, (%r10) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm8, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r10) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <28 x i16>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll index 051b4e300b827..fff21f9aad1bb 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll @@ -623,31 +623,30 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0] -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2 -; 
AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm0 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [1,9,17,25,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm3 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [2,10,18,26,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm4 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [3,11,19,27,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm5 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm6 = [4,12,20,28,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm6 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm7 = [5,13,21,29,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm7 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm8 = [6,14,22,30,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm8 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm9 = [7,15,23,31,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm9 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = [1,9,17,25,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm2 +; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [2,10,18,26,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm3, %zmm3 +; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [3,11,19,27,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm4, %zmm4 +; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [4,12,20,28,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm5, %zmm5 +; AVX512BW-NEXT: vmovq {{.*#+}} xmm6 = [5,13,21,29,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm6, %zmm6 +; AVX512BW-NEXT: vmovq {{.*#+}} xmm7 = [6,14,22,30,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm7, %zmm7 +; AVX512BW-NEXT: vmovq {{.*#+}} xmm8 = [7,15,23,31,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm8, %zmm1 ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-NEXT: vmovq %xmm3, (%rdx) -; AVX512BW-NEXT: vmovq %xmm4, (%rcx) -; AVX512BW-NEXT: vmovq %xmm5, (%r8) -; AVX512BW-NEXT: vmovq %xmm6, (%r9) -; AVX512BW-NEXT: vmovq %xmm7, (%r11) -; AVX512BW-NEXT: vmovq %xmm8, (%r10) -; AVX512BW-NEXT: vmovq %xmm9, (%rax) +; AVX512BW-NEXT: vmovq %xmm2, (%rdx) +; AVX512BW-NEXT: vmovq %xmm3, (%rcx) +; AVX512BW-NEXT: vmovq %xmm4, (%r8) +; AVX512BW-NEXT: vmovq %xmm5, (%r9) +; AVX512BW-NEXT: vmovq %xmm6, (%r11) +; AVX512BW-NEXT: vmovq %xmm7, (%r10) +; AVX512BW-NEXT: vmovq %xmm1, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -657,31 +656,30 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0] -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm0 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [1,9,17,25,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm3 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [2,10,18,26,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm4 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [3,11,19,27,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm5 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [4,12,20,28,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm6 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm7 = [5,13,21,29,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm7 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm8 = [6,14,22,30,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm8 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm9 = [7,15,23,31,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm9 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [1,9,17,25,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2 +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,10,18,26,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3 +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,11,19,27,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4 +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,12,20,28,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm5 +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [5,13,21,29,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm6, %zmm6 +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm7 = [6,14,22,30,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm7, %zmm7 +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm8 = [7,15,23,31,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm8, %zmm1 ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rdx) -; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rcx) -; AVX512BW-FCP-NEXT: vmovq %xmm5, (%r8) -; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r9) -; AVX512BW-FCP-NEXT: vmovq %xmm7, (%r11) -; AVX512BW-FCP-NEXT: vmovq %xmm8, (%r10) -; AVX512BW-FCP-NEXT: vmovq %xmm9, (%rax) +; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rdx) +; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rcx) +; AVX512BW-FCP-NEXT: vmovq %xmm4, (%r8) +; AVX512BW-FCP-NEXT: vmovq %xmm5, (%r9) +; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r11) +; AVX512BW-FCP-NEXT: vmovq %xmm7, (%r10) +; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -691,31 +689,30 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0] -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm0 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = [1,9,17,25,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm3 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = [2,10,18,26,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm4 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = [3,11,19,27,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm5 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm6 = [4,12,20,28,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm6 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm7 = [5,13,21,29,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm7 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm8 = [6,14,22,30,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm8 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm9 = [7,15,23,31,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm9 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = [1,9,17,25,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm2 +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = [2,10,18,26,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm3, %zmm3 +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = [3,11,19,27,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm4, %zmm4 +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = [4,12,20,28,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm5, %zmm5 +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm6 = [5,13,21,29,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm6, %zmm6 +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm7 = [6,14,22,30,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm7, %zmm7 +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm8 = [7,15,23,31,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm8, %zmm1 
; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rdx) -; AVX512DQ-BW-NEXT: vmovq %xmm4, (%rcx) -; AVX512DQ-BW-NEXT: vmovq %xmm5, (%r8) -; AVX512DQ-BW-NEXT: vmovq %xmm6, (%r9) -; AVX512DQ-BW-NEXT: vmovq %xmm7, (%r11) -; AVX512DQ-BW-NEXT: vmovq %xmm8, (%r10) -; AVX512DQ-BW-NEXT: vmovq %xmm9, (%rax) +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rdx) +; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rcx) +; AVX512DQ-BW-NEXT: vmovq %xmm4, (%r8) +; AVX512DQ-BW-NEXT: vmovq %xmm5, (%r9) +; AVX512DQ-BW-NEXT: vmovq %xmm6, (%r11) +; AVX512DQ-BW-NEXT: vmovq %xmm7, (%r10) +; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -725,31 +722,30 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [1,9,17,25,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [2,10,18,26,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm4 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [3,11,19,27,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm5 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [4,12,20,28,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm6 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm7 = [5,13,21,29,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm7 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm8 = [6,14,22,30,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm8 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm9 = [7,15,23,31,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [1,9,17,25,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,10,18,26,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,11,19,27,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,12,20,28,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [5,13,21,29,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm6, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm7 = [6,14,22,30,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm7, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm8 = [7,15,23,31,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm8, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm7, (%r11) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm8, (%r10) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm9, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r11) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm7, (%r10) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%rax) 
; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <32 x i16>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll index 7cb46b79f7f36..f2c5a91d2cca3 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll @@ -363,11 +363,10 @@ define void @load_i32_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512-FCP-LABEL: load_i32_stride2_vf8: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15] -; AVX512-FCP-NEXT: vpermi2d 32(%rdi), %ymm1, %ymm2 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,3,5,7,9,11,13,15] +; AVX512-FCP-NEXT: vpermps (%rdi), %zmm1, %zmm1 ; AVX512-FCP-NEXT: vpmovqd %zmm0, (%rsi) -; AVX512-FCP-NEXT: vmovdqa %ymm2, (%rdx) +; AVX512-FCP-NEXT: vmovaps %ymm1, (%rdx) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -385,11 +384,10 @@ define void @load_i32_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512DQ-FCP-LABEL: load_i32_stride2_vf8: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15] -; AVX512DQ-FCP-NEXT: vpermi2d 32(%rdi), %ymm1, %ymm2 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,3,5,7,9,11,13,15] +; AVX512DQ-FCP-NEXT: vpermps (%rdi), %zmm1, %zmm1 ; AVX512DQ-FCP-NEXT: vpmovqd %zmm0, (%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, (%rdx) +; AVX512DQ-FCP-NEXT: vmovaps %ymm1, (%rdx) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -407,11 +405,10 @@ define void @load_i32_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512BW-FCP-LABEL: load_i32_stride2_vf8: ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15] -; AVX512BW-FCP-NEXT: vpermi2d 32(%rdi), %ymm1, %ymm2 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,3,5,7,9,11,13,15] +; AVX512BW-FCP-NEXT: vpermps (%rdi), %zmm1, %zmm1 ; AVX512BW-FCP-NEXT: vpmovqd %zmm0, (%rsi) -; AVX512BW-FCP-NEXT: vmovdqa %ymm2, (%rdx) +; AVX512BW-FCP-NEXT: vmovaps %ymm1, (%rdx) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -429,11 +426,10 @@ define void @load_i32_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512DQ-BW-FCP-LABEL: load_i32_stride2_vf8: ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15] -; AVX512DQ-BW-FCP-NEXT: vpermi2d 32(%rdi), %ymm1, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,3,5,7,9,11,13,15] +; AVX512DQ-BW-FCP-NEXT: vpermps (%rdi), %zmm1, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpmovqd %zmm0, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovaps %ymm1, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <16 x i32>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll index 213c5febfca23..d9383f524f1d1 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll +++ 
b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll @@ -310,128 +310,120 @@ define void @load_i32_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-LABEL: load_i32_stride3_vf4: ; AVX512: # %bb.0: ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,3,6,9] -; AVX512-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,4,7,10] -; AVX512-NEXT: vpermi2d %ymm2, %ymm1, %ymm3 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,5,8,11] -; AVX512-NEXT: vpermi2d %ymm2, %ymm1, %ymm4 -; AVX512-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512-NEXT: vmovdqa %xmm3, (%rdx) -; AVX512-NEXT: vmovdqa %xmm4, (%rcx) +; AVX512-NEXT: vmovaps (%rdi), %zmm1 +; AVX512-NEXT: vpermps %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,4,7,10] +; AVX512-NEXT: vpermps %zmm1, %zmm2, %zmm2 +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,5,8,11] +; AVX512-NEXT: vpermps %zmm1, %zmm3, %zmm1 +; AVX512-NEXT: vmovaps %xmm0, (%rsi) +; AVX512-NEXT: vmovaps %xmm2, (%rdx) +; AVX512-NEXT: vmovaps %xmm1, (%rcx) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: load_i32_stride3_vf4: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,3,6,9] -; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,4,7,10] -; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm3 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,5,8,11] -; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm4 -; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512-FCP-NEXT: vmovdqa %xmm3, (%rdx) -; AVX512-FCP-NEXT: vmovdqa %xmm4, (%rcx) +; AVX512-FCP-NEXT: vmovaps (%rdi), %zmm1 +; AVX512-FCP-NEXT: vpermps %zmm1, %zmm0, %zmm0 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,4,7,10] +; AVX512-FCP-NEXT: vpermps %zmm1, %zmm2, %zmm2 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,5,8,11] +; AVX512-FCP-NEXT: vpermps %zmm1, %zmm3, %zmm1 +; AVX512-FCP-NEXT: vmovaps %xmm0, (%rsi) +; AVX512-FCP-NEXT: vmovaps %xmm2, (%rdx) +; AVX512-FCP-NEXT: vmovaps %xmm1, (%rcx) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: load_i32_stride3_vf4: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,3,6,9] -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512DQ-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,4,7,10] -; AVX512DQ-NEXT: vpermi2d %ymm2, %ymm1, %ymm3 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,5,8,11] -; AVX512DQ-NEXT: vpermi2d %ymm2, %ymm1, %ymm4 -; AVX512DQ-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512DQ-NEXT: vmovdqa %xmm3, (%rdx) -; AVX512DQ-NEXT: vmovdqa %xmm4, (%rcx) +; AVX512DQ-NEXT: vmovaps (%rdi), %zmm1 +; AVX512DQ-NEXT: vpermps %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,4,7,10] +; AVX512DQ-NEXT: vpermps %zmm1, %zmm2, %zmm2 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,5,8,11] +; AVX512DQ-NEXT: vpermps %zmm1, %zmm3, %zmm1 +; AVX512DQ-NEXT: vmovaps %xmm0, (%rsi) +; AVX512DQ-NEXT: vmovaps %xmm2, (%rdx) +; AVX512DQ-NEXT: vmovaps %xmm1, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: load_i32_stride3_vf4: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,3,6,9] -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 -; AVX512DQ-FCP-NEXT: vpmovsxbd 
{{.*#+}} xmm3 = [1,4,7,10] -; AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm3 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,5,8,11] -; AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, (%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, (%rcx) +; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %zmm1 +; AVX512DQ-FCP-NEXT: vpermps %zmm1, %zmm0, %zmm0 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,4,7,10] +; AVX512DQ-FCP-NEXT: vpermps %zmm1, %zmm2, %zmm2 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,5,8,11] +; AVX512DQ-FCP-NEXT: vpermps %zmm1, %zmm3, %zmm1 +; AVX512DQ-FCP-NEXT: vmovaps %xmm0, (%rsi) +; AVX512DQ-FCP-NEXT: vmovaps %xmm2, (%rdx) +; AVX512DQ-FCP-NEXT: vmovaps %xmm1, (%rcx) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: load_i32_stride3_vf4: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,3,6,9] -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512BW-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,4,7,10] -; AVX512BW-NEXT: vpermi2d %ymm2, %ymm1, %ymm3 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,5,8,11] -; AVX512BW-NEXT: vpermi2d %ymm2, %ymm1, %ymm4 -; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512BW-NEXT: vmovdqa %xmm3, (%rdx) -; AVX512BW-NEXT: vmovdqa %xmm4, (%rcx) +; AVX512BW-NEXT: vmovaps (%rdi), %zmm1 +; AVX512BW-NEXT: vpermps %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,4,7,10] +; AVX512BW-NEXT: vpermps %zmm1, %zmm2, %zmm2 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,5,8,11] +; AVX512BW-NEXT: vpermps %zmm1, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovaps %xmm0, (%rsi) +; AVX512BW-NEXT: vmovaps %xmm2, (%rdx) +; AVX512BW-NEXT: vmovaps %xmm1, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: load_i32_stride3_vf4: ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,3,6,9] -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512BW-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,4,7,10] -; AVX512BW-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm3 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,5,8,11] -; AVX512BW-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm4 -; AVX512BW-FCP-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512BW-FCP-NEXT: vmovdqa %xmm3, (%rdx) -; AVX512BW-FCP-NEXT: vmovdqa %xmm4, (%rcx) +; AVX512BW-FCP-NEXT: vmovaps (%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vpermps %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,4,7,10] +; AVX512BW-FCP-NEXT: vpermps %zmm1, %zmm2, %zmm2 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,5,8,11] +; AVX512BW-FCP-NEXT: vpermps %zmm1, %zmm3, %zmm1 +; AVX512BW-FCP-NEXT: vmovaps %xmm0, (%rsi) +; AVX512BW-FCP-NEXT: vmovaps %xmm2, (%rdx) +; AVX512BW-FCP-NEXT: vmovaps %xmm1, (%rcx) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i32_stride3_vf4: ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,3,6,9] -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512DQ-BW-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,4,7,10] -; AVX512DQ-BW-NEXT: vpermi2d %ymm2, %ymm1, %ymm3 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,5,8,11] -; AVX512DQ-BW-NEXT: vpermi2d %ymm2, %ymm1, %ymm4 -; AVX512DQ-BW-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512DQ-BW-NEXT: vmovdqa %xmm3, (%rdx) -; 
AVX512DQ-BW-NEXT: vmovdqa %xmm4, (%rcx) +; AVX512DQ-BW-NEXT: vmovaps (%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vpermps %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,4,7,10] +; AVX512DQ-BW-NEXT: vpermps %zmm1, %zmm2, %zmm2 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,5,8,11] +; AVX512DQ-BW-NEXT: vpermps %zmm1, %zmm3, %zmm1 +; AVX512DQ-BW-NEXT: vmovaps %xmm0, (%rsi) +; AVX512DQ-BW-NEXT: vmovaps %xmm2, (%rdx) +; AVX512DQ-BW-NEXT: vmovaps %xmm1, (%rcx) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: load_i32_stride3_vf4: ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,3,6,9] -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,4,7,10] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,5,8,11] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm4, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermps %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,4,7,10] +; AVX512DQ-BW-FCP-NEXT: vpermps %zmm1, %zmm2, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,5,8,11] +; AVX512DQ-BW-FCP-NEXT: vpermps %zmm1, %zmm3, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovaps %xmm0, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovaps %xmm2, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovaps %xmm1, (%rcx) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <12 x i32>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll index 61f91b2bb0c0c..0bf1260738439 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll @@ -106,13 +106,14 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,0,0] -; AVX512-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,1,1] +; AVX512-FCP-NEXT: vpermps (%rdi), %ymm3, %ymm3 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX512-FCP-NEXT: vmovlps %xmm3, (%rdx) ; AVX512-FCP-NEXT: vmovq %xmm0, (%rcx) ; AVX512-FCP-NEXT: vpextrq $1, %xmm0, (%r8) +; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: load_i32_stride4_vf2: @@ -134,13 +135,14 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,0,0] -; AVX512DQ-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,1,1] +; AVX512DQ-FCP-NEXT: vpermps (%rdi), %ymm3, %ymm3 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512DQ-FCP-NEXT: vmovq %xmm2, 
(%rsi) -; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX512DQ-FCP-NEXT: vmovlps %xmm3, (%rdx) ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rcx) ; AVX512DQ-FCP-NEXT: vpextrq $1, %xmm0, (%r8) +; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: load_i32_stride4_vf2: @@ -162,13 +164,14 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,0,0] -; AVX512BW-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,1,1] +; AVX512BW-FCP-NEXT: vpermps (%rdi), %ymm3, %ymm3 ; AVX512BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX512BW-FCP-NEXT: vmovlps %xmm3, (%rdx) ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rcx) ; AVX512BW-FCP-NEXT: vpextrq $1, %xmm0, (%r8) +; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i32_stride4_vf2: @@ -190,13 +193,14 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,1,1] +; AVX512DQ-BW-FCP-NEXT: vpermps (%rdi), %ymm3, %ymm3 ; AVX512DQ-BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm3, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rcx) ; AVX512DQ-BW-FCP-NEXT: vpextrq $1, %xmm0, (%r8) +; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <8 x i32>, ptr %in.vec, align 64 %strided.vec0 = shufflevector <8 x i32> %wide.vec, <8 x i32> poison, <2 x i32> @@ -361,152 +365,144 @@ define void @load_i32_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-LABEL: load_i32_stride4_vf4: ; AVX512: # %bb.0: ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,4,8,12] -; AVX512-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,9,13] -; AVX512-NEXT: vpermi2d %ymm2, %ymm1, %ymm3 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,6,10,14] -; AVX512-NEXT: vpermi2d %ymm2, %ymm1, %ymm4 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,7,11,15] -; AVX512-NEXT: vpermi2d %ymm2, %ymm1, %ymm5 -; AVX512-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512-NEXT: vmovdqa %xmm3, (%rdx) -; AVX512-NEXT: vmovdqa %xmm4, (%rcx) -; AVX512-NEXT: vmovdqa %xmm5, (%r8) +; AVX512-NEXT: vmovaps (%rdi), %zmm1 +; AVX512-NEXT: vpermps %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,5,9,13] +; AVX512-NEXT: vpermps %zmm1, %zmm2, %zmm2 +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,6,10,14] +; AVX512-NEXT: vpermps %zmm1, %zmm3, %zmm3 +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm4 = [3,7,11,15] +; AVX512-NEXT: vpermps %zmm1, %zmm4, %zmm1 +; AVX512-NEXT: vmovaps %xmm0, (%rsi) +; AVX512-NEXT: vmovaps %xmm2, (%rdx) +; AVX512-NEXT: vmovaps %xmm3, (%rcx) +; AVX512-NEXT: vmovaps %xmm1, (%r8) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; 
AVX512-FCP-LABEL: load_i32_stride4_vf4: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,4,8,12] -; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,9,13] -; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm3 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,6,10,14] -; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm4 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,7,11,15] -; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm5 -; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512-FCP-NEXT: vmovdqa %xmm3, (%rdx) -; AVX512-FCP-NEXT: vmovdqa %xmm4, (%rcx) -; AVX512-FCP-NEXT: vmovdqa %xmm5, (%r8) +; AVX512-FCP-NEXT: vmovaps (%rdi), %zmm1 +; AVX512-FCP-NEXT: vpermps %zmm1, %zmm0, %zmm0 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,5,9,13] +; AVX512-FCP-NEXT: vpermps %zmm1, %zmm2, %zmm2 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,6,10,14] +; AVX512-FCP-NEXT: vpermps %zmm1, %zmm3, %zmm3 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [3,7,11,15] +; AVX512-FCP-NEXT: vpermps %zmm1, %zmm4, %zmm1 +; AVX512-FCP-NEXT: vmovaps %xmm0, (%rsi) +; AVX512-FCP-NEXT: vmovaps %xmm2, (%rdx) +; AVX512-FCP-NEXT: vmovaps %xmm3, (%rcx) +; AVX512-FCP-NEXT: vmovaps %xmm1, (%r8) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: load_i32_stride4_vf4: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,4,8,12] -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512DQ-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,9,13] -; AVX512DQ-NEXT: vpermi2d %ymm2, %ymm1, %ymm3 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,6,10,14] -; AVX512DQ-NEXT: vpermi2d %ymm2, %ymm1, %ymm4 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,7,11,15] -; AVX512DQ-NEXT: vpermi2d %ymm2, %ymm1, %ymm5 -; AVX512DQ-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512DQ-NEXT: vmovdqa %xmm3, (%rdx) -; AVX512DQ-NEXT: vmovdqa %xmm4, (%rcx) -; AVX512DQ-NEXT: vmovdqa %xmm5, (%r8) +; AVX512DQ-NEXT: vmovaps (%rdi), %zmm1 +; AVX512DQ-NEXT: vpermps %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,5,9,13] +; AVX512DQ-NEXT: vpermps %zmm1, %zmm2, %zmm2 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,6,10,14] +; AVX512DQ-NEXT: vpermps %zmm1, %zmm3, %zmm3 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm4 = [3,7,11,15] +; AVX512DQ-NEXT: vpermps %zmm1, %zmm4, %zmm1 +; AVX512DQ-NEXT: vmovaps %xmm0, (%rsi) +; AVX512DQ-NEXT: vmovaps %xmm2, (%rdx) +; AVX512DQ-NEXT: vmovaps %xmm3, (%rcx) +; AVX512DQ-NEXT: vmovaps %xmm1, (%r8) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: load_i32_stride4_vf4: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,4,8,12] -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,9,13] -; AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm3 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,6,10,14] -; AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm4 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,7,11,15] -; AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm5 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, (%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, (%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, (%r8) +; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %zmm1 +; AVX512DQ-FCP-NEXT: vpermps %zmm1, %zmm0, %zmm0 
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,5,9,13] +; AVX512DQ-FCP-NEXT: vpermps %zmm1, %zmm2, %zmm2 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,6,10,14] +; AVX512DQ-FCP-NEXT: vpermps %zmm1, %zmm3, %zmm3 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [3,7,11,15] +; AVX512DQ-FCP-NEXT: vpermps %zmm1, %zmm4, %zmm1 +; AVX512DQ-FCP-NEXT: vmovaps %xmm0, (%rsi) +; AVX512DQ-FCP-NEXT: vmovaps %xmm2, (%rdx) +; AVX512DQ-FCP-NEXT: vmovaps %xmm3, (%rcx) +; AVX512DQ-FCP-NEXT: vmovaps %xmm1, (%r8) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: load_i32_stride4_vf4: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,4,8,12] -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512BW-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,9,13] -; AVX512BW-NEXT: vpermi2d %ymm2, %ymm1, %ymm3 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,6,10,14] -; AVX512BW-NEXT: vpermi2d %ymm2, %ymm1, %ymm4 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,7,11,15] -; AVX512BW-NEXT: vpermi2d %ymm2, %ymm1, %ymm5 -; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512BW-NEXT: vmovdqa %xmm3, (%rdx) -; AVX512BW-NEXT: vmovdqa %xmm4, (%rcx) -; AVX512BW-NEXT: vmovdqa %xmm5, (%r8) +; AVX512BW-NEXT: vmovaps (%rdi), %zmm1 +; AVX512BW-NEXT: vpermps %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,5,9,13] +; AVX512BW-NEXT: vpermps %zmm1, %zmm2, %zmm2 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,6,10,14] +; AVX512BW-NEXT: vpermps %zmm1, %zmm3, %zmm3 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [3,7,11,15] +; AVX512BW-NEXT: vpermps %zmm1, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovaps %xmm0, (%rsi) +; AVX512BW-NEXT: vmovaps %xmm2, (%rdx) +; AVX512BW-NEXT: vmovaps %xmm3, (%rcx) +; AVX512BW-NEXT: vmovaps %xmm1, (%r8) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: load_i32_stride4_vf4: ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,4,8,12] -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512BW-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,9,13] -; AVX512BW-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm3 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,6,10,14] -; AVX512BW-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm4 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,7,11,15] -; AVX512BW-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm5 -; AVX512BW-FCP-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512BW-FCP-NEXT: vmovdqa %xmm3, (%rdx) -; AVX512BW-FCP-NEXT: vmovdqa %xmm4, (%rcx) -; AVX512BW-FCP-NEXT: vmovdqa %xmm5, (%r8) +; AVX512BW-FCP-NEXT: vmovaps (%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vpermps %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,5,9,13] +; AVX512BW-FCP-NEXT: vpermps %zmm1, %zmm2, %zmm2 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,6,10,14] +; AVX512BW-FCP-NEXT: vpermps %zmm1, %zmm3, %zmm3 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [3,7,11,15] +; AVX512BW-FCP-NEXT: vpermps %zmm1, %zmm4, %zmm1 +; AVX512BW-FCP-NEXT: vmovaps %xmm0, (%rsi) +; AVX512BW-FCP-NEXT: vmovaps %xmm2, (%rdx) +; AVX512BW-FCP-NEXT: vmovaps %xmm3, (%rcx) +; AVX512BW-FCP-NEXT: vmovaps %xmm1, (%r8) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i32_stride4_vf4: ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,4,8,12] -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512DQ-BW-NEXT: vpermi2d 
%ymm2, %ymm1, %ymm0 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,9,13] -; AVX512DQ-BW-NEXT: vpermi2d %ymm2, %ymm1, %ymm3 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,6,10,14] -; AVX512DQ-BW-NEXT: vpermi2d %ymm2, %ymm1, %ymm4 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,7,11,15] -; AVX512DQ-BW-NEXT: vpermi2d %ymm2, %ymm1, %ymm5 -; AVX512DQ-BW-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512DQ-BW-NEXT: vmovdqa %xmm3, (%rdx) -; AVX512DQ-BW-NEXT: vmovdqa %xmm4, (%rcx) -; AVX512DQ-BW-NEXT: vmovdqa %xmm5, (%r8) +; AVX512DQ-BW-NEXT: vmovaps (%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vpermps %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,5,9,13] +; AVX512DQ-BW-NEXT: vpermps %zmm1, %zmm2, %zmm2 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,6,10,14] +; AVX512DQ-BW-NEXT: vpermps %zmm1, %zmm3, %zmm3 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [3,7,11,15] +; AVX512DQ-BW-NEXT: vpermps %zmm1, %zmm4, %zmm1 +; AVX512DQ-BW-NEXT: vmovaps %xmm0, (%rsi) +; AVX512DQ-BW-NEXT: vmovaps %xmm2, (%rdx) +; AVX512DQ-BW-NEXT: vmovaps %xmm3, (%rcx) +; AVX512DQ-BW-NEXT: vmovaps %xmm1, (%r8) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: load_i32_stride4_vf4: ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,4,8,12] -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,9,13] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,6,10,14] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,7,11,15] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm4, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm5, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermps %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,5,9,13] +; AVX512DQ-BW-FCP-NEXT: vpermps %zmm1, %zmm2, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vpermps %zmm1, %zmm3, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [3,7,11,15] +; AVX512DQ-BW-FCP-NEXT: vpermps %zmm1, %zmm4, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovaps %xmm0, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovaps %xmm2, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovaps %xmm3, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovaps %xmm1, (%r8) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <16 x i32>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll index d8d48b0b8c73d..c08442f9d9d01 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll @@ -144,19 +144,19 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-LABEL: load_i32_stride5_vf2: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,6,1,6] -; AVX512-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm4 -; AVX512-FCP-NEXT: 
vpbroadcastq {{.*#+}} xmm5 = [2,7,2,7] -; AVX512-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm5 -; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11] -; AVX512-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm1 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] -; AVX512-FCP-NEXT: vmovq %xmm3, (%rsi) -; AVX512-FCP-NEXT: vmovq %xmm4, (%rdx) -; AVX512-FCP-NEXT: vmovq %xmm5, (%rcx) +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],mem[1],xmm0[2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,0,0] +; AVX512-FCP-NEXT: vmovaps (%rdi), %ymm4 +; AVX512-FCP-NEXT: vpermps %ymm4, %ymm3, %ymm3 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [2,7,0,0] +; AVX512-FCP-NEXT: vpermps %ymm4, %ymm5, %ymm4 +; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11] +; AVX512-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm5 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3] +; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi) +; AVX512-FCP-NEXT: vmovlps %xmm3, (%rdx) +; AVX512-FCP-NEXT: vmovlps %xmm4, (%rcx) ; AVX512-FCP-NEXT: vmovq %xmm0, (%r8) ; AVX512-FCP-NEXT: vmovq %xmm1, (%r9) ; AVX512-FCP-NEXT: vzeroupper @@ -188,19 +188,19 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-LABEL: load_i32_stride5_vf2: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,6,1,6] -; AVX512DQ-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm4 -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm5 = [2,7,2,7] -; AVX512DQ-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm5 -; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11] -; AVX512DQ-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm1 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] -; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rsi) -; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rdx) -; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],mem[1],xmm0[2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,0,0] +; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %ymm4 +; AVX512DQ-FCP-NEXT: vpermps %ymm4, %ymm3, %ymm3 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [2,7,0,0] +; AVX512DQ-FCP-NEXT: vpermps %ymm4, %ymm5, %ymm4 +; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11] +; AVX512DQ-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm5 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3] +; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi) +; AVX512DQ-FCP-NEXT: vmovlps %xmm3, (%rdx) +; AVX512DQ-FCP-NEXT: vmovlps %xmm4, (%rcx) ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%r8) ; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%r9) ; AVX512DQ-FCP-NEXT: vzeroupper @@ -232,19 +232,19 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-LABEL: load_i32_stride5_vf2: ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,6,1,6] -; AVX512BW-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm4 -; AVX512BW-FCP-NEXT: vpbroadcastq 
{{.*#+}} xmm5 = [2,7,2,7] -; AVX512BW-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm5 -; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11] -; AVX512BW-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm1 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] -; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rsi) -; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rdx) -; AVX512BW-FCP-NEXT: vmovq %xmm5, (%rcx) +; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],mem[1],xmm0[2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,0,0] +; AVX512BW-FCP-NEXT: vmovaps (%rdi), %ymm4 +; AVX512BW-FCP-NEXT: vpermps %ymm4, %ymm3, %ymm3 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [2,7,0,0] +; AVX512BW-FCP-NEXT: vpermps %ymm4, %ymm5, %ymm4 +; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11] +; AVX512BW-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm5 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3] +; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi) +; AVX512BW-FCP-NEXT: vmovlps %xmm3, (%rdx) +; AVX512BW-FCP-NEXT: vmovlps %xmm4, (%rcx) ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r8) ; AVX512BW-FCP-NEXT: vmovq %xmm1, (%r9) ; AVX512BW-FCP-NEXT: vzeroupper @@ -276,19 +276,19 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-LABEL: load_i32_stride5_vf2: ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,6,1,6] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm4 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm5 = [2,7,2,7] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm5 -; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],mem[1],xmm0[2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %ymm4 +; AVX512DQ-BW-FCP-NEXT: vpermps %ymm4, %ymm3, %ymm3 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [2,7,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermps %ymm4, %ymm5, %ymm4 +; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11] +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm5 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3] +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm3, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm4, (%rcx) ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r8) ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%r9) ; AVX512DQ-BW-FCP-NEXT: vzeroupper @@ -491,18 +491,17 @@ define void @load_i32_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,10,15] -; AVX512-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm3 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,6,11,16] -; AVX512-NEXT: vpermi2d 
%zmm1, %zmm0, %zmm2 +; AVX512-NEXT: vpermd %zmm0, %zmm2, %zmm2 +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,11,16] +; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,7,12,17] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,8,13,18] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,9,14,19] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 -; AVX512-NEXT: vmovdqa %xmm3, (%rsi) -; AVX512-NEXT: vmovdqa %xmm2, (%rdx) +; AVX512-NEXT: vmovdqa %xmm2, (%rsi) +; AVX512-NEXT: vmovdqa %xmm3, (%rdx) ; AVX512-NEXT: vmovdqa %xmm4, (%rcx) ; AVX512-NEXT: vmovdqa %xmm5, (%r8) ; AVX512-NEXT: vmovdqa %xmm6, (%r9) @@ -514,18 +513,17 @@ define void @load_i32_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,10,15] -; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512-FCP-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm3 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,6,11,16] -; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; AVX512-FCP-NEXT: vpermd %zmm0, %zmm2, %zmm2 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,11,16] +; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,7,12,17] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,8,13,18] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,9,14,19] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 -; AVX512-FCP-NEXT: vmovdqa %xmm3, (%rsi) -; AVX512-FCP-NEXT: vmovdqa %xmm2, (%rdx) +; AVX512-FCP-NEXT: vmovdqa %xmm2, (%rsi) +; AVX512-FCP-NEXT: vmovdqa %xmm3, (%rdx) ; AVX512-FCP-NEXT: vmovdqa %xmm4, (%rcx) ; AVX512-FCP-NEXT: vmovdqa %xmm5, (%r8) ; AVX512-FCP-NEXT: vmovdqa %xmm6, (%r9) @@ -537,18 +535,17 @@ define void @load_i32_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,10,15] -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512DQ-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm3 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,6,11,16] -; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm2 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,11,16] +; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,7,12,17] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,8,13,18] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,9,14,19] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 -; AVX512DQ-NEXT: vmovdqa %xmm3, (%rsi) -; AVX512DQ-NEXT: vmovdqa %xmm2, (%rdx) +; AVX512DQ-NEXT: vmovdqa %xmm2, (%rsi) +; AVX512DQ-NEXT: vmovdqa %xmm3, (%rdx) ; AVX512DQ-NEXT: vmovdqa %xmm4, (%rcx) ; AVX512DQ-NEXT: vmovdqa %xmm5, (%r8) ; AVX512DQ-NEXT: vmovdqa %xmm6, (%r9) @@ -560,18 +557,17 @@ define void @load_i32_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,10,15] -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512DQ-FCP-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm3 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,6,11,16] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; 
AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm2, %zmm2 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,11,16] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,7,12,17] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,8,13,18] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,9,14,19] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, (%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, (%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, (%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, (%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, (%rcx) ; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, (%r8) ; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, (%r9) @@ -583,18 +579,17 @@ define void @load_i32_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,10,15] -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512BW-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm3 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,6,11,16] -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpermd %zmm0, %zmm2, %zmm2 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,11,16] +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,7,12,17] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,8,13,18] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,9,14,19] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 -; AVX512BW-NEXT: vmovdqa %xmm3, (%rsi) -; AVX512BW-NEXT: vmovdqa %xmm2, (%rdx) +; AVX512BW-NEXT: vmovdqa %xmm2, (%rsi) +; AVX512BW-NEXT: vmovdqa %xmm3, (%rdx) ; AVX512BW-NEXT: vmovdqa %xmm4, (%rcx) ; AVX512BW-NEXT: vmovdqa %xmm5, (%r8) ; AVX512BW-NEXT: vmovdqa %xmm6, (%r9) @@ -606,18 +601,17 @@ define void @load_i32_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,10,15] -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512BW-FCP-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm3 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,6,11,16] -; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vpermd %zmm0, %zmm2, %zmm2 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,11,16] +; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,7,12,17] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,8,13,18] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,9,14,19] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa %xmm3, (%rsi) -; AVX512BW-FCP-NEXT: vmovdqa %xmm2, (%rdx) +; AVX512BW-FCP-NEXT: vmovdqa %xmm2, (%rsi) +; AVX512BW-FCP-NEXT: vmovdqa %xmm3, (%rdx) ; AVX512BW-FCP-NEXT: vmovdqa %xmm4, (%rcx) ; AVX512BW-FCP-NEXT: vmovdqa %xmm5, (%r8) ; AVX512BW-FCP-NEXT: vmovdqa %xmm6, (%r9) @@ -629,18 +623,17 @@ define void @load_i32_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,10,15] -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512DQ-BW-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm3 -; 
AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,6,11,16] -; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vpermd %zmm0, %zmm2, %zmm2 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,11,16] +; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,7,12,17] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,8,13,18] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,9,14,19] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa %xmm3, (%rsi) -; AVX512DQ-BW-NEXT: vmovdqa %xmm2, (%rdx) +; AVX512DQ-BW-NEXT: vmovdqa %xmm2, (%rsi) +; AVX512DQ-BW-NEXT: vmovdqa %xmm3, (%rdx) ; AVX512DQ-BW-NEXT: vmovdqa %xmm4, (%rcx) ; AVX512DQ-BW-NEXT: vmovdqa %xmm5, (%r8) ; AVX512DQ-BW-NEXT: vmovdqa %xmm6, (%r9) @@ -652,18 +645,17 @@ define void @load_i32_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,10,15] -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,6,11,16] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermd %zmm0, %zmm2, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,11,16] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,7,12,17] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,8,13,18] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,9,14,19] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm4, (%rcx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm5, (%r8) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm6, (%r9) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll index 3ba41ad07ce83..ae3e5445bf266 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll @@ -192,29 +192,28 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-LABEL: load_i32_stride6_vf2: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,6,0,6] -; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm3 -; AVX512-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm0 -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,7,1,7] -; AVX512-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm4 -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm2 = [2,4,2,4] -; AVX512-FCP-NEXT: vpermi2d %xmm3, %xmm1, %xmm2 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,5,0,0] -; AVX512-FCP-NEXT: vpermi2d %xmm3, %xmm1, %xmm5 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,2,0,0] -; AVX512-FCP-NEXT: vmovaps 32(%rdi), %ymm3 -; AVX512-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vpermps %ymm3, %ymm1, %ymm1 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} 
xmm6 = [5,3,0,0] -; AVX512-FCP-NEXT: vpermps %ymm3, %ymm6, %ymm3 -; AVX512-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX512-FCP-NEXT: vmovq %xmm4, (%rdx) -; AVX512-FCP-NEXT: vmovq %xmm2, (%rcx) -; AVX512-FCP-NEXT: vmovq %xmm5, (%r8) -; AVX512-FCP-NEXT: vmovlps %xmm1, (%r9) -; AVX512-FCP-NEXT: vmovlps %xmm3, (%rax) +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,6,0,0] +; AVX512-FCP-NEXT: vmovaps (%rdi), %ymm1 +; AVX512-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,7,0,0] +; AVX512-FCP-NEXT: vpermps %ymm1, %ymm2, %ymm2 +; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [2,4,2,4] +; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm4 +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm5 +; AVX512-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm3 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [3,5,0,0] +; AVX512-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm6 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,2,0,0] +; AVX512-FCP-NEXT: vblendps {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm4 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [5,3,0,0] +; AVX512-FCP-NEXT: vpermps %ymm1, %ymm5, %ymm1 +; AVX512-FCP-NEXT: vmovlps %xmm0, (%rsi) +; AVX512-FCP-NEXT: vmovlps %xmm2, (%rdx) +; AVX512-FCP-NEXT: vmovq %xmm3, (%rcx) +; AVX512-FCP-NEXT: vmovq %xmm6, (%r8) +; AVX512-FCP-NEXT: vmovlps %xmm4, (%r9) +; AVX512-FCP-NEXT: vmovlps %xmm1, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -252,29 +251,28 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-LABEL: load_i32_stride6_vf2: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,6,0,6] -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm3 -; AVX512DQ-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm0 -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,7,1,7] -; AVX512DQ-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm4 -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm2 = [2,4,2,4] -; AVX512DQ-FCP-NEXT: vpermi2d %xmm3, %xmm1, %xmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,5,0,0] -; AVX512DQ-FCP-NEXT: vpermi2d %xmm3, %xmm1, %xmm5 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,2,0,0] -; AVX512DQ-FCP-NEXT: vmovaps 32(%rdi), %ymm3 -; AVX512DQ-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermps %ymm3, %ymm1, %ymm1 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,3,0,0] -; AVX512DQ-FCP-NEXT: vpermps %ymm3, %ymm6, %ymm3 -; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rdx) -; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rcx) -; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%r8) -; AVX512DQ-FCP-NEXT: vmovlps %xmm1, (%r9) -; AVX512DQ-FCP-NEXT: vmovlps %xmm3, (%rax) +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,6,0,0] +; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %ymm1 +; AVX512DQ-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,7,0,0] +; AVX512DQ-FCP-NEXT: vpermps %ymm1, %ymm2, %ymm2 +; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [2,4,2,4] +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm4 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm5 +; AVX512DQ-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm3 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [3,5,0,0] +; AVX512DQ-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm6 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,2,0,0] +; AVX512DQ-FCP-NEXT: vblendps {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FCP-NEXT: 
vpermps %ymm1, %ymm4, %ymm4 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [5,3,0,0] +; AVX512DQ-FCP-NEXT: vpermps %ymm1, %ymm5, %ymm1 +; AVX512DQ-FCP-NEXT: vmovlps %xmm0, (%rsi) +; AVX512DQ-FCP-NEXT: vmovlps %xmm2, (%rdx) +; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rcx) +; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%r8) +; AVX512DQ-FCP-NEXT: vmovlps %xmm4, (%r9) +; AVX512DQ-FCP-NEXT: vmovlps %xmm1, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -312,29 +310,28 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-LABEL: load_i32_stride6_vf2: ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,6,0,6] -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm3 -; AVX512BW-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm0 -; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,7,1,7] -; AVX512BW-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm4 -; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm2 = [2,4,2,4] -; AVX512BW-FCP-NEXT: vpermi2d %xmm3, %xmm1, %xmm2 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,5,0,0] -; AVX512BW-FCP-NEXT: vpermi2d %xmm3, %xmm1, %xmm5 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,2,0,0] -; AVX512BW-FCP-NEXT: vmovaps 32(%rdi), %ymm3 -; AVX512BW-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vpermps %ymm3, %ymm1, %ymm1 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,3,0,0] -; AVX512BW-FCP-NEXT: vpermps %ymm3, %ymm6, %ymm3 -; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rdx) -; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rcx) -; AVX512BW-FCP-NEXT: vmovq %xmm5, (%r8) -; AVX512BW-FCP-NEXT: vmovlps %xmm1, (%r9) -; AVX512BW-FCP-NEXT: vmovlps %xmm3, (%rax) +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,6,0,0] +; AVX512BW-FCP-NEXT: vmovaps (%rdi), %ymm1 +; AVX512BW-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm0 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,7,0,0] +; AVX512BW-FCP-NEXT: vpermps %ymm1, %ymm2, %ymm2 +; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [2,4,2,4] +; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm4 +; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm5 +; AVX512BW-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm3 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [3,5,0,0] +; AVX512BW-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm6 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,2,0,0] +; AVX512BW-FCP-NEXT: vblendps {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm4 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [5,3,0,0] +; AVX512BW-FCP-NEXT: vpermps %ymm1, %ymm5, %ymm1 +; AVX512BW-FCP-NEXT: vmovlps %xmm0, (%rsi) +; AVX512BW-FCP-NEXT: vmovlps %xmm2, (%rdx) +; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rcx) +; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r8) +; AVX512BW-FCP-NEXT: vmovlps %xmm4, (%r9) +; AVX512BW-FCP-NEXT: vmovlps %xmm1, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -372,29 +369,28 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-LABEL: load_i32_stride6_vf2: ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,6,0,6] -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm3 -; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm0 -; AVX512DQ-BW-FCP-NEXT: 
vpbroadcastq {{.*#+}} xmm4 = [1,7,1,7] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm4 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm2 = [2,4,2,4] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm3, %xmm1, %xmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,5,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm3, %xmm1, %xmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,2,0,0] -; AVX512DQ-BW-FCP-NEXT: vmovaps 32(%rdi), %ymm3 -; AVX512DQ-BW-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermps %ymm3, %ymm1, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,3,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermps %ymm3, %ymm6, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm1, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm3, (%rax) +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,6,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %ymm1 +; AVX512DQ-BW-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,7,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermps %ymm1, %ymm2, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [2,4,2,4] +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm5 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [3,5,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm6 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,2,0,0] +; AVX512DQ-BW-FCP-NEXT: vblendps {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm4 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [5,3,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermps %ymm1, %ymm5, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm0, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm2, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm4, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm1, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <12 x i32>, ptr %in.vec, align 64 @@ -1291,352 +1287,360 @@ define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-LABEL: load_i32_stride6_vf8: ; AVX512: # %bb.0: ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512-NEXT: vmovdqa 128(%rdi), %ymm0 ; AVX512-NEXT: vmovdqa 160(%rdi), %ymm1 -; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,6,12,18,24,30,0,0] -; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,5,12,10] -; AVX512-NEXT: vpermi2d %ymm4, %ymm5, %ymm6 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,7,13,19,25,31,0,0] -; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,13,11] -; AVX512-NEXT: vpermi2d %ymm4, %ymm5, %ymm7 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,0,6,12] -; AVX512-NEXT: vpermi2d %ymm1, %ymm0, %ymm4 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm5 = [2,8,14,20,26,0,0,0] -; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,1,7,13] -; AVX512-NEXT: vpermi2d %ymm1, %ymm0, %ymm5 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm8 = 
[3,9,15,21,27,0,0,0] -; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 -; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4],ymm5[5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0] +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm5 +; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm6 +; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10] +; AVX512-NEXT: vpermi2d %ymm2, %ymm3, %ymm7 +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,7,13,19,25,31,0,0] +; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11] +; AVX512-NEXT: vpermi2d %ymm2, %ymm3, %ymm8 +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12] +; AVX512-NEXT: vpermd %zmm6, %zmm2, %zmm2 +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,8,14,20,26,0,0,0] +; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13] +; AVX512-NEXT: vpermd %zmm6, %zmm3, %zmm3 +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm6 = [3,9,15,21,27,0,0,0] +; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5,6,7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0] -; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,10,8,14] -; AVX512-NEXT: vpermi2d %ymm0, %ymm1, %ymm8 +; AVX512-NEXT: vpermi2d %zmm4, %zmm5, %zmm1 +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14] +; AVX512-NEXT: vpermi2d %ymm0, %ymm1, %ymm6 ; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0] -; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,1,2,3,4,11,9,15] -; AVX512-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 -; AVX512-NEXT: vmovdqa %ymm6, (%rsi) -; AVX512-NEXT: vmovdqa %ymm7, (%rdx) -; AVX512-NEXT: vmovdqa %ymm4, (%rcx) -; AVX512-NEXT: vmovdqa %ymm5, (%r8) -; AVX512-NEXT: vmovdqa %ymm8, (%r9) -; AVX512-NEXT: vmovdqa %ymm2, (%rax) +; AVX512-NEXT: vpermi2d %zmm4, %zmm5, %zmm1 +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15] +; AVX512-NEXT: vpermi2d %ymm0, %ymm1, %ymm4 +; AVX512-NEXT: vmovdqa %ymm7, (%rsi) +; AVX512-NEXT: vmovdqa %ymm8, (%rdx) +; AVX512-NEXT: vmovdqa %ymm2, (%rcx) +; AVX512-NEXT: vmovdqa %ymm3, (%r8) +; AVX512-NEXT: vmovdqa %ymm6, (%r9) +; AVX512-NEXT: vmovdqa %ymm4, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: load_i32_stride6_vf8: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm0 ; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm1 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,6,12,18,24,30,0,0] -; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,5,12,10] -; AVX512-FCP-NEXT: vpermi2d %ymm4, %ymm5, %ymm6 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,7,13,19,25,31,0,0] -; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,13,11] -; AVX512-FCP-NEXT: vpermi2d %ymm4, %ymm5, %ymm7 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,0,6,12] -; AVX512-FCP-NEXT: vpermi2d 
%ymm1, %ymm0, %ymm4 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [2,8,14,20,26,0,0,0] -; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,1,7,13] -; AVX512-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm5 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0] -; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4],ymm5[5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0] +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm6 +; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10] +; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm3, %ymm7 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,7,13,19,25,31,0,0] +; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11] +; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm3, %ymm8 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12] +; AVX512-FCP-NEXT: vpermd %zmm6, %zmm2, %zmm2 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,8,14,20,26,0,0,0] +; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13] +; AVX512-FCP-NEXT: vpermd %zmm6, %zmm3, %zmm3 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [3,9,15,21,27,0,0,0] +; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5,6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0] -; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,10,8,14] -; AVX512-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm8 +; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm1 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14] +; AVX512-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm6 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0] -; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,1,2,3,4,11,9,15] -; AVX512-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 -; AVX512-FCP-NEXT: vmovdqa %ymm6, (%rsi) -; AVX512-FCP-NEXT: vmovdqa %ymm7, (%rdx) -; AVX512-FCP-NEXT: vmovdqa %ymm4, (%rcx) -; AVX512-FCP-NEXT: vmovdqa %ymm5, (%r8) -; AVX512-FCP-NEXT: vmovdqa %ymm8, (%r9) -; AVX512-FCP-NEXT: vmovdqa %ymm2, (%rax) +; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm1 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15] +; AVX512-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm4 +; AVX512-FCP-NEXT: vmovdqa %ymm7, (%rsi) +; AVX512-FCP-NEXT: vmovdqa %ymm8, (%rdx) +; AVX512-FCP-NEXT: vmovdqa %ymm2, (%rcx) +; AVX512-FCP-NEXT: vmovdqa %ymm3, (%r8) +; AVX512-FCP-NEXT: vmovdqa %ymm6, (%r9) +; AVX512-FCP-NEXT: vmovdqa %ymm4, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: load_i32_stride6_vf8: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm0 ; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm1 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = 
ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,6,12,18,24,30,0,0] -; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,5,12,10] -; AVX512DQ-NEXT: vpermi2d %ymm4, %ymm5, %ymm6 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,7,13,19,25,31,0,0] -; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,13,11] -; AVX512DQ-NEXT: vpermi2d %ymm4, %ymm5, %ymm7 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,0,6,12] -; AVX512DQ-NEXT: vpermi2d %ymm1, %ymm0, %ymm4 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm5 = [2,8,14,20,26,0,0,0] -; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,1,7,13] -; AVX512DQ-NEXT: vpermi2d %ymm1, %ymm0, %ymm5 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0] -; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4],ymm5[5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0] +; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm5 +; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm6 +; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10] +; AVX512DQ-NEXT: vpermi2d %ymm2, %ymm3, %ymm7 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,7,13,19,25,31,0,0] +; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11] +; AVX512DQ-NEXT: vpermi2d %ymm2, %ymm3, %ymm8 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12] +; AVX512DQ-NEXT: vpermd %zmm6, %zmm2, %zmm2 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,8,14,20,26,0,0,0] +; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13] +; AVX512DQ-NEXT: vpermd %zmm6, %zmm3, %zmm3 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm6 = [3,9,15,21,27,0,0,0] +; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5,6,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0] -; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,10,8,14] -; AVX512DQ-NEXT: vpermi2d %ymm0, %ymm1, %ymm8 +; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm5, %zmm1 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14] +; AVX512DQ-NEXT: vpermi2d %ymm0, %ymm1, %ymm6 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0] -; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,1,2,3,4,11,9,15] -; AVX512DQ-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 -; AVX512DQ-NEXT: vmovdqa %ymm6, (%rsi) -; AVX512DQ-NEXT: vmovdqa %ymm7, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm4, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm5, (%r8) -; AVX512DQ-NEXT: vmovdqa %ymm8, (%r9) -; AVX512DQ-NEXT: vmovdqa %ymm2, (%rax) +; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm5, %zmm1 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15] +; AVX512DQ-NEXT: vpermi2d %ymm0, %ymm1, %ymm4 +; AVX512DQ-NEXT: vmovdqa %ymm7, (%rsi) +; AVX512DQ-NEXT: vmovdqa %ymm8, (%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm2, (%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm3, (%r8) +; AVX512DQ-NEXT: vmovdqa 
%ymm6, (%r9) +; AVX512DQ-NEXT: vmovdqa %ymm4, (%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: load_i32_stride6_vf8: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm1 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,6,12,18,24,30,0,0] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,5,12,10] -; AVX512DQ-FCP-NEXT: vpermi2d %ymm4, %ymm5, %ymm6 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,7,13,19,25,31,0,0] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,13,11] -; AVX512DQ-FCP-NEXT: vpermi2d %ymm4, %ymm5, %ymm7 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,0,6,12] -; AVX512DQ-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm4 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [2,8,14,20,26,0,0,0] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,1,7,13] -; AVX512DQ-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm5 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4],ymm5[5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm6 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10] +; AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm3, %ymm7 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,7,13,19,25,31,0,0] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11] +; AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm3, %ymm8 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12] +; AVX512DQ-FCP-NEXT: vpermd %zmm6, %zmm2, %zmm2 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,8,14,20,26,0,0,0] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13] +; AVX512DQ-FCP-NEXT: vpermd %zmm6, %zmm3, %zmm3 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [3,9,15,21,27,0,0,0] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5,6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,10,8,14] -; AVX512DQ-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm8 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm1 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14] +; AVX512DQ-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm6 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512DQ-FCP-NEXT: vpmovsxbd 
{{.*#+}} ymm2 = [0,1,2,3,4,11,9,15] -; AVX512DQ-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, (%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, (%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, (%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, (%r8) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, (%r9) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, (%rax) +; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm1 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15] +; AVX512DQ-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm4 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, (%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, (%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, (%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, (%r8) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, (%r9) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: load_i32_stride6_vf8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm0 ; AVX512BW-NEXT: vmovdqa 160(%rdi), %ymm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,6,12,18,24,30,0,0] -; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,5,12,10] -; AVX512BW-NEXT: vpermi2d %ymm4, %ymm5, %ymm6 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,7,13,19,25,31,0,0] -; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,13,11] -; AVX512BW-NEXT: vpermi2d %ymm4, %ymm5, %ymm7 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,0,6,12] -; AVX512BW-NEXT: vpermi2d %ymm1, %ymm0, %ymm4 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [2,8,14,20,26,0,0,0] -; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,1,7,13] -; AVX512BW-NEXT: vpermi2d %ymm1, %ymm0, %ymm5 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0] -; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4],ymm5[5,6,7] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0] +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm6 +; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10] +; AVX512BW-NEXT: vpermi2d %ymm2, %ymm3, %ymm7 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,7,13,19,25,31,0,0] +; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11] +; AVX512BW-NEXT: vpermi2d %ymm2, %ymm3, %ymm8 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12] +; AVX512BW-NEXT: vpermd %zmm6, %zmm2, %zmm2 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,8,14,20,26,0,0,0] +; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13] +; AVX512BW-NEXT: vpermd %zmm6, %zmm3, %zmm3 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [3,9,15,21,27,0,0,0] +; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5,6,7] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX512BW-NEXT: 
vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0] -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,10,8,14] -; AVX512BW-NEXT: vpermi2d %ymm0, %ymm1, %ymm8 +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm1 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14] +; AVX512BW-NEXT: vpermi2d %ymm0, %ymm1, %ymm6 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0] -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,1,2,3,4,11,9,15] -; AVX512BW-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 -; AVX512BW-NEXT: vmovdqa %ymm6, (%rsi) -; AVX512BW-NEXT: vmovdqa %ymm7, (%rdx) -; AVX512BW-NEXT: vmovdqa %ymm4, (%rcx) -; AVX512BW-NEXT: vmovdqa %ymm5, (%r8) -; AVX512BW-NEXT: vmovdqa %ymm8, (%r9) -; AVX512BW-NEXT: vmovdqa %ymm2, (%rax) +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm1 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15] +; AVX512BW-NEXT: vpermi2d %ymm0, %ymm1, %ymm4 +; AVX512BW-NEXT: vmovdqa %ymm7, (%rsi) +; AVX512BW-NEXT: vmovdqa %ymm8, (%rdx) +; AVX512BW-NEXT: vmovdqa %ymm2, (%rcx) +; AVX512BW-NEXT: vmovdqa %ymm3, (%r8) +; AVX512BW-NEXT: vmovdqa %ymm6, (%r9) +; AVX512BW-NEXT: vmovdqa %ymm4, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: load_i32_stride6_vf8: ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm0 ; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm1 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,6,12,18,24,30,0,0] -; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,5,12,10] -; AVX512BW-FCP-NEXT: vpermi2d %ymm4, %ymm5, %ymm6 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,7,13,19,25,31,0,0] -; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,13,11] -; AVX512BW-FCP-NEXT: vpermi2d %ymm4, %ymm5, %ymm7 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,0,6,12] -; AVX512BW-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm4 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [2,8,14,20,26,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,1,7,13] -; AVX512BW-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm5 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4],ymm5[5,6,7] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm6 +; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10] +; AVX512BW-FCP-NEXT: vpermi2d %ymm2, %ymm3, %ymm7 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,7,13,19,25,31,0,0] +; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11] +; AVX512BW-FCP-NEXT: vpermi2d %ymm2, %ymm3, %ymm8 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12] +; AVX512BW-FCP-NEXT: vpermd 
%zmm6, %zmm2, %zmm2 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,8,14,20,26,0,0,0] +; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13] +; AVX512BW-FCP-NEXT: vpermd %zmm6, %zmm3, %zmm3 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [3,9,15,21,27,0,0,0] +; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5,6,7] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,10,8,14] -; AVX512BW-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm8 +; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm1 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14] +; AVX512BW-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm6 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,1,2,3,4,11,9,15] -; AVX512BW-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 -; AVX512BW-FCP-NEXT: vmovdqa %ymm6, (%rsi) -; AVX512BW-FCP-NEXT: vmovdqa %ymm7, (%rdx) -; AVX512BW-FCP-NEXT: vmovdqa %ymm4, (%rcx) -; AVX512BW-FCP-NEXT: vmovdqa %ymm5, (%r8) -; AVX512BW-FCP-NEXT: vmovdqa %ymm8, (%r9) -; AVX512BW-FCP-NEXT: vmovdqa %ymm2, (%rax) +; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm1 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15] +; AVX512BW-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm4 +; AVX512BW-FCP-NEXT: vmovdqa %ymm7, (%rsi) +; AVX512BW-FCP-NEXT: vmovdqa %ymm8, (%rdx) +; AVX512BW-FCP-NEXT: vmovdqa %ymm2, (%rcx) +; AVX512BW-FCP-NEXT: vmovdqa %ymm3, (%r8) +; AVX512BW-FCP-NEXT: vmovdqa %ymm6, (%r9) +; AVX512BW-FCP-NEXT: vmovdqa %ymm4, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i32_stride6_vf8: ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm0 ; AVX512DQ-BW-NEXT: vmovdqa 160(%rdi), %ymm1 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,6,12,18,24,30,0,0] -; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,5,12,10] -; AVX512DQ-BW-NEXT: vpermi2d %ymm4, %ymm5, %ymm6 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,7,13,19,25,31,0,0] -; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,13,11] -; AVX512DQ-BW-NEXT: vpermi2d %ymm4, %ymm5, %ymm7 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,0,6,12] -; AVX512DQ-BW-NEXT: vpermi2d %ymm1, %ymm0, %ymm4 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [2,8,14,20,26,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,1,7,13] -; AVX512DQ-BW-NEXT: vpermi2d %ymm1, %ymm0, %ymm5 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4],ymm5[5,6,7] +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; 
AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm6 +; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10] +; AVX512DQ-BW-NEXT: vpermi2d %ymm2, %ymm3, %ymm7 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,7,13,19,25,31,0,0] +; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11] +; AVX512DQ-BW-NEXT: vpermi2d %ymm2, %ymm3, %ymm8 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12] +; AVX512DQ-BW-NEXT: vpermd %zmm6, %zmm2, %zmm2 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,8,14,20,26,0,0,0] +; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13] +; AVX512DQ-BW-NEXT: vpermd %zmm6, %zmm3, %zmm3 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [3,9,15,21,27,0,0,0] +; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5,6,7] ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,10,8,14] -; AVX512DQ-BW-NEXT: vpermi2d %ymm0, %ymm1, %ymm8 +; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm1 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14] +; AVX512DQ-BW-NEXT: vpermi2d %ymm0, %ymm1, %ymm6 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,1,2,3,4,11,9,15] -; AVX512DQ-BW-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 -; AVX512DQ-BW-NEXT: vmovdqa %ymm6, (%rsi) -; AVX512DQ-BW-NEXT: vmovdqa %ymm7, (%rdx) -; AVX512DQ-BW-NEXT: vmovdqa %ymm4, (%rcx) -; AVX512DQ-BW-NEXT: vmovdqa %ymm5, (%r8) -; AVX512DQ-BW-NEXT: vmovdqa %ymm8, (%r9) -; AVX512DQ-BW-NEXT: vmovdqa %ymm2, (%rax) +; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm1 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15] +; AVX512DQ-BW-NEXT: vpermi2d %ymm0, %ymm1, %ymm4 +; AVX512DQ-BW-NEXT: vmovdqa %ymm7, (%rsi) +; AVX512DQ-BW-NEXT: vmovdqa %ymm8, (%rdx) +; AVX512DQ-BW-NEXT: vmovdqa %ymm2, (%rcx) +; AVX512DQ-BW-NEXT: vmovdqa %ymm3, (%r8) +; AVX512DQ-BW-NEXT: vmovdqa %ymm6, (%r9) +; AVX512DQ-BW-NEXT: vmovdqa %ymm4, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: load_i32_stride6_vf8: ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,6,12,18,24,30,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,5,12,10] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm4, %ymm5, %ymm6 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,7,13,19,25,31,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,13,11] -; AVX512DQ-BW-FCP-NEXT: 
vpermi2d %ymm4, %ymm5, %ymm7 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,0,6,12] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [2,8,14,20,26,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,1,7,13] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4],ymm5[5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm2, %ymm3, %ymm7 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,7,13,19,25,31,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm2, %ymm3, %ymm8 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12] +; AVX512DQ-BW-FCP-NEXT: vpermd %zmm6, %zmm2, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,8,14,20,26,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13] +; AVX512DQ-BW-FCP-NEXT: vpermd %zmm6, %zmm3, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [3,9,15,21,27,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,10,8,14] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm8 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm6 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,1,2,3,4,11,9,15] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm6, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm7, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm4, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm5, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm8, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, (%rax) +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm7, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm8, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm3, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm6, (%r9) +; AVX512DQ-BW-FCP-NEXT: 
vmovdqa %ymm4, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <48 x i32>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll index d806253ef23a0..694f2bc53c515 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll @@ -204,22 +204,22 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0] -; AVX512-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX512-NEXT: vmovdqa (%rdi), %ymm6 -; AVX512-NEXT: vpermi2d %ymm5, %ymm6, %ymm1 -; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] -; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,0,2,3] -; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,3,2,3,6,7,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] -; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512-NEXT: vpermps (%rdi), %zmm1, %zmm1 +; AVX512-NEXT: vmovaps (%rdi), %ymm5 +; AVX512-NEXT: vmovaps 32(%rdi), %ymm6 +; AVX512-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] +; AVX512-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX512-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3] +; AVX512-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7] +; AVX512-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] +; AVX512-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX512-NEXT: vmovq %xmm2, (%rsi) ; AVX512-NEXT: vmovq %xmm3, (%rdx) ; AVX512-NEXT: vmovq %xmm4, (%rcx) ; AVX512-NEXT: vmovq %xmm0, (%r8) -; AVX512-NEXT: vmovq %xmm1, (%r9) -; AVX512-NEXT: vmovq %xmm7, (%r10) -; AVX512-NEXT: vmovq %xmm5, (%rax) +; AVX512-NEXT: vmovlps %xmm1, (%r9) +; AVX512-NEXT: vmovlps %xmm7, (%r10) +; AVX512-NEXT: vmovlps %xmm5, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -227,30 +227,31 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512-FCP-NEXT: vpinsrd $1, 28(%rdi), %xmm0, %xmm2 -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,4,1,4] -; AVX512-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 -; AVX512-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm4 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [7,2,0,0] -; AVX512-FCP-NEXT: vpermi2d %xmm0, %xmm1, %xmm5 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,11,0,0] -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm6 -; AVX512-FCP-NEXT: vpermi2d %ymm1, %ymm6, %ymm0 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [13,4,6,7] -; AVX512-FCP-NEXT: vpermi2d %ymm6, %ymm1, %ymm7 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,13,6,7] -; AVX512-FCP-NEXT: vpermi2d %ymm1, %ymm6, %ymm8 -; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512-FCP-NEXT: vmovq %xmm3, (%rdx) -; AVX512-FCP-NEXT: vmovq %xmm4, (%rcx) -; AVX512-FCP-NEXT: vmovq %xmm5, (%r8) -; AVX512-FCP-NEXT: vmovq %xmm0, (%r9) +; AVX512-FCP-NEXT: vmovaps (%rdi), %zmm0 +; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512-FCP-NEXT: vmovdqa 
32(%rdi), %xmm2 +; AVX512-FCP-NEXT: vpinsrd $1, 28(%rdi), %xmm1, %xmm3 +; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,4,1,4] +; AVX512-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm4 +; AVX512-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm5 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm2[1],xmm5[2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,2,0,0] +; AVX512-FCP-NEXT: vpermi2d %xmm1, %xmm2, %xmm6 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0] +; AVX512-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm1 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [13,4,6,7] +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX512-FCP-NEXT: vpermt2d (%rdi), %ymm2, %ymm7 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,6,13,6,7] +; AVX512-FCP-NEXT: vpermps %zmm0, %zmm2, %zmm0 +; AVX512-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512-FCP-NEXT: vmovq %xmm3, (%rsi) +; AVX512-FCP-NEXT: vmovq %xmm4, (%rdx) +; AVX512-FCP-NEXT: vmovq %xmm5, (%rcx) +; AVX512-FCP-NEXT: vmovq %xmm6, (%r8) +; AVX512-FCP-NEXT: vmovlps %xmm1, (%r9) ; AVX512-FCP-NEXT: vmovq %xmm7, (%r10) -; AVX512-FCP-NEXT: vmovq %xmm8, (%rax) +; AVX512-FCP-NEXT: vmovlps %xmm0, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -269,22 +270,22 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0] -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm6 -; AVX512DQ-NEXT: vpermi2d %ymm5, %ymm6, %ymm1 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] -; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,0,2,3] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,3,2,3,6,7,6,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] -; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512DQ-NEXT: vpermps (%rdi), %zmm1, %zmm1 +; AVX512DQ-NEXT: vmovaps (%rdi), %ymm5 +; AVX512DQ-NEXT: vmovaps 32(%rdi), %ymm6 +; AVX512DQ-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] +; AVX512DQ-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX512DQ-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3] +; AVX512DQ-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7] +; AVX512DQ-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] +; AVX512DQ-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX512DQ-NEXT: vmovq %xmm2, (%rsi) ; AVX512DQ-NEXT: vmovq %xmm3, (%rdx) ; AVX512DQ-NEXT: vmovq %xmm4, (%rcx) ; AVX512DQ-NEXT: vmovq %xmm0, (%r8) -; AVX512DQ-NEXT: vmovq %xmm1, (%r9) -; AVX512DQ-NEXT: vmovq %xmm7, (%r10) -; AVX512DQ-NEXT: vmovq %xmm5, (%rax) +; AVX512DQ-NEXT: vmovlps %xmm1, (%r9) +; AVX512DQ-NEXT: vmovlps %xmm7, (%r10) +; AVX512DQ-NEXT: vmovlps %xmm5, (%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -292,30 +293,31 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512DQ-FCP-NEXT: vpinsrd $1, 28(%rdi), %xmm0, %xmm2 -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,4,1,4] -; AVX512DQ-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 -; AVX512DQ-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm4 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm4 = 
xmm4[0],xmm1[1],xmm4[2,3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [7,2,0,0] -; AVX512DQ-FCP-NEXT: vpermi2d %xmm0, %xmm1, %xmm5 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,11,0,0] -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm6 -; AVX512DQ-FCP-NEXT: vpermi2d %ymm1, %ymm6, %ymm0 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [13,4,6,7] -; AVX512DQ-FCP-NEXT: vpermi2d %ymm6, %ymm1, %ymm7 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,13,6,7] -; AVX512DQ-FCP-NEXT: vpermi2d %ymm1, %ymm6, %ymm8 -; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rdx) -; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rcx) -; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%r8) -; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%r9) +; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512DQ-FCP-NEXT: vpinsrd $1, 28(%rdi), %xmm1, %xmm3 +; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,4,1,4] +; AVX512DQ-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm4 +; AVX512DQ-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm5 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm2[1],xmm5[2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,2,0,0] +; AVX512DQ-FCP-NEXT: vpermi2d %xmm1, %xmm2, %xmm6 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0] +; AVX512DQ-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm1 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [13,4,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX512DQ-FCP-NEXT: vpermt2d (%rdi), %ymm2, %ymm7 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,6,13,6,7] +; AVX512DQ-FCP-NEXT: vpermps %zmm0, %zmm2, %zmm0 +; AVX512DQ-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rsi) +; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rdx) +; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%rcx) +; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%r8) +; AVX512DQ-FCP-NEXT: vmovlps %xmm1, (%r9) ; AVX512DQ-FCP-NEXT: vmovq %xmm7, (%r10) -; AVX512DQ-FCP-NEXT: vmovq %xmm8, (%rax) +; AVX512DQ-FCP-NEXT: vmovlps %xmm0, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -334,22 +336,22 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0] -; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm6 -; AVX512BW-NEXT: vpermi2d %ymm5, %ymm6, %ymm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] -; AVX512BW-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,0,2,3] -; AVX512BW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,3,2,3,6,7,6,7] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] -; AVX512BW-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512BW-NEXT: vpermps (%rdi), %zmm1, %zmm1 +; AVX512BW-NEXT: vmovaps (%rdi), %ymm5 +; AVX512BW-NEXT: vmovaps 32(%rdi), %ymm6 +; AVX512BW-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] +; AVX512BW-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX512BW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3] +; AVX512BW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7] +; AVX512BW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] +; AVX512BW-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX512BW-NEXT: vmovq %xmm2, (%rsi) ; AVX512BW-NEXT: vmovq %xmm3, (%rdx) ; AVX512BW-NEXT: vmovq %xmm4, 
(%rcx) ; AVX512BW-NEXT: vmovq %xmm0, (%r8) -; AVX512BW-NEXT: vmovq %xmm1, (%r9) -; AVX512BW-NEXT: vmovq %xmm7, (%r10) -; AVX512BW-NEXT: vmovq %xmm5, (%rax) +; AVX512BW-NEXT: vmovlps %xmm1, (%r9) +; AVX512BW-NEXT: vmovlps %xmm7, (%r10) +; AVX512BW-NEXT: vmovlps %xmm5, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -357,30 +359,31 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512BW-FCP-NEXT: vpinsrd $1, 28(%rdi), %xmm0, %xmm2 -; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,4,1,4] -; AVX512BW-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 -; AVX512BW-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm4 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [7,2,0,0] -; AVX512BW-FCP-NEXT: vpermi2d %xmm0, %xmm1, %xmm5 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,11,0,0] -; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm6 -; AVX512BW-FCP-NEXT: vpermi2d %ymm1, %ymm6, %ymm0 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [13,4,6,7] -; AVX512BW-FCP-NEXT: vpermi2d %ymm6, %ymm1, %ymm7 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,13,6,7] -; AVX512BW-FCP-NEXT: vpermi2d %ymm1, %ymm6, %ymm8 -; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rdx) -; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rcx) -; AVX512BW-FCP-NEXT: vmovq %xmm5, (%r8) -; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r9) +; AVX512BW-FCP-NEXT: vmovaps (%rdi), %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512BW-FCP-NEXT: vpinsrd $1, 28(%rdi), %xmm1, %xmm3 +; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,4,1,4] +; AVX512BW-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm4 +; AVX512BW-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm5 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm2[1],xmm5[2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,2,0,0] +; AVX512BW-FCP-NEXT: vpermi2d %xmm1, %xmm2, %xmm6 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0] +; AVX512BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm1 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [13,4,6,7] +; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX512BW-FCP-NEXT: vpermt2d (%rdi), %ymm2, %ymm7 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,6,13,6,7] +; AVX512BW-FCP-NEXT: vpermps %zmm0, %zmm2, %zmm0 +; AVX512BW-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rsi) +; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rdx) +; AVX512BW-FCP-NEXT: vmovq %xmm5, (%rcx) +; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r8) +; AVX512BW-FCP-NEXT: vmovlps %xmm1, (%r9) ; AVX512BW-FCP-NEXT: vmovq %xmm7, (%r10) -; AVX512BW-FCP-NEXT: vmovq %xmm8, (%rax) +; AVX512BW-FCP-NEXT: vmovlps %xmm0, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -399,22 +402,22 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0] -; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm6 -; AVX512DQ-BW-NEXT: vpermi2d %ymm5, %ymm6, %ymm1 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] -; 
AVX512DQ-BW-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,0,2,3] -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,3,2,3,6,7,6,7] -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512DQ-BW-NEXT: vpermps (%rdi), %zmm1, %zmm1 +; AVX512DQ-BW-NEXT: vmovaps (%rdi), %ymm5 +; AVX512DQ-BW-NEXT: vmovaps 32(%rdi), %ymm6 +; AVX512DQ-BW-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] +; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX512DQ-BW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3] +; AVX512DQ-BW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7] +; AVX512DQ-BW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] +; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rsi) ; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rdx) ; AVX512DQ-BW-NEXT: vmovq %xmm4, (%rcx) ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r8) -; AVX512DQ-BW-NEXT: vmovq %xmm1, (%r9) -; AVX512DQ-BW-NEXT: vmovq %xmm7, (%r10) -; AVX512DQ-BW-NEXT: vmovq %xmm5, (%rax) +; AVX512DQ-BW-NEXT: vmovlps %xmm1, (%r9) +; AVX512DQ-BW-NEXT: vmovlps %xmm7, (%r10) +; AVX512DQ-BW-NEXT: vmovlps %xmm5, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -422,30 +425,31 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512DQ-BW-FCP-NEXT: vpinsrd $1, 28(%rdi), %xmm0, %xmm2 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,4,1,4] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm4 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [7,2,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm0, %xmm1, %xmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,11,0,0] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm6 -; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm1, %ymm6, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [13,4,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm6, %ymm1, %ymm7 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,13,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm1, %ymm6, %ymm8 -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512DQ-BW-FCP-NEXT: vpinsrd $1, 28(%rdi), %xmm1, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,4,1,4] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm4 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm5 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm2[1],xmm5[2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,2,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm1, %xmm2, %xmm6 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [13,4,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm7 +; 
AVX512DQ-BW-FCP-NEXT: vpermt2d (%rdi), %ymm2, %ymm7 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,6,13,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermps %zmm0, %zmm2, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm1, (%r9) ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm7, (%r10) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm8, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm0, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <14 x i32>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll index f0c95f4fa9ef8..8d7f8d1db8522 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll @@ -222,24 +222,25 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,0,0] ; AVX512-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,13,5,5] -; AVX512-FCP-NEXT: vpermi2d %ymm1, %ymm4, %ymm6 -; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512-FCP-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX512-FCP-NEXT: vmovaps (%rdi), %ymm4 +; AVX512-FCP-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] +; AVX512-FCP-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,5,13,5,5] +; AVX512-FCP-NEXT: vpermps (%rdi), %zmm6, %zmm6 +; AVX512-FCP-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX512-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] +; AVX512-FCP-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX512-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] +; AVX512-FCP-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi) ; AVX512-FCP-NEXT: vmovq %xmm3, (%rdx) ; AVX512-FCP-NEXT: vmovq %xmm0, (%rcx) ; AVX512-FCP-NEXT: vpextrq $1, %xmm0, (%r8) -; AVX512-FCP-NEXT: vmovq %xmm5, (%r9) -; AVX512-FCP-NEXT: vmovq %xmm6, (%r11) -; AVX512-FCP-NEXT: vmovq %xmm4, (%r10) -; AVX512-FCP-NEXT: vmovq %xmm1, (%rax) +; AVX512-FCP-NEXT: vmovlps %xmm5, (%r9) +; AVX512-FCP-NEXT: vmovlps %xmm6, (%r11) +; AVX512-FCP-NEXT: vmovlps %xmm4, (%r10) +; AVX512-FCP-NEXT: vmovlps %xmm1, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -287,24 +288,25 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm4 -; 
AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,13,5,5] -; AVX512DQ-FCP-NEXT: vpermi2d %ymm1, %ymm4, %ymm6 -; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512DQ-FCP-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %ymm4 +; AVX512DQ-FCP-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] +; AVX512DQ-FCP-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,5,13,5,5] +; AVX512DQ-FCP-NEXT: vpermps (%rdi), %zmm6, %zmm6 +; AVX512DQ-FCP-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX512DQ-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] +; AVX512DQ-FCP-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX512DQ-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] +; AVX512DQ-FCP-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi) ; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rdx) ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rcx) ; AVX512DQ-FCP-NEXT: vpextrq $1, %xmm0, (%r8) -; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%r9) -; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%r11) -; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%r10) -; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%rax) +; AVX512DQ-FCP-NEXT: vmovlps %xmm5, (%r9) +; AVX512DQ-FCP-NEXT: vmovlps %xmm6, (%r11) +; AVX512DQ-FCP-NEXT: vmovlps %xmm4, (%r10) +; AVX512DQ-FCP-NEXT: vmovlps %xmm1, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -352,24 +354,25 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 ; AVX512BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,13,5,5] -; AVX512BW-FCP-NEXT: vpermi2d %ymm1, %ymm4, %ymm6 -; AVX512BW-FCP-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX512BW-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512BW-FCP-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX512BW-FCP-NEXT: vmovaps (%rdi), %ymm4 +; AVX512BW-FCP-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] +; AVX512BW-FCP-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,5,13,5,5] +; AVX512BW-FCP-NEXT: vpermps (%rdi), %zmm6, %zmm6 +; AVX512BW-FCP-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX512BW-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] +; AVX512BW-FCP-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX512BW-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] +; AVX512BW-FCP-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi) ; AVX512BW-FCP-NEXT: vmovq 
%xmm3, (%rdx) ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rcx) ; AVX512BW-FCP-NEXT: vpextrq $1, %xmm0, (%r8) -; AVX512BW-FCP-NEXT: vmovq %xmm5, (%r9) -; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r11) -; AVX512BW-FCP-NEXT: vmovq %xmm4, (%r10) -; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rax) +; AVX512BW-FCP-NEXT: vmovlps %xmm5, (%r9) +; AVX512BW-FCP-NEXT: vmovlps %xmm6, (%r11) +; AVX512BW-FCP-NEXT: vmovlps %xmm4, (%r10) +; AVX512BW-FCP-NEXT: vmovlps %xmm1, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -417,24 +420,25 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 ; AVX512DQ-BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,13,5,5] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm1, %ymm4, %ymm6 -; AVX512DQ-BW-FCP-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX512DQ-BW-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512DQ-BW-FCP-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %ymm4 +; AVX512DQ-BW-FCP-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] +; AVX512DQ-BW-FCP-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,5,13,5,5] +; AVX512DQ-BW-FCP-NEXT: vpermps (%rdi), %zmm6, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX512DQ-BW-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] +; AVX512DQ-BW-FCP-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX512DQ-BW-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] +; AVX512DQ-BW-FCP-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rcx) ; AVX512DQ-BW-FCP-NEXT: vpextrq $1, %xmm0, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r11) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%r10) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm5, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm6, (%r11) +; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm4, (%r10) +; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm1, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <16 x i32>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-2.ll index 2381df6d73289..aa7d8ceb14950 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-2.ll @@ -245,13 +245,12 @@ define void @load_i64_stride2_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512-FCP-LABEL: load_i64_stride2_vf4: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,2,4,6] -; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512-FCP-NEXT: vpermi2q %ymm2, %ymm1, 
%ymm0 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,3,5,7] -; AVX512-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm3 -; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rsi) -; AVX512-FCP-NEXT: vmovdqa %ymm3, (%rdx) +; AVX512-FCP-NEXT: vmovaps (%rdi), %zmm1 +; AVX512-FCP-NEXT: vpermpd %zmm1, %zmm0, %zmm0 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,3,5,7] +; AVX512-FCP-NEXT: vpermpd %zmm1, %zmm2, %zmm1 +; AVX512-FCP-NEXT: vmovaps %ymm0, (%rsi) +; AVX512-FCP-NEXT: vmovaps %ymm1, (%rdx) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -271,13 +270,12 @@ define void @load_i64_stride2_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512DQ-FCP-LABEL: load_i64_stride2_vf4: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,2,4,6] -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512DQ-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,3,5,7] -; AVX512DQ-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, (%rdx) +; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %zmm1 +; AVX512DQ-FCP-NEXT: vpermpd %zmm1, %zmm0, %zmm0 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,3,5,7] +; AVX512DQ-FCP-NEXT: vpermpd %zmm1, %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vmovaps %ymm0, (%rsi) +; AVX512DQ-FCP-NEXT: vmovaps %ymm1, (%rdx) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -297,13 +295,12 @@ define void @load_i64_stride2_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512BW-FCP-LABEL: load_i64_stride2_vf4: ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,2,4,6] -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512BW-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,3,5,7] -; AVX512BW-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm3 -; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rsi) -; AVX512BW-FCP-NEXT: vmovdqa %ymm3, (%rdx) +; AVX512BW-FCP-NEXT: vmovaps (%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vpermpd %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,3,5,7] +; AVX512BW-FCP-NEXT: vpermpd %zmm1, %zmm2, %zmm1 +; AVX512BW-FCP-NEXT: vmovaps %ymm0, (%rsi) +; AVX512BW-FCP-NEXT: vmovaps %ymm1, (%rdx) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -323,13 +320,12 @@ define void @load_i64_stride2_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512DQ-BW-FCP-LABEL: load_i64_stride2_vf4: ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,2,4,6] -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,3,5,7] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm3, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermpd %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,3,5,7] +; AVX512DQ-BW-FCP-NEXT: vpermpd %zmm1, %zmm2, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovaps %ymm0, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovaps %ymm1, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <8 x i64>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll index 
f82bcd1ce3e1e..7d3209397c3df 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll @@ -611,32 +611,31 @@ define void @load_i64_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [1,7,13,0] ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm1 -; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [10,0,6,0] -; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,1,2,4] -; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm7 -; AVX512-FCP-NEXT: vpermi2q %ymm7, %ymm5, %ymm6 -; AVX512-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm5 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [11,1,7,0] -; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm8 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,0,0,6] -; AVX512-FCP-NEXT: vpermi2q %ymm7, %ymm4, %ymm8 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm4 = [4,10] -; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [10,0,6,0] +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,1,2,4] +; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm6 +; AVX512-FCP-NEXT: vpermi2q %ymm6, %ymm4, %ymm5 +; AVX512-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm4 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [11,1,7,0] +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm7 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,4,0,6] +; AVX512-FCP-NEXT: vpermq 128(%rdi), %zmm7, %zmm7 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [4,10] +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX512-FCP-NEXT: vpbroadcastq 136(%rdi), %ymm8 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [5,11] ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm6[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512-FCP-NEXT: vmovdqa %ymm1, (%rdx) -; AVX512-FCP-NEXT: vmovdqa %ymm6, (%rcx) -; AVX512-FCP-NEXT: vmovdqa %ymm5, (%r8) -; AVX512-FCP-NEXT: vmovdqa %ymm4, (%r9) +; AVX512-FCP-NEXT: vmovdqa %ymm5, (%rcx) +; AVX512-FCP-NEXT: vmovdqa %ymm4, (%r8) +; AVX512-FCP-NEXT: vmovdqa %ymm7, (%r9) ; AVX512-FCP-NEXT: vmovdqa %ymm2, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq @@ -694,32 +693,31 @@ define void @load_i64_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [1,7,13,0] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [10,0,6,0] -; 
AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,1,2,4] -; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm7 -; AVX512DQ-FCP-NEXT: vpermi2q %ymm7, %ymm5, %ymm6 -; AVX512DQ-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm5 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [11,1,7,0] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm8 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,0,0,6] -; AVX512DQ-FCP-NEXT: vpermi2q %ymm7, %ymm4, %ymm8 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm4 = [4,10] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [10,0,6,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,1,2,4] +; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm6 +; AVX512DQ-FCP-NEXT: vpermi2q %ymm6, %ymm4, %ymm5 +; AVX512DQ-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm4 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [11,1,7,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm7 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,4,0,6] +; AVX512DQ-FCP-NEXT: vpermq 128(%rdi), %zmm7, %zmm7 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [4,10] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpbroadcastq 136(%rdi), %ymm8 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [5,11] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm6[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, (%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, (%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, (%r8) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, (%r9) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, (%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, (%r8) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, (%r9) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq @@ -777,32 +775,31 @@ define void @load_i64_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [1,7,13,0] ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [10,0,6,0] -; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,1,2,4] -; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm7 -; AVX512BW-FCP-NEXT: vpermi2q %ymm7, %ymm5, %ymm6 -; AVX512BW-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm5 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [11,1,7,0] -; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm8 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,0,0,6] -; AVX512BW-FCP-NEXT: vpermi2q %ymm7, 
%ymm4, %ymm8 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm4 = [4,10] -; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [10,0,6,0] +; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,1,2,4] +; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm6 +; AVX512BW-FCP-NEXT: vpermi2q %ymm6, %ymm4, %ymm5 +; AVX512BW-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm4 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [11,1,7,0] +; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm7 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,4,0,6] +; AVX512BW-FCP-NEXT: vpermq 128(%rdi), %zmm7, %zmm7 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [4,10] +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX512BW-FCP-NEXT: vpbroadcastq 136(%rdi), %ymm8 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [5,11] ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm6[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa %ymm1, (%rdx) -; AVX512BW-FCP-NEXT: vmovdqa %ymm6, (%rcx) -; AVX512BW-FCP-NEXT: vmovdqa %ymm5, (%r8) -; AVX512BW-FCP-NEXT: vmovdqa %ymm4, (%r9) +; AVX512BW-FCP-NEXT: vmovdqa %ymm5, (%rcx) +; AVX512BW-FCP-NEXT: vmovdqa %ymm4, (%r8) +; AVX512BW-FCP-NEXT: vmovdqa %ymm7, (%r9) ; AVX512BW-FCP-NEXT: vmovdqa %ymm2, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq @@ -860,32 +857,31 @@ define void @load_i64_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [1,7,13,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [10,0,6,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,1,2,4] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm7 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %ymm7, %ymm5, %ymm6 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [11,1,7,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,0,0,6] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %ymm7, %ymm4, %ymm8 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm4 = [4,10] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [10,0,6,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,1,2,4] +; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), 
%ymm6 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %ymm6, %ymm4, %ymm5 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm4 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [11,1,7,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,4,0,6] +; AVX512DQ-BW-FCP-NEXT: vpermq 128(%rdi), %zmm7, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [4,10] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpbroadcastq 136(%rdi), %ymm8 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [5,11] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm6[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm1, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm6, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm5, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm4, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm5, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm4, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm7, (%r9) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll index 4e5501b1041d3..cc3e5f3d1d82e 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll @@ -709,28 +709,28 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm1 ; AVX512-FCP-NEXT: vpbroadcastq 176(%rdi), %ymm2 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,0,7] -; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX512-FCP-NEXT: vpermi2q 160(%rdi), %ymm6, %ymm2 -; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm7 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],mem[2,3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm7 -; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %xmm8 -; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[2,3],ymm9[2,3] +; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,0,0,7] +; AVX512-FCP-NEXT: vpermq %zmm3, %zmm6, %zmm6 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm6 +; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %xmm7 +; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm8 = 
mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm6[2,3],ymm8[2,3] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm8 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11] ; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm9 -; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm6 = ymm6[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm8 = ymm8[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [5,12] ; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,11] ; AVX512-FCP-NEXT: vpermi2q 192(%rdi), %zmm3, %zmm9 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [6,13] @@ -739,9 +739,9 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512-FCP-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512-FCP-NEXT: vmovdqa %ymm2, (%rcx) -; AVX512-FCP-NEXT: vmovdqa %ymm7, (%r8) -; AVX512-FCP-NEXT: vmovdqa %ymm8, (%r9) -; AVX512-FCP-NEXT: vmovdqa %ymm6, (%r10) +; AVX512-FCP-NEXT: vmovdqa %ymm6, (%r8) +; AVX512-FCP-NEXT: vmovdqa %ymm7, (%r9) +; AVX512-FCP-NEXT: vmovdqa %ymm8, (%r10) ; AVX512-FCP-NEXT: vmovdqa %ymm3, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq @@ -814,28 +814,28 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm1 ; AVX512DQ-FCP-NEXT: vpbroadcastq 176(%rdi), %ymm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,0,7] -; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX512DQ-FCP-NEXT: vpermi2q 160(%rdi), %ymm6, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm7 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],mem[2,3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm7 -; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %xmm8 -; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[2,3],ymm9[2,3] +; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,0,0,7] +; AVX512DQ-FCP-NEXT: vpermq %zmm3, %zmm6, %zmm6 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm6 +; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm6 = 
mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %xmm7 +; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm6[2,3],ymm8[2,3] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm8 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm9 -; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm6 = ymm6[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm8 = ymm8[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [5,12] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,11] ; AVX512DQ-FCP-NEXT: vpermi2q 192(%rdi), %zmm3, %zmm9 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [6,13] @@ -844,9 +844,9 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, (%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, (%r8) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, (%r9) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, (%r10) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, (%r8) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, (%r9) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, (%r10) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq @@ -919,28 +919,28 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm1 ; AVX512BW-FCP-NEXT: vpbroadcastq 176(%rdi), %ymm2 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,0,7] -; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX512BW-FCP-NEXT: vpermi2q 160(%rdi), %ymm6, %ymm2 -; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm7 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],mem[2,3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm7 -; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %xmm8 -; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[2,3],ymm9[2,3] +; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,0,0,7] +; AVX512BW-FCP-NEXT: vpermq 
%zmm3, %zmm6, %zmm6 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm6 +; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %xmm7 +; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7] +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm6[2,3],ymm8[2,3] +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm8 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11] ; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm9 -; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm6 = ymm6[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm8 = ymm8[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [5,12] ; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,11] ; AVX512BW-FCP-NEXT: vpermi2q 192(%rdi), %zmm3, %zmm9 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [6,13] @@ -949,9 +949,9 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512BW-FCP-NEXT: vmovdqa %ymm2, (%rcx) -; AVX512BW-FCP-NEXT: vmovdqa %ymm7, (%r8) -; AVX512BW-FCP-NEXT: vmovdqa %ymm8, (%r9) -; AVX512BW-FCP-NEXT: vmovdqa %ymm6, (%r10) +; AVX512BW-FCP-NEXT: vmovdqa %ymm6, (%r8) +; AVX512BW-FCP-NEXT: vmovdqa %ymm7, (%r9) +; AVX512BW-FCP-NEXT: vmovdqa %ymm8, (%r10) ; AVX512BW-FCP-NEXT: vmovdqa %ymm3, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq @@ -1024,28 +1024,28 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastq 176(%rdi), %ymm2 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,0,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX512DQ-BW-FCP-NEXT: vpermi2q 160(%rdi), %ymm6, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm7 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],mem[2,3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm7 -; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %xmm8 -; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512DQ-BW-FCP-NEXT: vperm2i128 
{{.*#+}} ymm7 = ymm7[2,3],ymm9[2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,0,0,7] +; AVX512DQ-BW-FCP-NEXT: vpermq %zmm3, %zmm6, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm6 +; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %xmm7 +; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm6[2,3],ymm8[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm8 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm9 -; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm6 = ymm6[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm8 = ymm8[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [5,12] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,11] ; AVX512DQ-BW-FCP-NEXT: vpermi2q 192(%rdi), %zmm3, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [6,13] @@ -1054,9 +1054,9 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm7, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm8, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm6, (%r10) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm6, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm7, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm8, (%r10) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm3, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll index 181f5651784d8..acedcf4263906 100644 --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll +++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll @@ -1337,10 +1337,9 @@ define void @vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8(ptr %in. 
; ; AVX512BW-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31] -; AVX512BW-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1 -; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31] +; AVX512BW-NEXT: vpermw (%rdi), %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -1789,10 +1788,9 @@ define void @vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2(ptr %i ; ; AVX512F-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,5,0,7] -; AVX512F-FAST-NEXT: vpermi2q 32(%rdi), %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm0 +; AVX512F-FAST-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,5,0,7] +; AVX512F-FAST-NEXT: vpermq (%rdi), %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq @@ -1808,10 +1806,9 @@ define void @vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2(ptr %i ; ; AVX512DQ-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,5,0,7] -; AVX512DQ-FAST-NEXT: vpermi2q 32(%rdi), %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm0 +; AVX512DQ-FAST-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,5,0,7] +; AVX512DQ-FAST-NEXT: vpermq (%rdi), %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq @@ -1827,10 +1824,9 @@ define void @vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2(ptr %i ; ; AVX512BW-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: ; AVX512BW-FAST: # %bb.0: -; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,5,0,7] -; AVX512BW-FAST-NEXT: vpermi2q 32(%rdi), %ymm0, %ymm1 -; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm1, %zmm0 +; AVX512BW-FAST-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,5,0,7] +; AVX512BW-FAST-NEXT: vpermq (%rdi), %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-FAST-NEXT: vzeroupper ; AVX512BW-FAST-NEXT: retq From 52b539b9d932bb15ac52c1bbfc98b9a7a2593d2c Mon Sep 17 00:00:00 2001 From: Ryan Mansfield Date: Mon, 13 Jan 2025 09:15:24 -0500 Subject: [PATCH 062/102] [llvm-objdump] Remove leading whitespace for PT_GNU_PROPERTY. (#121591) This fixes the misaligned display of addresses for this p_type. 
Previous: ``` STACK off 0x0000000000000000 vaddr 0x0000000000000000 paddr 0x0000000000000000 align 2**64 filesz 0x0000000000000000 memsz 0x0000000000000000 flags rw- PROPERTY off 0x0000000000000358 vaddr 0x0000000000000358 paddr 0x0000000000000358 align 2**3 filesz 0x0000000000000020 memsz 0x0000000000000020 flags r-- NOTE off 0x0000000000000334 vaddr 0x0000000000000334 paddr 0x0000000000000334 align 2**2 filesz 0x0000000000000020 memsz 0x0000000000000020 flags r-- ``` After: ``` STACK off 0x0000000000000000 vaddr 0x0000000000000000 paddr 0x0000000000000000 align 2**64 filesz 0x0000000000000000 memsz 0x0000000000000000 flags rw- PROPERTY off 0x0000000000000358 vaddr 0x0000000000000358 paddr 0x0000000000000358 align 2**3 filesz 0x0000000000000020 memsz 0x0000000000000020 flags r-- NOTE off 0x0000000000000334 vaddr 0x0000000000000334 paddr 0x0000000000000334 align 2**2 filesz 0x0000000000000020 memsz 0x0000000000000020 flags r-- ``` --- llvm/test/tools/llvm-objdump/ELF/pt-gnu-property.test | 2 +- llvm/tools/llvm-objdump/ELFDump.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/test/tools/llvm-objdump/ELF/pt-gnu-property.test b/llvm/test/tools/llvm-objdump/ELF/pt-gnu-property.test index 246337866a777..27f1252b3d5d1 100644 --- a/llvm/test/tools/llvm-objdump/ELF/pt-gnu-property.test +++ b/llvm/test/tools/llvm-objdump/ELF/pt-gnu-property.test @@ -2,7 +2,7 @@ # RUN: llvm-objdump -p %t | FileCheck %s # CHECK: Program Header: -# CHECK-NEXT: {{ }}PROPERTY{{ }} +# CHECK-NEXT: {{^}}PROPERTY{{ }} --- !ELF FileHeader: diff --git a/llvm/tools/llvm-objdump/ELFDump.cpp b/llvm/tools/llvm-objdump/ELFDump.cpp index d78cf485587e1..e9e5b059f1786 100644 --- a/llvm/tools/llvm-objdump/ELFDump.cpp +++ b/llvm/tools/llvm-objdump/ELFDump.cpp @@ -269,7 +269,7 @@ template void ELFDumper::printProgramHeaders() { outs() << " RELRO "; break; case ELF::PT_GNU_PROPERTY: - outs() << " PROPERTY "; + outs() << "PROPERTY "; break; case ELF::PT_GNU_STACK: outs() << " STACK "; From fd605d0c3086f369f01497a56f7c567491881b0e Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Mon, 13 Jan 2025 09:16:23 -0500 Subject: [PATCH 063/102] [libc++] Redefine Fuchsia locale base support on top of the new API (#122489) This follows the same path we've been doing for all platforms so far, moving away from the old definition of the locale base API. 
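For reference, a minimal standalone sketch of the guard-based pattern the new `__locale_dir/support/fuchsia.h` relies on: an RAII wrapper around POSIX `uselocale` that makes plain C library calls locale-aware. This is a simplified illustration of the approach in the diff below; `locale_guard` and `btowc_in` are illustrative names, not the actual libc++ identifiers:

```cpp
#include <locale.h> // locale_t, uselocale (POSIX 2008)
#include <wchar.h>  // btowc, wint_t

// Swap in the requested locale for the current thread; restore on scope exit.
struct locale_guard {
  explicit locale_guard(locale_t loc) : old_(uselocale(loc)) {}
  ~locale_guard() {
    if (old_)
      uselocale(old_);
  }
  locale_guard(locale_guard const &) = delete;
  locale_guard &operator=(locale_guard const &) = delete;
  locale_t old_;
};

// Each per-locale entry point is then just the plain C call under the guard.
wint_t btowc_in(int ch, locale_t loc) {
  locale_guard g(loc);
  return btowc(ch);
}
```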
Co-authored-by: Daniel Thornburgh --- libcxx/include/CMakeLists.txt | 4 +- libcxx/include/__locale_dir/locale_base_api.h | 4 +- .../__locale_dir/locale_base_api/fuchsia.h | 18 --- libcxx/include/__locale_dir/support/fuchsia.h | 143 ++++++++++++++++++ .../support/no_locale/characters.h | 98 ++++++++++++ .../__locale_dir/support/no_locale/strtonum.h | 49 ++++++ libcxx/include/module.modulemap | 4 +- 7 files changed, 298 insertions(+), 22 deletions(-) delete mode 100644 libcxx/include/__locale_dir/locale_base_api/fuchsia.h create mode 100644 libcxx/include/__locale_dir/support/fuchsia.h create mode 100644 libcxx/include/__locale_dir/support/no_locale/characters.h create mode 100644 libcxx/include/__locale_dir/support/no_locale/strtonum.h diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt index e152383a329fe..f3313bf53460a 100644 --- a/libcxx/include/CMakeLists.txt +++ b/libcxx/include/CMakeLists.txt @@ -499,7 +499,6 @@ set(files __locale_dir/locale_base_api.h __locale_dir/locale_base_api/android.h __locale_dir/locale_base_api/bsd_locale_fallbacks.h - __locale_dir/locale_base_api/fuchsia.h __locale_dir/locale_base_api/ibm.h __locale_dir/locale_base_api/musl.h __locale_dir/locale_base_api/openbsd.h @@ -507,6 +506,9 @@ set(files __locale_dir/support/apple.h __locale_dir/support/bsd_like.h __locale_dir/support/freebsd.h + __locale_dir/support/fuchsia.h + __locale_dir/support/no_locale/characters.h + __locale_dir/support/no_locale/strtonum.h __locale_dir/support/windows.h __math/abs.h __math/copysign.h diff --git a/libcxx/include/__locale_dir/locale_base_api.h b/libcxx/include/__locale_dir/locale_base_api.h index bb0da889f4c84..b112a4aef7765 100644 --- a/libcxx/include/__locale_dir/locale_base_api.h +++ b/libcxx/include/__locale_dir/locale_base_api.h @@ -99,6 +99,8 @@ # include <__locale_dir/support/freebsd.h> #elif defined(_LIBCPP_MSVCRT_LIKE) # include <__locale_dir/support/windows.h> +#elif defined(__Fuchsia__) +# include <__locale_dir/support/fuchsia.h> #else // TODO: This is a temporary definition to bridge between the old way we defined the locale base API @@ -111,8 +113,6 @@ # include <__locale_dir/locale_base_api/android.h> # elif defined(__OpenBSD__) # include <__locale_dir/locale_base_api/openbsd.h> -# elif defined(__Fuchsia__) -# include <__locale_dir/locale_base_api/fuchsia.h> # elif defined(__wasi__) || _LIBCPP_HAS_MUSL_LIBC # include <__locale_dir/locale_base_api/musl.h> # endif diff --git a/libcxx/include/__locale_dir/locale_base_api/fuchsia.h b/libcxx/include/__locale_dir/locale_base_api/fuchsia.h deleted file mode 100644 index f6ef454ba7ada..0000000000000 --- a/libcxx/include/__locale_dir/locale_base_api/fuchsia.h +++ /dev/null @@ -1,18 +0,0 @@ -// -*- C++ -*- -//===-----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef _LIBCPP___LOCALE_DIR_LOCALE_BASE_API_FUCHSIA_H -#define _LIBCPP___LOCALE_DIR_LOCALE_BASE_API_FUCHSIA_H - -#include <__support/xlocale/__posix_l_fallback.h> -#include <__support/xlocale/__strtonum_fallback.h> -#include -#include - -#endif // _LIBCPP___LOCALE_DIR_LOCALE_BASE_API_FUCHSIA_H diff --git a/libcxx/include/__locale_dir/support/fuchsia.h b/libcxx/include/__locale_dir/support/fuchsia.h new file mode 100644 index 0000000000000..4a54896c8e268 --- /dev/null +++ b/libcxx/include/__locale_dir/support/fuchsia.h @@ -0,0 +1,143 @@ +//===-----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCPP___LOCALE_DIR_SUPPORT_FUCHSIA_H +#define _LIBCPP___LOCALE_DIR_SUPPORT_FUCHSIA_H + +#include <__config> +#include <__utility/forward.h> +#include // uselocale & friends +#include +#include +#include + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +_LIBCPP_BEGIN_NAMESPACE_STD +namespace __locale { + +struct __locale_guard { + _LIBCPP_HIDE_FROM_ABI __locale_guard(locale_t& __loc) : __old_loc_(::uselocale(__loc)) {} + + _LIBCPP_HIDE_FROM_ABI ~__locale_guard() { + if (__old_loc_) + ::uselocale(__old_loc_); + } + + locale_t __old_loc_; + + __locale_guard(__locale_guard const&) = delete; + __locale_guard& operator=(__locale_guard const&) = delete; +}; + +// +// Locale management +// +using __locale_t = locale_t; + +inline _LIBCPP_HIDE_FROM_ABI __locale_t __newlocale(int __category_mask, const char* __name, __locale_t __loc) { + return ::newlocale(__category_mask, __name, __loc); +} + +inline _LIBCPP_HIDE_FROM_ABI void __freelocale(__locale_t __loc) { ::freelocale(__loc); } + +inline _LIBCPP_HIDE_FROM_ABI lconv* __localeconv(__locale_t& __loc) { + __locale_guard __current(__loc); + return std::localeconv(); +} + +// +// Other functions +// +inline _LIBCPP_HIDE_FROM_ABI decltype(MB_CUR_MAX) __mb_len_max(__locale_t __loc) { + __locale_guard __current(__loc); + return MB_CUR_MAX; +} +#if _LIBCPP_HAS_WIDE_CHARACTERS +inline _LIBCPP_HIDE_FROM_ABI wint_t __btowc(int __ch, __locale_t __loc) { + __locale_guard __current(__loc); + return std::btowc(__ch); +} +inline _LIBCPP_HIDE_FROM_ABI int __wctob(wint_t __ch, __locale_t __loc) { + __locale_guard __current(__loc); + return std::wctob(__ch); +} +inline _LIBCPP_HIDE_FROM_ABI size_t +__wcsnrtombs(char* __dest, const wchar_t** __src, size_t __nwc, size_t __len, mbstate_t* __ps, __locale_t __loc) { + __locale_guard __current(__loc); + return ::wcsnrtombs(__dest, __src, __nwc, __len, __ps); // non-standard +} +inline _LIBCPP_HIDE_FROM_ABI size_t __wcrtomb(char* __s, wchar_t __ch, mbstate_t* __ps, __locale_t __loc) { + __locale_guard __current(__loc); + return std::wcrtomb(__s, __ch, __ps); +} +inline _LIBCPP_HIDE_FROM_ABI size_t +__mbsnrtowcs(wchar_t* __dest, const char** __src, size_t __nms, size_t __len, mbstate_t* __ps, __locale_t __loc) { + __locale_guard __current(__loc); + return ::mbsnrtowcs(__dest, __src, __nms, __len, __ps); // non-standard +} +inline _LIBCPP_HIDE_FROM_ABI size_t +__mbrtowc(wchar_t* __pwc, const char* __s, size_t __n, 
mbstate_t* __ps, __locale_t __loc) { + __locale_guard __current(__loc); + return std::mbrtowc(__pwc, __s, __n, __ps); +} +inline _LIBCPP_HIDE_FROM_ABI int __mbtowc(wchar_t* __pwc, const char* __pmb, size_t __max, __locale_t __loc) { + __locale_guard __current(__loc); + return std::mbtowc(__pwc, __pmb, __max); +} +inline _LIBCPP_HIDE_FROM_ABI size_t __mbrlen(const char* __s, size_t __n, mbstate_t* __ps, __locale_t __loc) { + __locale_guard __current(__loc); + return std::mbrlen(__s, __n, __ps); +} +inline _LIBCPP_HIDE_FROM_ABI size_t +__mbsrtowcs(wchar_t* __dest, const char** __src, size_t __len, mbstate_t* __ps, __locale_t __loc) { + __locale_guard __current(__loc); + return ::mbsrtowcs(__dest, __src, __len, __ps); +} +#endif + +_LIBCPP_DIAGNOSTIC_PUSH +_LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Wgcc-compat") +_LIBCPP_GCC_DIAGNOSTIC_IGNORED("-Wformat-nonliteral") // GCC doesn't support [[gnu::format]] on variadic templates +#ifdef _LIBCPP_COMPILER_CLANG_BASED +# define _LIBCPP_VARIADIC_ATTRIBUTE_FORMAT(...) _LIBCPP_ATTRIBUTE_FORMAT(__VA_ARGS__) +#else +# define _LIBCPP_VARIADIC_ATTRIBUTE_FORMAT(...) /* nothing */ +#endif + +template +_LIBCPP_HIDE_FROM_ABI _LIBCPP_VARIADIC_ATTRIBUTE_FORMAT(__printf__, 4, 5) int __snprintf( + char* __s, size_t __n, __locale_t __loc, const char* __format, _Args&&... __args) { + __locale_guard __current(__loc); + return std::snprintf(__s, __n, __format, std::forward<_Args>(__args)...); +} +template +_LIBCPP_HIDE_FROM_ABI _LIBCPP_VARIADIC_ATTRIBUTE_FORMAT(__printf__, 3, 4) int __asprintf( + char** __s, __locale_t __loc, const char* __format, _Args&&... __args) { + __locale_guard __current(__loc); + return ::asprintf(__s, __format, std::forward<_Args>(__args)...); // non-standard +} +template +_LIBCPP_HIDE_FROM_ABI _LIBCPP_VARIADIC_ATTRIBUTE_FORMAT(__scanf__, 3, 4) int __sscanf( + const char* __s, __locale_t __loc, const char* __format, _Args&&... __args) { + __locale_guard __current(__loc); + return std::sscanf(__s, __format, std::forward<_Args>(__args)...); +} + +_LIBCPP_DIAGNOSTIC_POP +#undef _LIBCPP_VARIADIC_ATTRIBUTE_FORMAT + +} // namespace __locale +_LIBCPP_END_NAMESPACE_STD + +#include <__locale_dir/support/no_locale/characters.h> +#include <__locale_dir/support/no_locale/strtonum.h> + +#endif // _LIBCPP___LOCALE_DIR_SUPPORT_FUCHSIA_H diff --git a/libcxx/include/__locale_dir/support/no_locale/characters.h b/libcxx/include/__locale_dir/support/no_locale/characters.h new file mode 100644 index 0000000000000..20e45fc350e2e --- /dev/null +++ b/libcxx/include/__locale_dir/support/no_locale/characters.h @@ -0,0 +1,98 @@ +//===-----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCPP___LOCALE_DIR_SUPPORT_NO_LOCALE_CHARACTERS_H +#define _LIBCPP___LOCALE_DIR_SUPPORT_NO_LOCALE_CHARACTERS_H + +#include <__config> +#include <__cstddef/size_t.h> +#include +#include +#include +#include +#if _LIBCPP_HAS_WIDE_CHARACTERS +# include +#endif + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +_LIBCPP_BEGIN_NAMESPACE_STD +namespace __locale { + +// +// Character manipulation functions +// +inline _LIBCPP_HIDE_FROM_ABI int __islower(int __c, __locale_t) { return std::islower(__c); } + +inline _LIBCPP_HIDE_FROM_ABI int __isupper(int __c, __locale_t) { return std::isupper(__c); } + +inline _LIBCPP_HIDE_FROM_ABI int __isdigit(int __c, __locale_t) { return std::isdigit(__c); } + +inline _LIBCPP_HIDE_FROM_ABI int __isxdigit(int __c, __locale_t) { return std::isxdigit(__c); } + +inline _LIBCPP_HIDE_FROM_ABI int __toupper(int __c, __locale_t) { return std::toupper(__c); } + +inline _LIBCPP_HIDE_FROM_ABI int __tolower(int __c, __locale_t) { return std::tolower(__c); } + +inline _LIBCPP_HIDE_FROM_ABI int __strcoll(const char* __s1, const char* __s2, __locale_t) { + return std::strcoll(__s1, __s2); +} + +inline _LIBCPP_HIDE_FROM_ABI size_t __strxfrm(char* __dest, const char* __src, size_t __n, __locale_t) { + return std::strxfrm(__dest, __src, __n); +} + +#if _LIBCPP_HAS_WIDE_CHARACTERS +inline _LIBCPP_HIDE_FROM_ABI int __iswctype(wint_t __c, wctype_t __type, __locale_t) { + return std::iswctype(__c, __type); +} + +inline _LIBCPP_HIDE_FROM_ABI int __iswspace(wint_t __c, __locale_t) { return std::iswspace(__c); } + +inline _LIBCPP_HIDE_FROM_ABI int __iswprint(wint_t __c, __locale_t) { return std::iswprint(__c); } + +inline _LIBCPP_HIDE_FROM_ABI int __iswcntrl(wint_t __c, __locale_t) { return std::iswcntrl(__c); } + +inline _LIBCPP_HIDE_FROM_ABI int __iswupper(wint_t __c, __locale_t) { return std::iswupper(__c); } + +inline _LIBCPP_HIDE_FROM_ABI int __iswlower(wint_t __c, __locale_t) { return std::iswlower(__c); } + +inline _LIBCPP_HIDE_FROM_ABI int __iswalpha(wint_t __c, __locale_t) { return std::iswalpha(__c); } + +inline _LIBCPP_HIDE_FROM_ABI int __iswblank(wint_t __c, __locale_t) { return std::iswblank(__c); } + +inline _LIBCPP_HIDE_FROM_ABI int __iswdigit(wint_t __c, __locale_t) { return std::iswdigit(__c); } + +inline _LIBCPP_HIDE_FROM_ABI int __iswpunct(wint_t __c, __locale_t) { return std::iswpunct(__c); } + +inline _LIBCPP_HIDE_FROM_ABI int __iswxdigit(wint_t __c, __locale_t) { return std::iswxdigit(__c); } + +inline _LIBCPP_HIDE_FROM_ABI wint_t __towupper(wint_t __c, __locale_t) { return std::towupper(__c); } + +inline _LIBCPP_HIDE_FROM_ABI wint_t __towlower(wint_t __c, __locale_t) { return std::towlower(__c); } + +inline _LIBCPP_HIDE_FROM_ABI int __wcscoll(const wchar_t* __ws1, const wchar_t* __ws2, __locale_t) { + return std::wcscoll(__ws1, __ws2); +} + +inline _LIBCPP_HIDE_FROM_ABI size_t __wcsxfrm(wchar_t* __dest, const wchar_t* __src, size_t __n, __locale_t) { + return std::wcsxfrm(__dest, __src, __n); +} +#endif // _LIBCPP_HAS_WIDE_CHARACTERS + +inline _LIBCPP_HIDE_FROM_ABI size_t +__strftime(char* __s, size_t __max, const char* __format, const struct tm* __tm, __locale_t) { + return std::strftime(__s, __max, __format, __tm); +} + +} // namespace __locale +_LIBCPP_END_NAMESPACE_STD + +#endif // _LIBCPP___LOCALE_DIR_SUPPORT_NO_LOCALE_CHARACTERS_H diff --git 
a/libcxx/include/__locale_dir/support/no_locale/strtonum.h b/libcxx/include/__locale_dir/support/no_locale/strtonum.h new file mode 100644 index 0000000000000..0e7a32993e736 --- /dev/null +++ b/libcxx/include/__locale_dir/support/no_locale/strtonum.h @@ -0,0 +1,49 @@ +//===-----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCPP___LOCALE_DIR_SUPPORT_NO_LOCALE_STRTONUM_H +#define _LIBCPP___LOCALE_DIR_SUPPORT_NO_LOCALE_STRTONUM_H + +#include <__config> +#include + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +_LIBCPP_BEGIN_NAMESPACE_STD +namespace __locale { + +// +// Strtonum functions +// +inline _LIBCPP_HIDE_FROM_ABI float __strtof(const char* __nptr, char** __endptr, __locale_t) { + return std::strtof(__nptr, __endptr); +} + +inline _LIBCPP_HIDE_FROM_ABI double __strtod(const char* __nptr, char** __endptr, __locale_t) { + return std::strtod(__nptr, __endptr); +} + +inline _LIBCPP_HIDE_FROM_ABI long double __strtold(const char* __nptr, char** __endptr, __locale_t) { + return std::strtold(__nptr, __endptr); +} + +inline _LIBCPP_HIDE_FROM_ABI long long __strtoll(const char* __nptr, char** __endptr, int __base, __locale_t) { + return std::strtoll(__nptr, __endptr, __base); +} + +inline _LIBCPP_HIDE_FROM_ABI unsigned long long +__strtoull(const char* __nptr, char** __endptr, int __base, __locale_t) { + return std::strtoull(__nptr, __endptr, __base); +} + +} // namespace __locale +_LIBCPP_END_NAMESPACE_STD + +#endif // _LIBCPP___LOCALE_DIR_SUPPORT_NO_LOCALE_STRTONUM_H diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap index e3204820b5c25..69f1b7d094ada 100644 --- a/libcxx/include/module.modulemap +++ b/libcxx/include/module.modulemap @@ -1480,13 +1480,15 @@ module std [system] { textual header "__locale_dir/support/apple.h" textual header "__locale_dir/support/bsd_like.h" textual header "__locale_dir/support/freebsd.h" + textual header "__locale_dir/support/fuchsia.h" + textual header "__locale_dir/support/no_locale/characters.h" + textual header "__locale_dir/support/no_locale/strtonum.h" textual header "__locale_dir/support/windows.h" } module locale_base_api { textual header "__locale_dir/locale_base_api/android.h" textual header "__locale_dir/locale_base_api/bsd_locale_fallbacks.h" - textual header "__locale_dir/locale_base_api/fuchsia.h" textual header "__locale_dir/locale_base_api/ibm.h" textual header "__locale_dir/locale_base_api/musl.h" textual header "__locale_dir/locale_base_api/openbsd.h" From 3d65e9dfd0e7dff37ca46e12f980e06baa43a25c Mon Sep 17 00:00:00 2001 From: vfdev Date: Mon, 13 Jan 2025 15:19:23 +0100 Subject: [PATCH 064/102] Remove StandaloneExtensionPybind11.cpp FT update as does not work with python 3.8 and old pybind11 (#122697) Description: - Remove StandaloneExtensionPybind11.cpp FT update as does not work with python 3.8 and old pybind11 This should also fix the failing toy.test: https://github.com/llvm/llvm-project/pull/122684#issuecomment-2586802692 cc @jpienaar --- .../standalone/python/StandaloneExtensionPybind11.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mlir/examples/standalone/python/StandaloneExtensionPybind11.cpp 
b/mlir/examples/standalone/python/StandaloneExtensionPybind11.cpp
index dd3c4c2945cca..397db4c20e743 100644
--- a/mlir/examples/standalone/python/StandaloneExtensionPybind11.cpp
+++ b/mlir/examples/standalone/python/StandaloneExtensionPybind11.cpp
@@ -12,11 +12,9 @@
 #include "Standalone-c/Dialects.h"
 #include "mlir/Bindings/Python/PybindAdaptors.h"
 
-namespace py = pybind11;
-
 using namespace mlir::python::adaptors;
 
-PYBIND11_MODULE(_standaloneDialectsPybind11, m, py::mod_gil_not_used()) {
+PYBIND11_MODULE(_standaloneDialectsPybind11, m) {
   //===--------------------------------------------------------------------===//
   // standalone dialect
   //===--------------------------------------------------------------------===//
From b525d7bb2729687b21206f163eab348f6a8a5fe1 Mon Sep 17 00:00:00 2001
From: bernhardu
Date: Mon, 13 Jan 2025 15:21:03 +0100
Subject: [PATCH 065/102] [win/asan] GetInstructionSize: Support some more 2
 byte instructions. (#120235)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch adds several instructions seen when trying to run an
executable built with ASan using llvm-mingw (x86 and x86_64, using the
git tip of llvm-project).

Also includes instructions collected by Roman Pišl and Eric Pouech
in the Wine bug reports below.

```
Related: https://github.com/llvm/llvm-project/issues/96270
Co-authored-by: Roman Pišl
https://bugs.winehq.org/show_bug.cgi?id=50993
https://bugs.winehq.org/attachment.cgi?id=70233
Co-authored-by: Eric Pouech
https://bugs.winehq.org/show_bug.cgi?id=52386
https://bugs.winehq.org/attachment.cgi?id=71626
```

CC: @zmodem
---
 compiler-rt/lib/interception/interception_win.cpp            | 5 +++++
 compiler-rt/lib/interception/tests/interception_win_test.cpp | 5 +++++
 2 files changed, 10 insertions(+)

diff --git a/compiler-rt/lib/interception/interception_win.cpp b/compiler-rt/lib/interception/interception_win.cpp
index bd85c50a083a6..7a1a47a78dbc6 100644
--- a/compiler-rt/lib/interception/interception_win.cpp
+++ b/compiler-rt/lib/interception/interception_win.cpp
@@ -636,12 +636,17 @@ static size_t GetInstructionSize(uptr address, size_t* rel_offset = nullptr) {
     case 0xFF8B:  // 8B FF : mov edi, edi
     case 0xEC8B:  // 8B EC : mov ebp, esp
     case 0xc889:  // 89 C8 : mov eax, ecx
+    case 0xD189:  // 89 D1 : mov ecx, edx
    case 0xE589:  // 89 E5 : mov ebp, esp
     case 0xC18B:  // 8B C1 : mov eax, ecx
+    case 0xC031:  // 31 C0 : xor eax, eax
+    case 0xC931:  // 31 C9 : xor ecx, ecx
+    case 0xD231:  // 31 D2 : xor edx, edx
     case 0xC033:  // 33 C0 : xor eax, eax
     case 0xC933:  // 33 C9 : xor ecx, ecx
     case 0xD233:  // 33 D2 : xor edx, edx
     case 0xDB84:  // 84 DB : test bl,bl
+    case 0xC084:  // 84 C0 : test al,al
     case 0xC984:  // 84 C9 : test cl,cl
     case 0xD284:  // 84 D2 : test dl,dl
       return 2;
diff --git a/compiler-rt/lib/interception/tests/interception_win_test.cpp b/compiler-rt/lib/interception/tests/interception_win_test.cpp
index 3a2d8b271113d..e0258a3d0bd51 100644
--- a/compiler-rt/lib/interception/tests/interception_win_test.cpp
+++ b/compiler-rt/lib/interception/tests/interception_win_test.cpp
@@ -839,14 +839,19 @@ const struct InstructionSizeData {
     { 1, {0x90}, 0, "90 : nop"},
     { 1, {0xC3}, 0, "C3 : ret (for small/empty function interception"},
     { 1, {0xCC}, 0, "CC : int 3 i.e. 
registering weak functions)"}, + { 2, {0x31, 0xC0}, 0, "31 C0 : xor eax, eax"}, + { 2, {0x31, 0xC9}, 0, "31 C9 : xor ecx, ecx"}, + { 2, {0x31, 0xD2}, 0, "31 D2 : xor edx, edx"}, { 2, {0x33, 0xC0}, 0, "33 C0 : xor eax, eax"}, { 2, {0x33, 0xC9}, 0, "33 C9 : xor ecx, ecx"}, { 2, {0x33, 0xD2}, 0, "33 D2 : xor edx, edx"}, { 2, {0x6A, 0x71}, 0, "6A XX : push XX"}, + { 2, {0x84, 0xC0}, 0, "84 C0 : test al,al"}, { 2, {0x84, 0xC9}, 0, "84 C9 : test cl,cl"}, { 2, {0x84, 0xD2}, 0, "84 D2 : test dl,dl"}, { 2, {0x84, 0xDB}, 0, "84 DB : test bl,bl"}, { 2, {0x89, 0xc8}, 0, "89 C8 : mov eax, ecx"}, + { 2, {0x89, 0xD1}, 0, "89 D1 : mov ecx, edx"}, { 2, {0x89, 0xE5}, 0, "89 E5 : mov ebp, esp"}, { 2, {0x8A, 0x01}, 0, "8A 01 : mov al, byte ptr [ecx]"}, { 2, {0x8B, 0xC1}, 0, "8B C1 : mov eax, ecx"}, From 6cccd438138aaeb175e3aec8d7158694b8dfd4e9 Mon Sep 17 00:00:00 2001 From: Lukacma Date: Mon, 13 Jan 2025 15:05:47 +0000 Subject: [PATCH 066/102] [AArch64] Fix aarch64-fujitsu-monaka.c test (#122716) --- .../Driver/print-enabled-extensions/aarch64-fujitsu-monaka.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/clang/test/Driver/print-enabled-extensions/aarch64-fujitsu-monaka.c b/clang/test/Driver/print-enabled-extensions/aarch64-fujitsu-monaka.c index 3c74e3620df03..01a97a00de542 100644 --- a/clang/test/Driver/print-enabled-extensions/aarch64-fujitsu-monaka.c +++ b/clang/test/Driver/print-enabled-extensions/aarch64-fujitsu-monaka.c @@ -28,8 +28,6 @@ // CHECK-NEXT: FEAT_FP16 Enable half-precision floating-point data processing // CHECK-NEXT: FEAT_FP8 Enable FP8 instructions // CHECK-NEXT: FEAT_FP8DOT2 Enable FP8 2-way dot instructions -// CHECK-NEXT: FEAT_FP8DOT4 Enable FP8 4-way dot instructions -// CHECK-NEXT: FEAT_FP8FMA Enable Armv9.5-A FP8 multiply-add instructions // CHECK-NEXT: FEAT_FPAC Enable Armv8.3-A Pointer Authentication Faulting enhancement // CHECK-NEXT: FEAT_FRINTTS Enable FRInt[32|64][Z|X] instructions that round a floating-point number to an integer (in FP format) forcing it to fit into a 32- or 64-bit int // CHECK-NEXT: FEAT_FlagM Enable Armv8.4-A Flag Manipulation instructions From 0ffdf9e1afa9cb8222b6cb176028eb3e0790ba29 Mon Sep 17 00:00:00 2001 From: Victor Campos Date: Mon, 13 Jan 2025 15:11:40 +0000 Subject: [PATCH 067/102] Revert "[Multilib] Custom flags YAML parsing" (#122722) Reverts llvm/llvm-project#110657 It seems that this patch is causing the sanitizer bot to fail. Reverting while I investigate --- clang/include/clang/Driver/Multilib.h | 28 +--- clang/lib/Driver/Multilib.cpp | 73 ++-------- ...remetal-multilib-custom-flags-parsing.yaml | 133 ------------------ 3 files changed, 11 insertions(+), 223 deletions(-) delete mode 100644 clang/test/Driver/baremetal-multilib-custom-flags-parsing.yaml diff --git a/clang/include/clang/Driver/Multilib.h b/clang/include/clang/Driver/Multilib.h index 1dab45c062aee..dbed70f4f9008 100644 --- a/clang/include/clang/Driver/Multilib.h +++ b/clang/include/clang/Driver/Multilib.h @@ -101,25 +101,6 @@ class Multilib { raw_ostream &operator<<(raw_ostream &OS, const Multilib &M); -namespace custom_flag { -struct Declaration; -using DeclarationPtr = std::shared_ptr; - -struct ValueDetail { - std::string Name; - std::optional> MacroDefines; - DeclarationPtr Decl; -}; - -struct Declaration { - std::string Name; - SmallVector ValueList; - std::optional DefaultValueIdx; -}; - -static constexpr StringRef Prefix = "-fmultilib-flag="; -} // namespace custom_flag - /// See also MultilibSetBuilder for combining multilibs into a set. 
class MultilibSet { public: @@ -139,18 +120,15 @@ class MultilibSet { private: multilib_list Multilibs; - SmallVector FlagMatchers; - SmallVector CustomFlagDecls; + std::vector FlagMatchers; IncludeDirsFunc IncludeCallback; IncludeDirsFunc FilePathsCallback; public: MultilibSet() = default; MultilibSet(multilib_list &&Multilibs, - SmallVector &&FlagMatchers = {}, - SmallVector &&CustomFlagDecls = {}) - : Multilibs(std::move(Multilibs)), FlagMatchers(std::move(FlagMatchers)), - CustomFlagDecls(std::move(CustomFlagDecls)) {} + std::vector &&FlagMatchers = {}) + : Multilibs(Multilibs), FlagMatchers(FlagMatchers) {} const multilib_list &getMultilibs() { return Multilibs; } diff --git a/clang/lib/Driver/Multilib.cpp b/clang/lib/Driver/Multilib.cpp index b4b5dbd1bdb5e..0207e0f2eb2de 100644 --- a/clang/lib/Driver/Multilib.cpp +++ b/clang/lib/Driver/Multilib.cpp @@ -10,7 +10,6 @@ #include "clang/Basic/LLVM.h" #include "clang/Driver/Driver.h" #include "llvm/ADT/DenseSet.h" -#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" @@ -202,20 +201,13 @@ struct MultilibGroupSerialization { struct MultilibSetSerialization { llvm::VersionTuple MultilibVersion; - SmallVector Groups; - SmallVector Multilibs; - SmallVector FlagMatchers; - SmallVector CustomFlagDeclarations; + std::vector Groups; + std::vector Multilibs; + std::vector FlagMatchers; }; } // end anonymous namespace -LLVM_YAML_IS_SEQUENCE_VECTOR(MultilibSerialization) -LLVM_YAML_IS_SEQUENCE_VECTOR(MultilibGroupSerialization) -LLVM_YAML_IS_SEQUENCE_VECTOR(MultilibSet::FlagMatcher) -LLVM_YAML_IS_SEQUENCE_VECTOR(custom_flag::ValueDetail) -LLVM_YAML_IS_SEQUENCE_VECTOR(custom_flag::DeclarationPtr) - template <> struct llvm::yaml::MappingTraits { static void mapping(llvm::yaml::IO &io, MultilibSerialization &V) { io.mapOptional("Dir", V.Dir); @@ -263,63 +255,11 @@ template <> struct llvm::yaml::MappingTraits { } }; -template <> -struct llvm::yaml::MappingContextTraits> { - static void mapping(llvm::yaml::IO &io, custom_flag::ValueDetail &V, - llvm::SmallSet &) { - io.mapRequired("Name", V.Name); - io.mapOptional("MacroDefines", V.MacroDefines); - } - static std::string validate(IO &io, custom_flag::ValueDetail &V, - llvm::SmallSet &NameSet) { - if (V.Name.empty()) - return "custom flag value requires a name"; - if (!NameSet.insert(V.Name).second) - return "duplicate custom flag value name: \"" + V.Name + "\""; - return {}; - } -}; - -template <> -struct llvm::yaml::MappingContextTraits> { - static void mapping(llvm::yaml::IO &io, custom_flag::DeclarationPtr &V, - llvm::SmallSet &NameSet) { - assert(!V); - V = std::make_shared(); - io.mapRequired("Name", V->Name); - io.mapRequired("Values", V->ValueList, NameSet); - std::string DefaultValueName; - io.mapRequired("Default", DefaultValueName); - - for (auto [Idx, Value] : llvm::enumerate(V->ValueList)) { - Value.Decl = V; - if (Value.Name == DefaultValueName) { - assert(!V->DefaultValueIdx); - V->DefaultValueIdx = Idx; - } - } - } - static std::string validate(IO &io, custom_flag::DeclarationPtr &V, - llvm::SmallSet &) { - if (V->Name.empty()) - return "custom flag requires a name"; - if (V->ValueList.empty()) - return "custom flag must have at least one value"; - if (!V->DefaultValueIdx) - return "custom flag must have a default value"; - return {}; - } -}; - template <> struct llvm::yaml::MappingTraits { static void mapping(llvm::yaml::IO &io, MultilibSetSerialization &M) { io.mapRequired("MultilibVersion", 
M.MultilibVersion); io.mapRequired("Variants", M.Multilibs); io.mapOptional("Groups", M.Groups); - llvm::SmallSet NameSet; - io.mapOptionalWithContext("Flags", M.CustomFlagDeclarations, NameSet); io.mapOptional("Mappings", M.FlagMatchers); } static std::string validate(IO &io, MultilibSetSerialization &M) { @@ -348,6 +288,10 @@ template <> struct llvm::yaml::MappingTraits { } }; +LLVM_YAML_IS_SEQUENCE_VECTOR(MultilibSerialization) +LLVM_YAML_IS_SEQUENCE_VECTOR(MultilibGroupSerialization) +LLVM_YAML_IS_SEQUENCE_VECTOR(MultilibSet::FlagMatcher) + llvm::ErrorOr MultilibSet::parseYaml(llvm::MemoryBufferRef Input, llvm::SourceMgr::DiagHandlerTy DiagHandler, @@ -375,8 +319,7 @@ MultilibSet::parseYaml(llvm::MemoryBufferRef Input, } } - return MultilibSet(std::move(Multilibs), std::move(MS.FlagMatchers), - std::move(MS.CustomFlagDeclarations)); + return MultilibSet(std::move(Multilibs), std::move(MS.FlagMatchers)); } LLVM_DUMP_METHOD void MultilibSet::dump() const { diff --git a/clang/test/Driver/baremetal-multilib-custom-flags-parsing.yaml b/clang/test/Driver/baremetal-multilib-custom-flags-parsing.yaml deleted file mode 100644 index fe6a9a8d7f1ee..0000000000000 --- a/clang/test/Driver/baremetal-multilib-custom-flags-parsing.yaml +++ /dev/null @@ -1,133 +0,0 @@ -# RUN: split-file %s %t - -# RUN: %clang --target=arm-none-eabi --multi-lib-config=%t/multilib-without-macro-defines.yaml %s -### -o /dev/null 2>&1 \ -# RUN: | FileCheck %s -# RUN: %clang --target=arm-none-eabi --multi-lib-config=%t/multilib-with-macro-defines.yaml %s -### -o /dev/null 2>&1 \ -# RUN: | FileCheck %s -# CHECK-NOT: error: - -# RUN: %clang --target=arm-none-eabi --multi-lib-config=%t/missing-flag-name.yaml %s -### -o /dev/null 2>&1 \ -# RUN: | FileCheck %s --check-prefix=CHECK-MISSING-FLAG-NAME -# CHECK-MISSING-FLAG-NAME: error: custom flag requires a name - -# RUN: %clang --target=arm-none-eabi --multi-lib-config=%t/missing-flag-values.yaml %s -### -o /dev/null 2>&1 \ -# RUN: | FileCheck %s --check-prefix=CHECK-MISSING-FLAG-VALUES -# CHECK-MISSING-FLAG-VALUES: error: custom flag must have at least one value - -# RUN: %clang --target=arm-none-eabi --multi-lib-config=%t/missing-flag-value-default.yaml %s -### -o /dev/null 2>&1 \ -# RUN: | FileCheck %s --check-prefix=CHECK-MISSING-FLAG-VALUE-DEFAULT -# CHECK-MISSING-FLAG-VALUE-DEFAULT: error: custom flag must have a default value - -# RUN: %clang --target=arm-none-eabi --multi-lib-config=%t/missing-flag-value-name.yaml %s -### -o /dev/null 2>&1 \ -# RUN: | FileCheck %s --check-prefix=CHECK-MISSING-FLAG-VALUE-NAME -# CHECK-MISSING-FLAG-VALUE-NAME: error: custom flag value requires a name - -# RUN: %clang --target=arm-none-eabi --multi-lib-config=%t/duplicate-flag-value-name.yaml %s -### -o /dev/null 2>&1 \ -# RUN: | FileCheck %s --check-prefix=CHECK-DUPLICATE-FLAG-VALUE-NAME -# CHECK-DUPLICATE-FLAG-VALUE-NAME: error: duplicate custom flag value name: "value-name" -# CHECK-DUPLICATE-FLAG-VALUE-NAME-NEXT: - Name: value-name - -#--- multilib-without-macro-defines.yaml ---- -MultilibVersion: 1.0 - -Variants: -- Dir: libc - Flags: [-fmultilib-flag=a] - -Flags: - - Name: flag - Values: - - Name: a - - Name: b - Default: a - -#--- multilib-with-macro-defines.yaml ---- -MultilibVersion: 1.0 - -Variants: -- Dir: libc - Flags: [-fmultilib-flag=a] - -Flags: - - Name: flag - Values: - - Name: a - MacroDefines: [FEATURE_A] - - Name: b - MacroDefines: [FEATURE_B] - Default: a - -#--- missing-flag-name.yaml ---- -MultilibVersion: 1.0 - -Variants: -- Dir: libc - Flags: [-fmultilib-flag=a] - 
-Flags:
-  - Values:
-    - Name: a
-    Default: a
-
-#--- missing-flag-values.yaml
----
-MultilibVersion: 1.0
-
-Variants:
-- Dir: libc
-  Flags: [-fmultilib-flag=a]
-
-Flags:
-  - Name: flag
-    Values:
-    Default: a
-
-#--- missing-flag-value-default.yaml
----
-MultilibVersion: 1.0
-
-Variants:
-- Dir: libc
-  Flags: [-fmultilib-flag=a]
-
-Flags:
-  - Name: flag
-    Values:
-    - Name: a
-    Default:
-
-#--- missing-flag-value-name.yaml
----
-MultilibVersion: 1.0
-
-Variants:
-- Dir: libc
-  Flags: [-fmultilib-flag=a]
-
-Flags:
-  - Name: flag
-    Values:
-    - Name:
-    Default: a
-
-#--- duplicate-flag-value-name.yaml
----
-MultilibVersion: 1.0
-
-Variants:
-- Dir: libc
-  Flags: [-fmultilib-flag=value-name]
-
-Flags:
-  - Name: a
-    Values:
-    - Name: value-name
-    - Name: value-a
-    Default: value-name
-  - Name: b
-    Values:
-    - Name: value-name
-    Default: value-name
From 5e2d3097f5a6ca67708b73bb3f767a0a9f14395d Mon Sep 17 00:00:00 2001
From: Brotcrunsher
Date: Mon, 13 Jan 2025 16:16:23 +0100
Subject: [PATCH 068/102] [libcxx] Don't hold the lock when calling notify_* on
 gates in std::shared_mutex (#107876)

Holding the associated lock while calling notify_* on a
condition_variable is generally considered a pessimization, as the
notified thread might "instantly" wake up, notice that it can't acquire
the lock, and then go back to sleep.
---
 libcxx/src/shared_mutex.cpp | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/libcxx/src/shared_mutex.cpp b/libcxx/src/shared_mutex.cpp
index 1a346dda027f8..6180833736956 100644
--- a/libcxx/src/shared_mutex.cpp
+++ b/libcxx/src/shared_mutex.cpp
@@ -38,8 +38,10 @@ bool __shared_mutex_base::try_lock() {
 }
 
 void __shared_mutex_base::unlock() {
-  lock_guard<mutex> _(__mut_);
-  __state_ = 0;
+  {
+    lock_guard<mutex> _(__mut_);
+    __state_ = 0;
+  }
   __gate1_.notify_all();
 }
 
@@ -67,16 +69,20 @@ bool __shared_mutex_base::try_lock_shared() {
 }
 
 void __shared_mutex_base::unlock_shared() {
-  lock_guard<mutex> _(__mut_);
+  unique_lock<mutex> lk(__mut_);
   unsigned num_readers = (__state_ & __n_readers_) - 1;
   __state_ &= ~__n_readers_;
   __state_ |= num_readers;
   if (__state_ & __write_entered_) {
-    if (num_readers == 0)
+    if (num_readers == 0) {
+      lk.unlock();
       __gate2_.notify_one();
+    }
   } else {
-    if (num_readers == __n_readers_ - 1)
+    if (num_readers == __n_readers_ - 1) {
+      lk.unlock();
       __gate1_.notify_one();
+    }
  }
 }
From f7a1264f0c569fde170d321b30715edd96cc0ff4 Mon Sep 17 00:00:00 2001
From: Steven Perron
Date: Mon, 13 Jan 2025 10:23:15 -0500
Subject: [PATCH 069/102] [SPIRV] Return success when selecting reads and
 writes. (#122162)

The functions `selectImageWriteIntrinsic` and `selectReadImageIntrinsic`
are void functions. They should return true if they succeed, and false
otherwise. This commit updates the code to do this.
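
As a minimal, hypothetical sketch of the pattern (the helper name
`selectExample` is a placeholder, not the actual selector code), each
helper now propagates the boolean result of `constrainAllUses` instead
of discarding it in a `void` function:

```cpp
// Hypothetical sketch of the pattern this commit applies: the helper
// reports whether instruction selection succeeded by returning the
// result of constrainAllUses rather than having a void return type.
bool selectExample(Register ResVReg, const SPIRVType *ResType,
                   MachineInstr &I) const {
  return BuildMI(*I.getParent(), I, I.getDebugLoc(),
                 TII.get(SPIRV::OpImageRead))
      .addDef(ResVReg)
      .addUse(GR.getSPIRVTypeID(ResType))
      .constrainAllUses(TII, TRI, RBI); // false when constraining fails
}
```

Callers such as `selectIntrinsic` can then return this result directly,
so a failure is reported instead of being silently ignored.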
--- .../Target/SPIRV/SPIRVInstructionSelector.cpp | 72 +++++++++++-------- 1 file changed, 41 insertions(+), 31 deletions(-) diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index 28c9b81db51f5..b7b32dd0d626c 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -274,10 +274,10 @@ class SPIRVInstructionSelector : public InstructionSelector { bool selectHandleFromBinding(Register &ResVReg, const SPIRVType *ResType, MachineInstr &I) const; - void selectReadImageIntrinsic(Register &ResVReg, const SPIRVType *ResType, + bool selectReadImageIntrinsic(Register &ResVReg, const SPIRVType *ResType, MachineInstr &I) const; - void selectImageWriteIntrinsic(MachineInstr &I) const; + bool selectImageWriteIntrinsic(MachineInstr &I) const; // Utilities std::pair @@ -305,7 +305,7 @@ class SPIRVInstructionSelector : public InstructionSelector { Register IndexReg, bool IsNonUniform, MachineIRBuilder MIRBuilder) const; SPIRVType *widenTypeToVec4(const SPIRVType *Type, MachineInstr &I) const; - void extractSubvector(Register &ResVReg, const SPIRVType *ResType, + bool extractSubvector(Register &ResVReg, const SPIRVType *ResType, Register &ReadReg, MachineInstr &InsertionPoint) const; bool BuildCOPY(Register DestReg, Register SrcReg, MachineInstr &I) const; bool loadVec3BuiltinInputID(SPIRV::BuiltIn::BuiltIn BuiltInValue, @@ -3002,12 +3002,10 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg, return selectHandleFromBinding(ResVReg, ResType, I); } case Intrinsic::spv_resource_store_typedbuffer: { - selectImageWriteIntrinsic(I); - return true; + return selectImageWriteIntrinsic(I); } case Intrinsic::spv_resource_load_typedbuffer: { - selectReadImageIntrinsic(ResVReg, ResType, I); - return true; + return selectReadImageIntrinsic(ResVReg, ResType, I); } case Intrinsic::spv_discard: { return selectDiscard(ResVReg, ResType, I); @@ -3049,7 +3047,7 @@ bool SPIRVInstructionSelector::selectHandleFromBinding(Register &ResVReg, .constrainAllUses(TII, TRI, RBI); } -void SPIRVInstructionSelector::selectReadImageIntrinsic( +bool SPIRVInstructionSelector::selectReadImageIntrinsic( Register &ResVReg, const SPIRVType *ResType, MachineInstr &I) const { // If the load of the image is in a different basic block, then @@ -3064,35 +3062,40 @@ void SPIRVInstructionSelector::selectReadImageIntrinsic( uint64_t ResultSize = GR.getScalarOrVectorComponentCount(ResType); if (ResultSize == 4) { - BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(SPIRV::OpImageRead)) + return BuildMI(*I.getParent(), I, I.getDebugLoc(), + TII.get(SPIRV::OpImageRead)) .addDef(ResVReg) .addUse(GR.getSPIRVTypeID(ResType)) .addUse(ImageReg) - .addUse(I.getOperand(3).getReg()); - return; + .addUse(I.getOperand(3).getReg()) + .constrainAllUses(TII, TRI, RBI); } SPIRVType *ReadType = widenTypeToVec4(ResType, I); Register ReadReg = MRI->createVirtualRegister(GR.getRegClass(ReadType)); - BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(SPIRV::OpImageRead)) - .addDef(ReadReg) - .addUse(GR.getSPIRVTypeID(ReadType)) - .addUse(ImageReg) - .addUse(I.getOperand(3).getReg()); + bool Succeed = + BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(SPIRV::OpImageRead)) + .addDef(ReadReg) + .addUse(GR.getSPIRVTypeID(ReadType)) + .addUse(ImageReg) + .addUse(I.getOperand(3).getReg()) + .constrainAllUses(TII, TRI, RBI); + if (!Succeed) + return false; if (ResultSize == 1) { - BuildMI(*I.getParent(), I, I.getDebugLoc(), - 
TII.get(SPIRV::OpCompositeExtract))
+    return BuildMI(*I.getParent(), I, I.getDebugLoc(),
+                   TII.get(SPIRV::OpCompositeExtract))
         .addDef(ResVReg)
         .addUse(GR.getSPIRVTypeID(ResType))
         .addUse(ReadReg)
-        .addImm(0);
-    return;
+        .addImm(0)
+        .constrainAllUses(TII, TRI, RBI);
   }
 
-  extractSubvector(ResVReg, ResType, ReadReg, I);
+  return extractSubvector(ResVReg, ResType, ReadReg, I);
 }
 
-void SPIRVInstructionSelector::extractSubvector(
+bool SPIRVInstructionSelector::extractSubvector(
     Register &ResVReg, const SPIRVType *ResType, Register &ReadReg,
     MachineInstr &InsertionPoint) const {
   SPIRVType *InputType = GR.getResultType(ReadReg);
@@ -3108,12 +3111,16 @@ void SPIRVInstructionSelector::extractSubvector(
   const TargetRegisterClass *ScalarRegClass = GR.getRegClass(ScalarType);
   for (uint64_t I = 0; I < ResultSize; I++) {
     Register ComponentReg = MRI->createVirtualRegister(ScalarRegClass);
-    BuildMI(*InsertionPoint.getParent(), InsertionPoint,
-            InsertionPoint.getDebugLoc(), TII.get(SPIRV::OpCompositeExtract))
-        .addDef(ComponentReg)
-        .addUse(ScalarType->getOperand(0).getReg())
-        .addUse(ReadReg)
-        .addImm(I);
+    bool Succeed = BuildMI(*InsertionPoint.getParent(), InsertionPoint,
+                           InsertionPoint.getDebugLoc(),
+                           TII.get(SPIRV::OpCompositeExtract))
+                       .addDef(ComponentReg)
+                       .addUse(ScalarType->getOperand(0).getReg())
+                       .addUse(ReadReg)
+                       .addImm(I)
+                       .constrainAllUses(TII, TRI, RBI);
+    if (!Succeed)
+      return false;
     ComponentRegisters.emplace_back(ComponentReg);
   }
 
@@ -3125,9 +3132,10 @@ void SPIRVInstructionSelector::extractSubvector(
 
   for (Register ComponentReg : ComponentRegisters)
     MIB.addUse(ComponentReg);
+  return MIB.constrainAllUses(TII, TRI, RBI);
 }
 
-void SPIRVInstructionSelector::selectImageWriteIntrinsic(
+bool SPIRVInstructionSelector::selectImageWriteIntrinsic(
     MachineInstr &I) const {
   // If the load of the image is in a different basic block, then
   // this will generate invalid code. A proper solution is to move
@@ -3142,10 +3150,12 @@ void SPIRVInstructionSelector::selectImageWriteIntrinsic(
   Register DataReg = I.getOperand(3).getReg();
   assert(GR.getResultType(DataReg)->getOpcode() == SPIRV::OpTypeVector);
   assert(GR.getScalarOrVectorComponentCount(GR.getResultType(DataReg)) == 4);
-  BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(SPIRV::OpImageWrite))
+  return BuildMI(*I.getParent(), I, I.getDebugLoc(),
+                 TII.get(SPIRV::OpImageWrite))
       .addUse(ImageReg)
       .addUse(CoordinateReg)
-      .addUse(DataReg);
+      .addUse(DataReg)
+      .constrainAllUses(TII, TRI, RBI);
 }
 
 Register SPIRVInstructionSelector::buildPointerToResource(
From 9f76de2e263241d6220be581449680deebae71c5 Mon Sep 17 00:00:00 2001
From: Alexey Bataev
Date: Mon, 13 Jan 2025 06:48:56 -0800
Subject: [PATCH 070/102] [SLP]Do not include subvectors for fully matched
 buildvectors

If the buildvector node fully matches another node, we need to exclude
subvectors when building the final shuffle; just a shuffle of the
original node must be emitted.
Fixes #122584 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 4 +- .../vectorizable-selects-uniform-cmps.ll | 4 +- .../X86/full-matched-bv-with-subvectors.ll | 99 +++++++++++++++++++ 3 files changed, 102 insertions(+), 5 deletions(-) create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/full-matched-bv-with-subvectors.ll diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index e3487b5015342..df46c69ff3ab4 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -14935,8 +14935,8 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy, } } ShuffleBuilder.add(*FrontTE, Mask); - Res = ShuffleBuilder.finalize(E->getCommonMask(), SubVectors, - SubVectorsMask); + // Full matched entry found, no need to insert subvectors. + Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {}); return Res; } if (!Resized) { diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll index f9e415a3cefc1..27f3155b50dbb 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll @@ -259,11 +259,9 @@ define void @select_uniform_ugt_16xi8(ptr %ptr, i8 %x) { ; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> [[TMP6]], <8 x i8> [[TMP0]], i64 0) ; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP7]], <4 x i8> [[TMP3]], i64 12) ; CHECK-NEXT: [[TMP9:%.*]] = icmp ugt <16 x i8> [[TMP8]], splat (i8 -1) -; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> [[TMP8]], <8 x i8> [[TMP0]], i64 0) -; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP10]], <4 x i8> [[TMP3]], i64 12) ; CHECK-NEXT: [[TMP12:%.*]] = insertelement <16 x i8> poison, i8 [[X]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <16 x i8> [[TMP12]], <16 x i8> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP9]], <16 x i8> [[TMP11]], <16 x i8> [[TMP13]] +; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP9]], <16 x i8> [[TMP8]], <16 x i8> [[TMP13]] ; CHECK-NEXT: store <16 x i8> [[TMP14]], ptr [[PTR]], align 2 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/full-matched-bv-with-subvectors.ll b/llvm/test/Transforms/SLPVectorizer/X86/full-matched-bv-with-subvectors.ll new file mode 100644 index 0000000000000..7576eb7a8f55e --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/full-matched-bv-with-subvectors.ll @@ -0,0 +1,99 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S --passes=slp-vectorizer -slp-threshold=-9999 -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s + +define i32 @test(i64 %l.549) { +; CHECK-LABEL: define i32 @test( +; CHECK-SAME: i64 [[L_549:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[CONV3:%.*]] = sext i32 0 to i64 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i64> poison, i64 [[CONV3]], i32 3 +; CHECK-NEXT: br label %[[IF_THEN19:.*]] +; CHECK: [[P:.*]]: +; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i64> [ zeroinitializer, %[[IF_END29:.*]] ], [ [[TMP13:%.*]], %[[IF_END25:.*]] ] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> +; CHECK-NEXT: 
br i1 false, label %[[S:.*]], label %[[Q:.*]] +; CHECK: [[Q]]: +; CHECK-NEXT: [[XOR39:%.*]] = phi i64 [ 0, %[[P]] ], [ 0, %[[LAND_LHS_TRUE:.*]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi <2 x i64> [ zeroinitializer, %[[P]] ], [ zeroinitializer, %[[LAND_LHS_TRUE]] ] +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i64> [[TMP0]], i64 [[XOR39]], i32 2 +; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> [[TMP4]], <2 x i64> [[TMP3]], i64 0) +; CHECK-NEXT: br i1 false, label %[[LOR_LHS_FALSE:.*]], label %[[R:.*]] +; CHECK: [[LOR_LHS_FALSE]]: +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <4 x i32> +; CHECK-NEXT: br i1 false, label %[[LAND_LHS_TRUE]], label %[[S]] +; CHECK: [[R]]: +; CHECK-NEXT: [[TMP7:%.*]] = phi <4 x i64> [ [[TMP5]], %[[Q]] ], [ [[TMP16:%.*]], %[[IF_THEN19]] ] +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> poison, <4 x i32> +; CHECK-NEXT: br i1 false, label %[[S]], label %[[LAND_LHS_TRUE]] +; CHECK: [[LAND_LHS_TRUE]]: +; CHECK-NEXT: [[TMP9:%.*]] = phi <4 x i64> [ [[TMP8]], %[[R]] ], [ zeroinitializer, %[[LOR_LHS_FALSE]] ] +; CHECK-NEXT: br i1 false, label %[[Q]], label %[[S]] +; CHECK: [[S]]: +; CHECK-NEXT: [[TMP10:%.*]] = phi <4 x i64> [ [[TMP9]], %[[LAND_LHS_TRUE]] ], [ [[TMP8]], %[[R]] ], [ [[TMP6]], %[[LOR_LHS_FALSE]] ], [ [[TMP2]], %[[P]] ] +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP10]], <4 x i64> poison, <2 x i32> +; CHECK-NEXT: br label %[[IF_THEN19]] +; CHECK: [[IF_THEN19]]: +; CHECK-NEXT: [[TMP12:%.*]] = phi <2 x i64> [ zeroinitializer, %[[ENTRY]] ], [ [[TMP11]], %[[S]] ] +; CHECK-NEXT: [[TMP13]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> poison, <2 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> poison, <4 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x i64> [[TMP14]], i64 [[L_549]], i32 1 +; CHECK-NEXT: [[TMP16]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> [[TMP15]], <2 x i64> zeroinitializer, i64 2) +; CHECK-NEXT: br i1 false, label %[[R]], label %[[IF_END25]] +; CHECK: [[IF_END25]]: +; CHECK-NEXT: br i1 false, label %[[IF_END29]], label %[[P]] +; CHECK: [[IF_END29]]: +; CHECK-NEXT: br label %[[P]] +; +entry: + %conv3 = sext i32 0 to i64 + br label %if.then19 + +p: + %l.0 = phi i64 [ %xor, %if.end29 ], [ %l.5493, %if.end25 ] + %m.0 = phi i64 [ %not21, %if.end29 ], [ %m.550, %if.end25 ] + br i1 false, label %s, label %q + +q: + %xor39 = phi i64 [ 0, %p ], [ 0, %land.lhs.true ] + %l.1 = phi i64 [ 0, %p ], [ 0, %land.lhs.true ] + %m.1 = phi i64 [ 0, %p ], [ 0, %land.lhs.true ] + br i1 false, label %lor.lhs.false, label %r + +lor.lhs.false: + br i1 false, label %land.lhs.true, label %s + +r: + %xor38 = phi i64 [ %xor39, %q ], [ %xor, %if.then19 ] + %j.0 = phi i64 [ %conv3, %q ], [ %not21, %if.then19 ] + %l.2 = phi i64 [ %l.1, %q ], [ %l.549, %if.then19 ] + %m.2 = phi i64 [ %m.1, %q ], [ %m.550, %if.then19 ] + br i1 false, label %s, label %land.lhs.true + +land.lhs.true: + %xor37 = phi i64 [ %xor38, %r ], [ 0, %lor.lhs.false ] + %j.1 = phi i64 [ %j.0, %r ], [ 0, %lor.lhs.false ] + %l.3 = phi i64 [ %l.2, %r ], [ 0, %lor.lhs.false ] + %m.3 = phi i64 [ %m.2, %r ], [ 0, %lor.lhs.false ] + br i1 false, label %q, label %s + +s: + %xor36 = phi i64 [ %xor37, %land.lhs.true ], [ %xor38, %r ], [ %xor39, %lor.lhs.false ], [ %l.0, %p ] + %j.2 = phi i64 [ %j.1, %land.lhs.true ], [ %j.0, %r ], [ %conv3, %lor.lhs.false ], [ %m.0, %p ] + %l.4 = phi i64 [ %l.3, %land.lhs.true ], [ %l.2, %r ], [ %l.1, %lor.lhs.false ], [ 
%l.0, %p ] + %m.4 = phi i64 [ %m.3, %land.lhs.true ], [ %m.2, %r ], [ %m.1, %lor.lhs.false ], [ %m.0, %p ] + br label %if.then19 + +if.then19: + %m.550 = phi i64 [ 0, %entry ], [ %m.4, %s ] + %l.5493 = phi i64 [ 0, %entry ], [ %l.4, %s ] + %xor = xor i64 0, 0 + %not21 = xor i64 0, 0 + br i1 false, label %r, label %if.end25 + +if.end25: + br i1 false, label %if.end29, label %p + +if.end29: + br label %p +} + From e70f943d576aa5425ec38b7ec9ab802af1e732f5 Mon Sep 17 00:00:00 2001 From: gbMattN <146744444+gbMattN@users.noreply.github.com> Date: Mon, 13 Jan 2025 15:28:37 +0000 Subject: [PATCH 071/102] [TySan] Fix struct access with different bases (#120412) Original pull request [here](https://github.com/llvm/llvm-project/pull/108385) Fixes issue https://github.com/llvm/llvm-project/issues/105960 If a member in a struct is also a struct, accessing a member partway through this inner struct currently causes a false positive. This is because when checking aliasing, the access offset is seen as greater than the starting offset of the inner struct, so the loop continues one iteration, and believes we are accessing the member after the inner struct. The next member's offset is greater than the offset we are looking for, so when we subtract the next member's offset from what we are looking for, the offset underflows. To fix this, we check if the member we think we are accessing has a greater offset than the offset we are looking for. If so, we take a step back. We cannot do this in the loop, since the loop does not check the final member. This means the penultimate member would still cause false positives. --- compiler-rt/lib/tysan/tysan.cpp | 11 +++++ .../tysan/struct-offset-different-base.cpp | 49 +++++++++++++++++++ 2 files changed, 60 insertions(+) create mode 100644 compiler-rt/test/tysan/struct-offset-different-base.cpp diff --git a/compiler-rt/lib/tysan/tysan.cpp b/compiler-rt/lib/tysan/tysan.cpp index 9c87b4782671a..f0230df9260e3 100644 --- a/compiler-rt/lib/tysan/tysan.cpp +++ b/compiler-rt/lib/tysan/tysan.cpp @@ -131,6 +131,17 @@ static bool isAliasingLegalUp(tysan_type_descriptor *TDA, break; } + // This offset can't be negative. Therefore we must be accessing something + // before the current type (not legal) or partially inside the last type. + // In the latter case, we adjust Idx. + if (TDA->Struct.Members[Idx].Offset > OffsetA) { + // Trying to access something before the current type. 
+    if (!Idx)
+      return false;
+
+    Idx -= 1;
+  }
+
   OffsetA -= TDA->Struct.Members[Idx].Offset;
   TDA = TDA->Struct.Members[Idx].Type;
 } else {
diff --git a/compiler-rt/test/tysan/struct-offset-different-base.cpp b/compiler-rt/test/tysan/struct-offset-different-base.cpp
new file mode 100644
index 0000000000000..862595de8dc81
--- /dev/null
+++ b/compiler-rt/test/tysan/struct-offset-different-base.cpp
@@ -0,0 +1,49 @@
+// RUN: %clangxx_tysan -O0 %s -o %t && %run %t >%t.out 2>&1
+// RUN: FileCheck %s --implicit-check-not ERROR < %t.out
+
+// Modified reproducer from https://github.com/llvm/llvm-project/issues/105960
+
+#include <stdio.h>
+
+struct inner1 {
+  char buffer;
+  int i;
+};
+
+struct inner2 {
+  char buffer;
+  int i;
+  float endBuffer;
+};
+
+void init_inner1(inner1 *iPtr) { iPtr->i = 200; }
+void init_inner2(inner2 *iPtr) {
+  iPtr->i = 400;
+  iPtr->endBuffer = 413.0f;
+}
+
+struct outer {
+  inner1 foo;
+  inner2 bar;
+  char buffer;
+};
+
+int main(void) {
+  outer *l = new outer();
+
+  init_inner1(&l->foo);
+  init_inner2(&l->bar);
+
+  int access = l->foo.i;
+  printf("Accessed value 1 is %d\n", access);
+  access = l->bar.i;
+  printf("Accessed value 2 is %d\n", access);
+  float fAccess = l->bar.endBuffer;
+  printf("Accessed value 3 is %f\n", fAccess);
+
+  return 0;
+}
+
+// CHECK: Accessed value 1 is 200
+// CHECK: Accessed value 2 is 400
+// CHECK: Accessed value 3 is 413.0
From 1157ed9e9e108a710bf300a4135494b71f923284 Mon Sep 17 00:00:00 2001
From: goldsteinn <35538541+goldsteinn@users.noreply.github.com>
Date: Mon, 13 Jan 2025 09:38:09 -0600
Subject: [PATCH 072/102] [InstCombine] Fold `(ct{t,l}z Pow2)` -> `Log2(Pow2)`
 (#122620)

- **[InstCombine] Add tests for folding `(ct{t,l}z Pow2)`; NFC**
- **[InstCombine] Fold `(ct{t,l}z Pow2)` -> `Log2(Pow2)`**

Do this so we can find `Log2(Pow2)` for "free" with `takeLog2`

https://alive2.llvm.org/ce/z/CL77fo
---
 .../InstCombine/InstCombineCalls.cpp          | 13 +++
 llvm/test/Transforms/InstCombine/cttz.ll      | 93 +++++++++++++++++++
 2 files changed, 106 insertions(+)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 7454382412369..dd5a4ba5a4724 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -588,6 +588,19 @@ static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombinerImpl &IC) {
     }
   }
 
+  // cttz(Pow2) -> Log2(Pow2)
+  // ctlz(Pow2) -> BitWidth - 1 - Log2(Pow2)
+  if (auto *R = IC.tryGetLog2(Op0, match(Op1, m_One()))) {
+    if (IsTZ)
+      return IC.replaceInstUsesWith(II, R);
+    BinaryOperator *BO = BinaryOperator::CreateSub(
+        ConstantInt::get(R->getType(), R->getType()->getScalarSizeInBits() - 1),
+        R);
+    BO->setHasNoSignedWrap();
+    BO->setHasNoUnsignedWrap();
+    return BO;
+  }
+
   KnownBits Known = IC.computeKnownBits(Op0, 0, &II);
 
   // Create a mask for bits above (ctlz) or below (cttz) the first known one.
diff --git a/llvm/test/Transforms/InstCombine/cttz.ll b/llvm/test/Transforms/InstCombine/cttz.ll index cb0bc59ae7995..829213b24e93e 100644 --- a/llvm/test/Transforms/InstCombine/cttz.ll +++ b/llvm/test/Transforms/InstCombine/cttz.ll @@ -297,3 +297,96 @@ define i16 @cttz_assume(i16 %x) { %cttz = call i16 @llvm.cttz.i16(i16 %x, i1 false) ret i16 %cttz } + + +declare void @use.i8(i8) +define i8 @fold_ctz_log2(i8 %x) { +; CHECK-LABEL: @fold_ctz_log2( +; CHECK-NEXT: [[R:%.*]] = call i8 @llvm.umin.i8(i8 [[X:%.*]], i8 5) +; CHECK-NEXT: ret i8 [[R]] +; + %p2 = shl i8 1, %x + %v = call i8 @llvm.umin(i8 %p2, i8 32) + %r = call i8 @llvm.cttz(i8 %v, i1 false) + ret i8 %r +} + +define i9 @fold_ctz_log2_i9_okay(i9 %x) { +; CHECK-LABEL: @fold_ctz_log2_i9_okay( +; CHECK-NEXT: [[R:%.*]] = call i9 @llvm.umin.i9(i9 [[X:%.*]], i9 5) +; CHECK-NEXT: ret i9 [[R]] +; + %p2 = shl i9 1, %x + %v = call i9 @llvm.umin(i9 %p2, i9 32) + %r = call i9 @llvm.cttz(i9 %v, i1 false) + ret i9 %r +} + +define i8 @fold_ctz_log2_maybe_z(i8 %x, i8 %y, i1 %c) { +; CHECK-LABEL: @fold_ctz_log2_maybe_z( +; CHECK-NEXT: [[V:%.*]] = shl i8 2, [[V_V:%.*]] +; CHECK-NEXT: [[P2_2:%.*]] = shl i8 4, [[Y:%.*]] +; CHECK-NEXT: [[V1:%.*]] = select i1 [[C:%.*]], i8 [[V]], i8 [[P2_2]] +; CHECK-NEXT: [[R:%.*]] = call range(i8 1, 9) i8 @llvm.cttz.i8(i8 [[V1]], i1 false) +; CHECK-NEXT: ret i8 [[R]] +; + %p2 = shl i8 2, %x + %p2_2 = shl i8 4, %y + %v = select i1 %c, i8 %p2, i8 %p2_2 + %r = call i8 @llvm.cttz(i8 %v, i1 false) + ret i8 %r +} + +define i8 @fold_ctz_log2_maybe_z_okay(i8 %x, i8 %y, i1 %c) { +; CHECK-LABEL: @fold_ctz_log2_maybe_z_okay( +; CHECK-NEXT: [[X:%.*]] = add i8 [[X1:%.*]], 1 +; CHECK-NEXT: [[Y:%.*]] = add i8 [[Y1:%.*]], 2 +; CHECK-NEXT: [[V_V:%.*]] = select i1 [[C:%.*]], i8 [[X]], i8 [[Y]] +; CHECK-NEXT: ret i8 [[V_V]] +; + %p2 = shl i8 2, %x + %p2_2 = shl i8 4, %y + %v = select i1 %c, i8 %p2, i8 %p2_2 + %r = call i8 @llvm.cttz(i8 %v, i1 true) + ret i8 %r +} + +define i8 @fold_clz_log2(i8 %x) { +; CHECK-LABEL: @fold_clz_log2( +; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.umin.i8(i8 [[X:%.*]], i8 5) +; CHECK-NEXT: [[R:%.*]] = xor i8 [[TMP1]], 7 +; CHECK-NEXT: ret i8 [[R]] +; + %p2 = shl i8 1, %x + %v = call i8 @llvm.umin(i8 %p2, i8 32) + %r = call i8 @llvm.ctlz(i8 %v, i1 false) + ret i8 %r +} + +define i8 @fold_clz_log2_multiuse_fail(i8 %x) { +; CHECK-LABEL: @fold_clz_log2_multiuse_fail( +; CHECK-NEXT: [[P2:%.*]] = shl nuw i8 2, [[X:%.*]] +; CHECK-NEXT: [[V:%.*]] = call i8 @llvm.umin.i8(i8 [[P2]], i8 32) +; CHECK-NEXT: call void @use.i8(i8 [[V]]) +; CHECK-NEXT: [[R:%.*]] = call range(i8 2, 9) i8 @llvm.ctlz.i8(i8 [[V]], i1 true) +; CHECK-NEXT: ret i8 [[R]] +; + %p2 = shl nuw i8 2, %x + %v = call i8 @llvm.umin(i8 %p2, i8 32) + call void @use.i8(i8 %v) + %r = call i8 @llvm.ctlz(i8 %v, i1 true) + ret i8 %r +} + + +define i9 @fold_clz_log2_i9(i9 %x) { +; CHECK-LABEL: @fold_clz_log2_i9( +; CHECK-NEXT: [[TMP1:%.*]] = call i9 @llvm.umin.i9(i9 [[X:%.*]], i9 5) +; CHECK-NEXT: [[R:%.*]] = sub nuw nsw i9 8, [[TMP1]] +; CHECK-NEXT: ret i9 [[R]] +; + %p2 = shl i9 1, %x + %v = call i9 @llvm.umin(i9 %p2, i9 32) + %r = call i9 @llvm.ctlz(i9 %v, i1 true) + ret i9 %r +} From 98ad3e2e4c5aeadd33514b6f953c091d5251f03f Mon Sep 17 00:00:00 2001 From: Peter Hawkins Date: Mon, 13 Jan 2025 10:49:25 -0500 Subject: [PATCH 073/102] [mlir python] Add locking around PyMlirContext::liveOperations. 
(#122720) In JAX, I observed a race between two PyOperation destructors from different threads updating the same `liveOperations` map, despite not intentionally sharing the context between different threads. Since I don't think we can be completely sure when GC happens and on which thread, it seems safest simply to add locking here. We may also want to explicitly support sharing a context between threads in the future, which would require this change or something similar. --- mlir/lib/Bindings/Python/IRCore.cpp | 43 +++++++++++++++++++++-------- mlir/lib/Bindings/Python/IRModule.h | 3 ++ 2 files changed, 34 insertions(+), 12 deletions(-) diff --git a/mlir/lib/Bindings/Python/IRCore.cpp b/mlir/lib/Bindings/Python/IRCore.cpp index 463ebdebb3f3f..53806ca9f04a4 100644 --- a/mlir/lib/Bindings/Python/IRCore.cpp +++ b/mlir/lib/Bindings/Python/IRCore.cpp @@ -677,29 +677,44 @@ size_t PyMlirContext::getLiveCount() { return getLiveContexts().size(); } -size_t PyMlirContext::getLiveOperationCount() { return liveOperations.size(); } +size_t PyMlirContext::getLiveOperationCount() { + nb::ft_lock_guard lock(liveOperationsMutex); + return liveOperations.size(); +} std::vector PyMlirContext::getLiveOperationObjects() { std::vector liveObjects; + nb::ft_lock_guard lock(liveOperationsMutex); for (auto &entry : liveOperations) liveObjects.push_back(entry.second.second); return liveObjects; } size_t PyMlirContext::clearLiveOperations() { - for (auto &op : liveOperations) + + LiveOperationMap operations; + { + nb::ft_lock_guard lock(liveOperationsMutex); + std::swap(operations, liveOperations); + } + for (auto &op : operations) op.second.second->setInvalid(); - size_t numInvalidated = liveOperations.size(); - liveOperations.clear(); + size_t numInvalidated = operations.size(); return numInvalidated; } void PyMlirContext::clearOperation(MlirOperation op) { - auto it = liveOperations.find(op.ptr); - if (it != liveOperations.end()) { - it->second.second->setInvalid(); + PyOperation *py_op; + { + nb::ft_lock_guard lock(liveOperationsMutex); + auto it = liveOperations.find(op.ptr); + if (it == liveOperations.end()) { + return; + } + py_op = it->second.second; liveOperations.erase(it); } + py_op->setInvalid(); } void PyMlirContext::clearOperationsInside(PyOperationBase &op) { @@ -1183,7 +1198,6 @@ PyOperation::~PyOperation() { PyOperationRef PyOperation::createInstance(PyMlirContextRef contextRef, MlirOperation operation, nb::object parentKeepAlive) { - auto &liveOperations = contextRef->liveOperations; // Create. PyOperation *unownedOperation = new PyOperation(std::move(contextRef), operation); @@ -1195,19 +1209,22 @@ PyOperationRef PyOperation::createInstance(PyMlirContextRef contextRef, if (parentKeepAlive) { unownedOperation->parentKeepAlive = std::move(parentKeepAlive); } - liveOperations[operation.ptr] = std::make_pair(pyRef, unownedOperation); return PyOperationRef(unownedOperation, std::move(pyRef)); } PyOperationRef PyOperation::forOperation(PyMlirContextRef contextRef, MlirOperation operation, nb::object parentKeepAlive) { + nb::ft_lock_guard lock(contextRef->liveOperationsMutex); auto &liveOperations = contextRef->liveOperations; auto it = liveOperations.find(operation.ptr); if (it == liveOperations.end()) { // Create. 
- return createInstance(std::move(contextRef), operation, - std::move(parentKeepAlive)); + PyOperationRef result = createInstance(std::move(contextRef), operation, + std::move(parentKeepAlive)); + liveOperations[operation.ptr] = + std::make_pair(result.getObject(), result.get()); + return result; } // Use existing. PyOperation *existing = it->second.second; @@ -1218,13 +1235,15 @@ PyOperationRef PyOperation::forOperation(PyMlirContextRef contextRef, PyOperationRef PyOperation::createDetached(PyMlirContextRef contextRef, MlirOperation operation, nb::object parentKeepAlive) { + nb::ft_lock_guard lock(contextRef->liveOperationsMutex); auto &liveOperations = contextRef->liveOperations; assert(liveOperations.count(operation.ptr) == 0 && "cannot create detached operation that already exists"); (void)liveOperations; - PyOperationRef created = createInstance(std::move(contextRef), operation, std::move(parentKeepAlive)); + liveOperations[operation.ptr] = + std::make_pair(created.getObject(), created.get()); created->attached = false; return created; } diff --git a/mlir/lib/Bindings/Python/IRModule.h b/mlir/lib/Bindings/Python/IRModule.h index f5fbb6c61b57e..d1fb4308dbb77 100644 --- a/mlir/lib/Bindings/Python/IRModule.h +++ b/mlir/lib/Bindings/Python/IRModule.h @@ -277,6 +277,9 @@ class PyMlirContext { // attempt to access it will raise an error. using LiveOperationMap = llvm::DenseMap>; + nanobind::ft_mutex liveOperationsMutex; + + // Guarded by liveOperationsMutex in free-threading mode. LiveOperationMap liveOperations; bool emitErrorDiagnostics = false; From b0ef2d203df99db3e49eb18d8e23379273e5af50 Mon Sep 17 00:00:00 2001 From: Kelvin Li Date: Mon, 13 Jan 2025 10:52:09 -0500 Subject: [PATCH 074/102] [flang][AIX] BIND(C) derived type alignment for AIX (#121505) This patch is to handle the alignment requirement for the `bind(c)` derived type component that is real type and larger than 4 bytes. The alignment of such component is 4-byte. --- .../flang/Optimizer/CodeGen/TypeConverter.h | 2 +- .../flang/Optimizer/Dialect/FIRTypes.td | 6 + flang/lib/Lower/ConvertType.cpp | 44 ++++++++ .../lib/Optimizer/CodeGen/BoxedProcedure.cpp | 1 + flang/lib/Optimizer/CodeGen/TypeConverter.cpp | 10 +- flang/lib/Optimizer/Dialect/FIRType.cpp | 31 +++++- flang/lib/Semantics/compute-offsets.cpp | 88 ++++++++++++++- flang/test/Lower/CUDA/cuda-devptr.cuf | 4 +- .../test/Lower/HLFIR/bindc-value-derived.f90 | 18 +-- flang/test/Lower/OpenMP/copyin.f90 | 2 +- flang/test/Lower/derived-types-bindc.f90 | 44 ++++++++ flang/test/Lower/intentout-deallocate.f90 | 20 ++-- flang/test/Semantics/offsets04.f90 | 105 ++++++++++++++++++ 13 files changed, 340 insertions(+), 35 deletions(-) create mode 100644 flang/test/Lower/derived-types-bindc.f90 create mode 100644 flang/test/Semantics/offsets04.f90 diff --git a/flang/include/flang/Optimizer/CodeGen/TypeConverter.h b/flang/include/flang/Optimizer/CodeGen/TypeConverter.h index 7c317ddeea1fa..20270d41b1e9a 100644 --- a/flang/include/flang/Optimizer/CodeGen/TypeConverter.h +++ b/flang/include/flang/Optimizer/CodeGen/TypeConverter.h @@ -62,7 +62,7 @@ class LLVMTypeConverter : public mlir::LLVMTypeConverter { // fir.type --> llvm<"%name = { ty... }"> std::optional convertRecordType(fir::RecordType derived, - llvm::SmallVectorImpl &results); + llvm::SmallVectorImpl &results, bool isPacked); // Is an extended descriptor needed given the element type of a fir.box type ? // Extended descriptors are required for derived types. 
diff --git a/flang/include/flang/Optimizer/Dialect/FIRTypes.td b/flang/include/flang/Optimizer/Dialect/FIRTypes.td index 3919c9191c212..6ae74f16a72d3 100644 --- a/flang/include/flang/Optimizer/Dialect/FIRTypes.td +++ b/flang/include/flang/Optimizer/Dialect/FIRTypes.td @@ -346,6 +346,12 @@ def fir_RecordType : FIR_Type<"Record", "type"> { void finalize(llvm::ArrayRef lenPList, llvm::ArrayRef typeList); + // fir.type is unpacked by default. If the flag is set, the packed fir.type + // is generated and the alignment is enforced by explicit padding by i8 + // array fields. + bool isPacked() const; + void pack(bool); + detail::RecordTypeStorage const *uniqueKey() const; }]; } diff --git a/flang/lib/Lower/ConvertType.cpp b/flang/lib/Lower/ConvertType.cpp index 452ddda426fa1..31b85ef2b5476 100644 --- a/flang/lib/Lower/ConvertType.cpp +++ b/flang/lib/Lower/ConvertType.cpp @@ -20,6 +20,8 @@ #include "mlir/IR/Builders.h" #include "mlir/IR/BuiltinTypes.h" #include "llvm/Support/Debug.h" +#include "llvm/TargetParser/Host.h" +#include "llvm/TargetParser/Triple.h" #define DEBUG_TYPE "flang-lower-type" @@ -385,9 +387,20 @@ struct TypeBuilderImpl { // with dozens of components/parents (modern Fortran). derivedTypeInConstruction.try_emplace(&derivedScope, rec); + auto targetTriple{llvm::Triple( + llvm::Triple::normalize(llvm::sys::getDefaultTargetTriple()))}; + // Always generate packed FIR struct type for bind(c) derived type for AIX + if (targetTriple.getOS() == llvm::Triple::OSType::AIX && + tySpec.typeSymbol().attrs().test(Fortran::semantics::Attr::BIND_C) && + !IsIsoCType(&tySpec)) { + rec.pack(true); + } + // Gather the record type fields. // (1) The data components. if (converter.getLoweringOptions().getLowerToHighLevelFIR()) { + size_t prev_offset{0}; + unsigned padCounter{0}; // In HLFIR the parent component is the first fir.type component. for (const auto &componentName : typeSymbol.get() @@ -397,7 +410,38 @@ struct TypeBuilderImpl { "failed to find derived type component symbol"); const Fortran::semantics::Symbol &component = scopeIter->second.get(); mlir::Type ty = genSymbolType(component); + if (rec.isPacked()) { + auto compSize{component.size()}; + auto compOffset{component.offset()}; + + if (prev_offset < compOffset) { + size_t pad{compOffset - prev_offset}; + mlir::Type i8Ty{mlir::IntegerType::get(context, 8)}; + fir::SequenceType::Shape shape{static_cast(pad)}; + mlir::Type padTy{fir::SequenceType::get(shape, i8Ty)}; + prev_offset += pad; + cs.emplace_back("__padding" + std::to_string(padCounter++), padTy); + } + prev_offset += compSize; + } cs.emplace_back(converter.getRecordTypeFieldName(component), ty); + if (rec.isPacked()) { + // For the last component, determine if any padding is needed. 
+ if (componentName == + typeSymbol.get() + .componentNames() + .back()) { + auto compEnd{component.offset() + component.size()}; + if (compEnd < derivedScope.size()) { + size_t pad{derivedScope.size() - compEnd}; + mlir::Type i8Ty{mlir::IntegerType::get(context, 8)}; + fir::SequenceType::Shape shape{static_cast(pad)}; + mlir::Type padTy{fir::SequenceType::get(shape, i8Ty)}; + cs.emplace_back("__padding" + std::to_string(padCounter++), + padTy); + } + } + } } } else { for (const auto &component : diff --git a/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp b/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp index 104ae7408b80c..ad7272eaa9d3f 100644 --- a/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp +++ b/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp @@ -167,6 +167,7 @@ class BoxprocTypeRewriter : public mlir::TypeConverter { cs.emplace_back(t.first, t.second); } rec.finalize(ps, cs); + rec.pack(ty.isPacked()); return rec; }); addConversion([&](TypeDescType ty) { diff --git a/flang/lib/Optimizer/CodeGen/TypeConverter.cpp b/flang/lib/Optimizer/CodeGen/TypeConverter.cpp index c23203efcd3df..0eace903720f0 100644 --- a/flang/lib/Optimizer/CodeGen/TypeConverter.cpp +++ b/flang/lib/Optimizer/CodeGen/TypeConverter.cpp @@ -82,7 +82,7 @@ LLVMTypeConverter::LLVMTypeConverter(mlir::ModuleOp module, bool applyTBAA, [&](fir::PointerType pointer) { return convertPointerLike(pointer); }); addConversion( [&](fir::RecordType derived, llvm::SmallVectorImpl &results) { - return convertRecordType(derived, results); + return convertRecordType(derived, results, derived.isPacked()); }); addConversion( [&](fir::ReferenceType ref) { return convertPointerLike(ref); }); @@ -133,8 +133,10 @@ mlir::Type LLVMTypeConverter::indexType() const { } // fir.type --> llvm<"%name = { ty... }"> -std::optional LLVMTypeConverter::convertRecordType( - fir::RecordType derived, llvm::SmallVectorImpl &results) { +std::optional +LLVMTypeConverter::convertRecordType(fir::RecordType derived, + llvm::SmallVectorImpl &results, + bool isPacked) { auto name = fir::NameUniquer::dropTypeConversionMarkers(derived.getName()); auto st = mlir::LLVM::LLVMStructType::getIdentified(&getContext(), name); @@ -156,7 +158,7 @@ std::optional LLVMTypeConverter::convertRecordType( else members.push_back(mlir::cast(convertType(mem.second))); } - if (mlir::failed(st.setBody(members, /*isPacked=*/false))) + if (mlir::failed(st.setBody(members, isPacked))) return mlir::failure(); results.push_back(st); return mlir::success(); diff --git a/flang/lib/Optimizer/Dialect/FIRType.cpp b/flang/lib/Optimizer/Dialect/FIRType.cpp index cba7fa6412850..d25e5651f1142 100644 --- a/flang/lib/Optimizer/Dialect/FIRType.cpp +++ b/flang/lib/Optimizer/Dialect/FIRType.cpp @@ -165,16 +165,20 @@ struct RecordTypeStorage : public mlir::TypeStorage { setTypeList(typeList); } + bool isPacked() const { return packed; } + void pack(bool p) { packed = p; } + protected: std::string name; bool finalized; + bool packed; std::vector lens; std::vector types; private: RecordTypeStorage() = delete; explicit RecordTypeStorage(llvm::StringRef name) - : name{name}, finalized{false} {} + : name{name}, finalized{false}, packed{false} {} }; } // namespace detail @@ -872,9 +876,14 @@ llvm::LogicalResult fir::PointerType::verify( //===----------------------------------------------------------------------===// // Fortran derived type +// unpacked: // `type` `<` name // (`(` id `:` type (`,` id `:` type)* `)`)? // (`{` id `:` type (`,` id `:` type)* `}`)? 
'>' +// packed: +// `type` `<` name +// (`(` id `:` type (`,` id `:` type)* `)`)? +// (`<{` id `:` type (`,` id `:` type)* `}>`)? '>' mlir::Type fir::RecordType::parse(mlir::AsmParser &parser) { llvm::StringRef name; if (parser.parseLess() || parser.parseKeyword(&name)) @@ -900,6 +909,10 @@ mlir::Type fir::RecordType::parse(mlir::AsmParser &parser) { } RecordType::TypeList typeList; + if (!parser.parseOptionalLess()) { + result.pack(true); + } + if (!parser.parseOptionalLBrace()) { while (true) { llvm::StringRef field; @@ -913,8 +926,10 @@ mlir::Type fir::RecordType::parse(mlir::AsmParser &parser) { if (parser.parseOptionalComma()) break; } - if (parser.parseRBrace()) - return {}; + if (parser.parseOptionalGreater()) { + if (parser.parseRBrace()) + return {}; + } } if (parser.parseGreater()) @@ -941,6 +956,9 @@ void fir::RecordType::print(mlir::AsmPrinter &printer) const { printer << ')'; } if (getTypeList().size()) { + if (isPacked()) { + printer << '<'; + } char ch = '{'; for (auto p : getTypeList()) { printer << ch << p.first << ':'; @@ -948,6 +966,9 @@ void fir::RecordType::print(mlir::AsmPrinter &printer) const { ch = ','; } printer << '}'; + if (isPacked()) { + printer << '>'; + } } recordTypeVisited.erase(uniqueKey()); } @@ -973,6 +994,10 @@ RecordType::TypeList fir::RecordType::getLenParamList() const { bool fir::RecordType::isFinalized() const { return getImpl()->isFinalized(); } +void fir::RecordType::pack(bool p) { getImpl()->pack(p); } + +bool fir::RecordType::isPacked() const { return getImpl()->isPacked(); } + detail::RecordTypeStorage const *fir::RecordType::uniqueKey() const { return getImpl(); } diff --git a/flang/lib/Semantics/compute-offsets.cpp b/flang/lib/Semantics/compute-offsets.cpp index 94640fa30baa5..6d4fce2f00a6d 100644 --- a/flang/lib/Semantics/compute-offsets.cpp +++ b/flang/lib/Semantics/compute-offsets.cpp @@ -17,6 +17,8 @@ #include "flang/Semantics/symbol.h" #include "flang/Semantics/tools.h" #include "flang/Semantics/type.h" +#include "llvm/TargetParser/Host.h" +#include "llvm/TargetParser/Triple.h" #include #include @@ -51,9 +53,12 @@ class ComputeOffsetsHelper { SymbolAndOffset Resolve(const SymbolAndOffset &); std::size_t ComputeOffset(const EquivalenceObject &); // Returns amount of padding that was needed for alignment - std::size_t DoSymbol(Symbol &); + std::size_t DoSymbol( + Symbol &, std::optional newAlign = std::nullopt); SizeAndAlignment GetSizeAndAlignment(const Symbol &, bool entire); std::size_t Align(std::size_t, std::size_t); + std::optional CompAlignment(const Symbol &); + std::optional HasSpecialAlign(const Symbol &, Scope &); SemanticsContext &context_; std::size_t offset_{0}; @@ -65,6 +70,69 @@ class ComputeOffsetsHelper { equivalenceBlock_; }; +// This function is only called if the target platform is AIX. +static bool isReal8OrLarger(const Fortran::semantics::DeclTypeSpec *type) { + return ((type->IsNumeric(common::TypeCategory::Real) || + type->IsNumeric(common::TypeCategory::Complex)) && + evaluate::ToInt64(type->numericTypeSpec().kind()) > 4); +} + +// This function is only called if the target platform is AIX. +// It determines the alignment of a component. If the component is a derived +// type, the alignment is computed accordingly. 
+std::optional ComputeOffsetsHelper::CompAlignment(const Symbol &sym) { + size_t max_align{0}; + constexpr size_t fourByteAlign{4}; + bool contain_double{false}; + auto derivedTypeSpec{sym.GetType()->AsDerived()}; + DirectComponentIterator directs{*derivedTypeSpec}; + for (auto it{directs.begin()}; it != directs.end(); ++it) { + auto type{it->GetType()}; + auto s{GetSizeAndAlignment(*it, true)}; + if (isReal8OrLarger(type)) { + max_align = std::max(max_align, fourByteAlign); + contain_double = true; + } else if (type->AsDerived()) { + if (const auto newAlgin{CompAlignment(*it)}) { + max_align = std::max(max_align, s.alignment); + } else { + return std::nullopt; + } + } else { + max_align = std::max(max_align, s.alignment); + } + } + + if (contain_double) { + return max_align; + } else { + return std::nullopt; + } +} + +// This function is only called if the target platform is AIX. +// Special alignment is needed only if it is a bind(c) derived type +// and contain real type components that have larger than 4 bytes. +std::optional ComputeOffsetsHelper::HasSpecialAlign( + const Symbol &sym, Scope &scope) { + // On AIX, if the component that is not the first component and is + // a float of 8 bytes or larger, it has the 4-byte alignment. + // Only set the special alignment for bind(c) derived type on that platform. + if (const auto type{sym.GetType()}) { + auto &symOwner{sym.owner()}; + if (symOwner.symbol() && symOwner.IsDerivedType() && + symOwner.symbol()->attrs().HasAny({semantics::Attr::BIND_C}) && + &sym != &(*scope.GetSymbols().front())) { + if (isReal8OrLarger(type)) { + return 4UL; + } else if (type->AsDerived()) { + return CompAlignment(sym); + } + } + } + return std::nullopt; +} + void ComputeOffsetsHelper::Compute(Scope &scope) { for (Scope &child : scope.children()) { ComputeOffsets(context_, child); @@ -113,7 +181,15 @@ void ComputeOffsetsHelper::Compute(Scope &scope) { if (!FindCommonBlockContaining(*symbol) && dependents_.find(symbol) == dependents_.end() && equivalenceBlock_.find(symbol) == equivalenceBlock_.end()) { - DoSymbol(*symbol); + + std::optional newAlign{std::nullopt}; + // Handle special alignment requirement for AIX + auto triple{llvm::Triple( + llvm::Triple::normalize(llvm::sys::getDefaultTargetTriple()))}; + if (triple.getOS() == llvm::Triple::OSType::AIX) { + newAlign = HasSpecialAlign(*symbol, scope); + } + DoSymbol(*symbol, newAlign); if (auto *generic{symbol->detailsIf()}) { if (Symbol * specific{generic->specific()}; specific && !FindCommonBlockContaining(*specific)) { @@ -313,7 +389,8 @@ std::size_t ComputeOffsetsHelper::ComputeOffset( return result; } -std::size_t ComputeOffsetsHelper::DoSymbol(Symbol &symbol) { +std::size_t ComputeOffsetsHelper::DoSymbol( + Symbol &symbol, std::optional newAlign) { if (!symbol.has() && !symbol.has()) { return 0; } @@ -322,12 +399,13 @@ std::size_t ComputeOffsetsHelper::DoSymbol(Symbol &symbol) { return 0; } std::size_t previousOffset{offset_}; - offset_ = Align(offset_, s.alignment); + size_t alignVal{newAlign.value_or(s.alignment)}; + offset_ = Align(offset_, alignVal); std::size_t padding{offset_ - previousOffset}; symbol.set_size(s.size); symbol.set_offset(offset_); offset_ += s.size; - alignment_ = std::max(alignment_, s.alignment); + alignment_ = std::max(alignment_, alignVal); return padding; } diff --git a/flang/test/Lower/CUDA/cuda-devptr.cuf b/flang/test/Lower/CUDA/cuda-devptr.cuf index 2eac890970d52..561d92ecd3e2e 100644 --- a/flang/test/Lower/CUDA/cuda-devptr.cuf +++ b/flang/test/Lower/CUDA/cuda-devptr.cuf @@ 
-38,8 +38,8 @@ end ! CHECK-LABEL: func.func @_QPsub2() ! CHECK: %[[X:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub2Ex"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) -! CHECK: %[[CPTR:.*]] = fir.field_index cptr, !fir.type<_QM__fortran_builtinsT__builtin_c_devptr{cptr:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}> -! CHECK: %[[CPTR_COORD:.*]] = fir.coordinate_of %{{.*}}#1, %[[CPTR]] : (!fir.ref}>>, !fir.field) -> !fir.ref> +! CHECK: %[[CPTR:.*]] = fir.field_index cptr, !fir.type<_QM__fortran_builtinsT__builtin_c_devptr{{[<]?}}{cptr:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}{{[>]?}}> +! CHECK: %[[CPTR_COORD:.*]] = fir.coordinate_of %{{.*}}#1, %[[CPTR]] : (!fir.ref}{{[>]?}}>>, !fir.field) -> !fir.ref> ! CHECK: %[[ADDRESS:.*]] = fir.field_index __address, !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}> ! CHECK: %[[ADDRESS_COORD:.*]] = fir.coordinate_of %[[CPTR_COORD]], %[[ADDRESS]] : (!fir.ref>, !fir.field) -> !fir.ref ! CHECK: %[[ADDRESS_LOADED:.*]] = fir.load %[[ADDRESS_COORD]] : !fir.ref diff --git a/flang/test/Lower/HLFIR/bindc-value-derived.f90 b/flang/test/Lower/HLFIR/bindc-value-derived.f90 index 7a2196dfc8bf1..5af9f8edc804c 100644 --- a/flang/test/Lower/HLFIR/bindc-value-derived.f90 +++ b/flang/test/Lower/HLFIR/bindc-value-derived.f90 @@ -14,11 +14,11 @@ subroutine test(x) bind(c) call use_it(x%i) end subroutine ! CHECK-LABEL: func.func @test( -! CHECK-SAME: %[[VAL_0:.*]]: !fir.type<_QMbindc_byvalTt{i:i32}> -! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.type<_QMbindc_byvalTt{i:i32}> -! CHECK: fir.store %[[VAL_0]] to %[[VAL_1]] : !fir.ref> -! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs, uniq_name = "_QMbindc_byvalFtestEx"} : (!fir.ref>, !fir.dscope) -> (!fir.ref>, !fir.ref>) -! CHECK: %[[VAL_3:.*]] = hlfir.designate %[[VAL_2]]#0{"i"} : (!fir.ref>) -> !fir.ref +! CHECK-SAME: %[[VAL_0:.*]]: !fir.type<_QMbindc_byvalTt{{[<]?}}{i:i32}{{[>]?}}> +! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.type<_QMbindc_byvalTt{{[<]?}}{i:i32}{{[>]?}}> +! CHECK: fir.store %[[VAL_0]] to %[[VAL_1]] : !fir.ref]?}}>> +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %{{[0-9]}} {fortran_attrs = #fir.var_attrs, uniq_name = "_QMbindc_byvalFtestEx"} : (!fir.ref]?}}>>, !fir.dscope) -> (!fir.ref]?}}>>, !fir.ref]?}}>>) +! CHECK: %[[VAL_3:.*]] = hlfir.designate %[[VAL_2]]#0{"i"} : (!fir.ref]?}}>>) -> !fir.ref ! CHECK: fir.call @_QPuse_it(%[[VAL_3]]) fastmath : (!fir.ref) -> () ! CHECK: return ! CHECK: } @@ -28,10 +28,10 @@ subroutine call_it(x) call test(x) end subroutine ! CHECK-LABEL: func.func @_QMbindc_byvalPcall_it( -! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref> -! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QMbindc_byvalFcall_itEx"} : (!fir.ref>, !fir.dscope) -> (!fir.ref>, !fir.ref>) -! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]]#1 : !fir.ref> -! CHECK: fir.call @test(%[[VAL_2]]) proc_attrs fastmath : (!fir.type<_QMbindc_byvalTt{i:i32}>) -> () +! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref]?}}>> +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %{{[0-9]}} {uniq_name = "_QMbindc_byvalFcall_itEx"} : (!fir.ref]?}}>>, !fir.dscope) -> (!fir.ref]?}}>>, !fir.ref]?}}>>) +! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]]#1 : !fir.ref]?}}>> +! CHECK: fir.call @test(%[[VAL_2]]) proc_attrs fastmath : (!fir.type<_QMbindc_byvalTt{{[<]?}}{i:i32}{{[>]?}}>) -> () ! CHECK: return ! 
CHECK: } end module diff --git a/flang/test/Lower/OpenMP/copyin.f90 b/flang/test/Lower/OpenMP/copyin.f90 index f3d147c10668f..9e9ccf8e3d914 100644 --- a/flang/test/Lower/OpenMP/copyin.f90 +++ b/flang/test/Lower/OpenMP/copyin.f90 @@ -86,7 +86,7 @@ subroutine copyin_char_chararray() end ! CHECK-LABEL: func.func @_QPcopyin_derived_type() { -! CHECK: %[[VAL_0:.*]] = fir.address_of(@_QFcopyin_derived_typeE.b.my_type.t_arr) : !fir.ref,value:i64}>>> +! CHECK: %[[VAL_0:.*]] = fir.address_of(@_QFcopyin_derived_typeE.b.my_type.t_arr) : !fir.ref,value:i64}{{[>]?}}>>> ! CHECK: %[[VAL_1:.*]] = arith.constant 0 : index ! CHECK: %[[VAL_2:.*]] = arith.constant 2 : index ! CHECK: %[[VAL_3:.*]] = arith.constant 0 : index diff --git a/flang/test/Lower/derived-types-bindc.f90 b/flang/test/Lower/derived-types-bindc.f90 new file mode 100644 index 0000000000000..309b2b7f5f492 --- /dev/null +++ b/flang/test/Lower/derived-types-bindc.f90 @@ -0,0 +1,44 @@ +! Test padding for BIND(C) derived types lowering for AIX target +! RUN: %flang_fc1 -emit-llvm %s -o - | FileCheck %s + +! REQUIRES: target={{.+}}-aix{{.*}} + +subroutine s1() + use, intrinsic :: iso_c_binding + type, bind(c) :: t0 + character(c_char) :: x1 + real(c_double) :: x2 + end type + type(t0) :: xt0 +! CHECK-DAG: %_QFs1Tt0 = type <{ [1 x i8], [3 x i8], double }> + + type, bind(c) :: t1 + integer(c_short) :: x1 + real(c_double) :: x2 + end type + type(t1) :: xt1 +! CHECK-DAG: %_QFs1Tt1 = type <{ i16, [2 x i8], double }> + + type, bind(c) :: t2 + integer(c_short) :: x1 + real(c_double) :: x2 + character(c_char) :: x3 + end type + type(t2) :: xt2 +! CHECK-DAG: %_QFs1Tt2 = type <{ i16, [2 x i8], double, [1 x i8], [3 x i8] }> + + type, bind(c) :: t3 + character(c_char) :: x1 + complex(c_double_complex) :: x2 + end type + type(t3) :: xt3 +! CHECK-DAG: %_QFs1Tt3 = type <{ [1 x i8], [3 x i8], { double, double } }> + + type, bind(c) :: t4 + integer(c_short) :: x1 + complex(c_double_complex) :: x2 + character(c_char) :: x3 + end type + type(t4) :: xt4 +! CHECK-DAG: %_QFs1Tt4 = type <{ i16, [2 x i8], { double, double }, [1 x i8], [3 x i8] }> +end subroutine s1 diff --git a/flang/test/Lower/intentout-deallocate.f90 b/flang/test/Lower/intentout-deallocate.f90 index 8e7ccbcc9fdb9..931cf7d48885f 100644 --- a/flang/test/Lower/intentout-deallocate.f90 +++ b/flang/test/Lower/intentout-deallocate.f90 @@ -123,24 +123,24 @@ subroutine sub5(t) ! on the caller side. ! CHECK-LABEL: func.func @_QMmod1Psub4() -! FIR: %[[BOX:.*]] = fir.alloca !fir.box>> {bindc_name = "t", uniq_name = "_QMmod1Fsub4Et"} +! FIR: %[[BOX:.*]] = fir.alloca !fir.box]?}}>>> {bindc_name = "t", uniq_name = "_QMmod1Fsub4Et"} ! HLFIR: %[[BOX:.*]]:2 = hlfir.declare {{.*}}"_QMmod1Fsub4Et" ! CHECK-NOT: fir.call @_FortranAAllocatableDeallocate -! CHECK: fir.call @_QMmod1Psub5(%[[BOX]]{{[#0]*}}) {{.*}}: (!fir.ref>>>) -> () +! CHECK: fir.call @_QMmod1Psub5(%[[BOX]]{{[#0]*}}) {{.*}}: (!fir.ref]?}}>>>>) -> () ! Check deallocation of allocatble intent(out) on the callee side. Deallocation ! is done with a runtime call. ! CHECK-LABEL: func.func @_QMmod1Psub5( -! FIR-SAME: %[[ARG0:.*]]: !fir.ref>>> {fir.bindc_name = "t"}) +! FIR-SAME: %[[ARG0:.*]]: !fir.ref]?}}>>>> {fir.bindc_name = "t"}) ! HLFIR: %[[ARG0:.*]]:2 = hlfir.declare {{.*}}"_QMmod1Fsub5Et" -! CHECK: %[[BOX:.*]] = fir.load %[[ARG0]]{{[#1]*}} : !fir.ref>>> -! CHECK: %[[BOX_ADDR:.*]] = fir.box_addr %[[BOX]] : (!fir.box>>) -> !fir.heap> -! CHECK: %[[BOX_ADDR_PTR:.*]] = fir.convert %[[BOX_ADDR]] : (!fir.heap>) -> i64 +! 
CHECK: %[[BOX:.*]] = fir.load %[[ARG0]]{{[#1]*}} : !fir.ref]?}}>>>> +! CHECK: %[[BOX_ADDR:.*]] = fir.box_addr %[[BOX]] : (!fir.box]?}}>>>) -> !fir.heap]?}}>> +! CHECK: %[[BOX_ADDR_PTR:.*]] = fir.convert %[[BOX_ADDR]] : (!fir.heap]?}}>>) -> i64 ! CHECK: %[[C0:.*]] = arith.constant 0 : i64 ! CHECK: %[[IS_ALLOCATED:.*]] = arith.cmpi ne, %[[BOX_ADDR_PTR]], %[[C0]] : i64 ! CHECK: fir.if %[[IS_ALLOCATED]] { -! CHECK: %[[BOX_NONE:.*]] = fir.convert %[[ARG0]]{{[#1]*}} : (!fir.ref>>>) -> !fir.ref> +! CHECK: %[[BOX_NONE:.*]] = fir.convert %[[ARG0]]{{[#1]*}} : (!fir.ref]?}}>>>>) -> !fir.ref> ! CHECK: %{{.*}} = fir.call @_FortranAAllocatableDeallocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, i1, !fir.box, !fir.ref, i32) -> i32 subroutine sub6() @@ -152,11 +152,11 @@ subroutine sub6() ! Deallocation is done with a runtime call. ! CHECK-LABEL: func.func @_QMmod1Psub6() -! FIR: %[[BOX:.*]] = fir.alloca !fir.box>> {bindc_name = "t", uniq_name = "_QMmod1Fsub6Et"} +! FIR: %[[BOX:.*]] = fir.alloca !fir.box]?}}>>> {bindc_name = "t", uniq_name = "_QMmod1Fsub6Et"} ! HLFIR: %[[BOX:.*]]:2 = hlfir.declare {{.*}}"_QMmod1Fsub6Et" -! CHECK: %[[BOX_NONE:.*]] = fir.convert %[[BOX]]{{[#1]*}} : (!fir.ref>>>) -> !fir.ref> +! CHECK: %[[BOX_NONE:.*]] = fir.convert %[[BOX]]{{[#1]*}} : (!fir.ref]?}}>>>>) -> !fir.ref> ! CHECK: %{{.*}} = fir.call @_FortranAAllocatableDeallocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref>, i1, !fir.box, !fir.ref, i32) -> i32 -! CHECK: fir.call @sub7(%[[BOX]]{{[#0]*}}) {{.*}}: (!fir.ref>>>) -> () +! CHECK: fir.call @sub7(%[[BOX]]{{[#0]*}}) {{.*}}: (!fir.ref]?}}>>>>) -> () subroutine sub8() integer, allocatable :: a(:) diff --git a/flang/test/Semantics/offsets04.f90 b/flang/test/Semantics/offsets04.f90 new file mode 100644 index 0000000000000..d0d871a981c17 --- /dev/null +++ b/flang/test/Semantics/offsets04.f90 @@ -0,0 +1,105 @@ +!RUN: %flang_fc1 -fdebug-dump-symbols %s | FileCheck %s + +!REQUIRES: target={{.+}}-aix{{.*}} + +! 
Size and alignment of bind(c) derived types +subroutine s1() + use, intrinsic :: iso_c_binding + type, bind(c) :: dt1 + character(c_char) :: x1 !CHECK: x1 size=1 offset=0: + real(c_double) :: x2 !CHECK: x2 size=8 offset=4: + end type + type, bind(c) :: dt2 + character(c_char) :: x1(9) !CHECK: x1 size=9 offset=0: + real(c_double) :: x2 !CHECK: x2 size=8 offset=12: + end type + type, bind(c) :: dt3 + integer(c_short) :: x1 !CHECK: x1 size=2 offset=0: + real(c_double) :: x2 !CHECK: x2 size=8 offset=4: + end type + type, bind(c) :: dt4 + integer(c_int) :: x1 !CHECK: x1 size=4 offset=0: + real(c_double) :: x2 !CHECK: x2 size=8 offset=4: + end type + type, bind(c) :: dt5 + real(c_double) :: x1 !CHECK: x1 size=8 offset=0: + real(c_double) :: x2 !CHECK: x2 size=8 offset=8: + end type + type, bind(c) :: dt6 + integer(c_long) :: x1 !CHECK: x1 size=8 offset=0: + character(c_char) :: x2 !CHECK: x2 size=1 offset=8: + real(c_double) :: x3 !CHECK: x3 size=8 offset=12: + end type + type, bind(c) :: dt7 + integer(c_long) :: x1 !CHECK: x1 size=8 offset=0: + integer(c_long) :: x2 !CHECK: x2 size=8 offset=8: + character(c_char) :: x3 !CHECK: x3 size=1 offset=16: + real(c_double) :: x4 !CHECK: x4 size=8 offset=20: + end type + type, bind(c) :: dt8 + character(c_char) :: x1 !CHECK: x1 size=1 offset=0: + complex(c_double_complex) :: x2 !CHECK: x2 size=16 offset=4: + end type +end subroutine + +subroutine s2() + use, intrinsic :: iso_c_binding + type, bind(c) :: dt10 + character(c_char) :: x1 + real(c_double) :: x2 + end type + type, bind(c) :: dt11 + type(dt10) :: y1 !CHECK: y1 size=12 offset=0: + real(c_double) :: y2 !CHECK: y2 size=8 offset=12: + end type + type, bind(c) :: dt12 + character(c_char) :: y1 !CHECK: y1 size=1 offset=0: + type(dt10) :: y2 !CHECK: y2 size=12 offset=4: + character(c_char) :: y3 !CHECK: y3 size=1 offset=16: + end type + type, bind(c) :: dt13 + integer(c_short) :: y1 !CHECK: y1 size=2 offset=0: + type(dt10) :: y2 !CHECK: y2 size=12 offset=4: + character(c_char) :: y3 !CHECK: y3 size=1 offset=16: + end type + + type, bind(c) :: dt20 + character(c_char) :: x1 + integer(c_short) :: x2 + end type + type, bind(c) :: dt21 + real(c_double) :: y1 !CHECK: y1 size=8 offset=0: + type(dt20) :: y2 !CHECK: y2 size=4 offset=8: + real(c_double) :: y3 !CHECK: y3 size=8 offset=12: + end type + + type, bind(c) :: dt30 + character(c_char) :: x1 + character(c_char) :: x2 + end type + type, bind(c) :: dt31 + integer(c_long) :: y1 !CHECK: y1 size=8 offset=0: + type(dt30) :: y2 !CHECK: y2 size=2 offset=8: + real(c_double) :: y3 !CHECK: y3 size=8 offset=12: + end type + + type, bind(c) :: dt40 + integer(c_short) :: x1 + real(c_double) :: x2 + end type + type, bind(c) :: dt41 + real(c_double) :: y1 !CHECK: y1 size=8 offset=0: + type(dt40) :: y2 !CHECK: y2 size=12 offset=8: + real(c_double) :: y3 !CHECK: y3 size=8 offset=20: + end type + + type, bind(c) :: dt50 + integer(c_short) :: x1 + complex(c_double_complex) :: x2 + end type + type, bind(c) :: dt51 + real(c_double) :: y1 !CHECK: y1 size=8 offset=0: + type(dt50) :: y2 !CHECK: y2 size=20 offset=8: + complex(c_double_complex) :: y3 !CHECK: y3 size=16 offset=28: + end type +end subroutine From 073be157171dce44ea6ccb8d8bcedf1648d1bc56 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Mon, 13 Jan 2025 15:56:32 +0000 Subject: [PATCH 075/102] [gn build] Port b5ba4f06db2e --- llvm/utils/gn/secondary/libcxx/include/BUILD.gn | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn 
b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn index 83c0811b6814a..74e81f2e98084 100644 --- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn +++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn @@ -1462,7 +1462,6 @@ if (current_toolchain == default_toolchain) { "__locale_dir/locale_base_api.h", "__locale_dir/locale_base_api/android.h", "__locale_dir/locale_base_api/bsd_locale_fallbacks.h", - "__locale_dir/locale_base_api/fuchsia.h", "__locale_dir/locale_base_api/ibm.h", "__locale_dir/locale_base_api/musl.h", "__locale_dir/locale_base_api/openbsd.h", @@ -1470,6 +1469,9 @@ if (current_toolchain == default_toolchain) { "__locale_dir/support/apple.h", "__locale_dir/support/bsd_like.h", "__locale_dir/support/freebsd.h", + "__locale_dir/support/fuchsia.h", + "__locale_dir/support/no_locale/characters.h", + "__locale_dir/support/no_locale/strtonum.h", "__locale_dir/support/windows.h", "__math/abs.h", "__math/copysign.h", From 139b1b4043708e7d6a597984f7c63d009e73d835 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Mon, 13 Jan 2025 15:56:33 +0000 Subject: [PATCH 076/102] [gn build] Port cedb44af53f1 --- llvm/utils/gn/secondary/libcxx/include/BUILD.gn | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn index 74e81f2e98084..639095b698c6f 100644 --- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn +++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn @@ -1512,7 +1512,6 @@ if (current_toolchain == default_toolchain) { "__memory/array_cookie.h", "__memory/assume_aligned.h", "__memory/auto_ptr.h", - "__memory/builtin_new_allocator.h", "__memory/compressed_pair.h", "__memory/concepts.h", "__memory/construct_at.h", @@ -1845,6 +1844,7 @@ if (current_toolchain == default_toolchain) { "__utility/cmp.h", "__utility/convert_to_integral.h", "__utility/declval.h", + "__utility/element_count.h", "__utility/empty.h", "__utility/exception_guard.h", "__utility/exchange.h", From c76b878019bd92a1b7e850b826905b9bae1afed6 Mon Sep 17 00:00:00 2001 From: Philipp Schilk Date: Mon, 13 Jan 2025 17:05:26 +0100 Subject: [PATCH 077/102] [MLIR][TableGen] Use arg index in InferredResultType constructor. (#122717) Trying to constrain two results to be of the same type using `AllTypesMatch` would cause `mlir-tblgen` to crash on this assertion[1]. Example: ```tblgen def OpL5 : NS_Op<"op_with_same_but_unconstraint_results", [AllTypesMatch<["result_a", "result_b"]>]> { let results = (outs AnyType:$result_a, AnyType:$result_b); } ``` This is because there was a small bug when constructing the `inferences` graph from these constraints: the sources should be specified by the combined arg/result index (in other words, with results negative), not with the result index.
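To make the indexing convention concrete, here is a small stand-alone C++ sketch. It is illustrative only: the helper names mirror `InferredResultType`'s static mappers, but this is not the MLIR implementation itself. Results are folded into the combined arg/result index space as negative values, so passing a raw result index silently aliases an operand index:

```cpp
#include <cassert>

// Results share one index space with arguments by being stored as
// negative values: result r maps to -r - 1, while arguments stay >= 0.
static int unmapResultIndex(int resultIndex) { return -resultIndex - 1; }
static bool isResultIndex(int combinedIndex) { return combinedIndex < 0; }

int main() {
  // Result #1 folds to -2, which the combined space recognizes as a result.
  assert(unmapResultIndex(1) == -2);
  assert(isResultIndex(unmapResultIndex(1)));
  // A raw result index of 1, by contrast, reads back as argument #1 --
  // exactly the aliasing that tripped the assertion.
  assert(!isResultIndex(1));
  return 0;
}
```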
[1] https://github.com/llvm/llvm-project/blob/99612a3a18e0c40aac9c52b68e67b106f97ed4fa/mlir/lib/TableGen/Operator.cpp#L526 --- mlir/lib/TableGen/Operator.cpp | 4 ++-- mlir/test/mlir-tblgen/op-result.td | 21 +++++++++++++++++++++ 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/mlir/lib/TableGen/Operator.cpp b/mlir/lib/TableGen/Operator.cpp index c360c61afd27b..20a43ef15d09e 100644 --- a/mlir/lib/TableGen/Operator.cpp +++ b/mlir/lib/TableGen/Operator.cpp @@ -503,8 +503,8 @@ void Operator::populateTypeInferenceInfo( for (int otherResultIndex : resultIndices) { if (resultIndex == otherResultIndex) continue; - inference[resultIndex].sources.emplace_back(otherResultIndex, - "$_self"); + inference[resultIndex].sources.emplace_back( + InferredResultType::unmapResultIndex(otherResultIndex), "$_self"); } } } diff --git a/mlir/test/mlir-tblgen/op-result.td b/mlir/test/mlir-tblgen/op-result.td index 51f8b0671a328..f668d9a5a6644 100644 --- a/mlir/test/mlir-tblgen/op-result.td +++ b/mlir/test/mlir-tblgen/op-result.td @@ -180,6 +180,27 @@ def OpL4 : NS_Op<"two_inference_edges", [ // CHECK: inferredReturnTypes[1] = odsInferredType1 // CHECK: inferredReturnTypes[2] = odsInferredType2 +def OpL5 : NS_Op<"op_with_same_but_unconstraint_results", + [AllTypesMatch<["result_a", "result_b"]>]> { + let results = (outs AnyType:$result_a, AnyType:$result_b); +} + +// CHECK-NOT: LogicalResult OpL5::inferReturnTypes + +def OpL6 : NS_Op<"op_with_same_and_constraint_results", + [AllTypesMatch<["result_a", "result_b", "result_c"]>]> { + let results = (outs AnyType:$result_a, AnyType:$result_b, I32:$result_c); +} + +// CHECK-LABEL: LogicalResult OpL6::inferReturnTypes +// CHECK-NOT: } +// CHECK: odsInferredType0 = odsBuilder.getIntegerType(32); +// CHECK: odsInferredType1 = odsBuilder.getIntegerType(32); +// CHECK: odsInferredType2 = odsBuilder.getIntegerType(32); +// CHECK: inferredReturnTypes[0] = odsInferredType0; +// CHECK: inferredReturnTypes[1] = odsInferredType1; +// CHECK: inferredReturnTypes[2] = odsInferredType2; + def OpM : NS_Op<"mix_diff_size_variadic_and_normal_results_op", [AttrSizedResultSegments]> { let results = (outs Variadic:$output1, AnyTensor:$output2, Optional:$output3); } From af4c227b6b91649d52fc0926d5bc1c6448e545f1 Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Mon, 13 Jan 2025 20:10:31 +0400 Subject: [PATCH 078/102] [clang] Add test for CWG170 "Pointer-to-member conversions" (#121667) This patch adds a test for [CWG170](https://cplusplus.github.io/CWG/issues/170.html). The resolution adds explicit undefined behavior, so I think the best we can do is to put the test into the constexpr evaluator. The change to [expr.static.cast] is not tested, because it was a drive-by fix that removed an impossible case (I confirmed this using the meeting minutes). The minutes mention several times a comprehensive paper in this design space, which no one seems to remember. I believe it's [P0149R0](https://wg21.link/p0149r0) "Generalised member pointers".
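As a condensed sketch of what the new test exercises (a trimmed-down copy of the hierarchy in the diff below, not additional coverage): `p` really designates `B::i`, so re-converting it toward a class that does not contain that member must be rejected in constant evaluation.

```cpp
// Requires C++11 for constexpr. The upcast of &B::i to int A::* is fine
// and usable through an A::* context; but g(p) implicitly converts
// int A::* to int C::*, and since p points to B::i while B is neither C
// nor a base of C, the result is undefined -- which the constexpr
// evaluator diagnoses.
struct A {};
struct B : A { int i; };
struct C : A {};

constexpr int f(int A::*) { return 0; }
constexpr int g(int C::*) { return 0; }

constexpr auto p = static_cast<int A::*>(&B::i);
constexpr auto q = f(p); // OK
constexpr auto r = g(p); // error: not a constant expression
```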
--- clang/test/CXX/drs/cwg1xx.cpp | 20 ++++++++++++++++++++ clang/www/cxx_dr_status.html | 2 +- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/clang/test/CXX/drs/cwg1xx.cpp b/clang/test/CXX/drs/cwg1xx.cpp index 98eb86c929009..15bcc20b7fa2a 100644 --- a/clang/test/CXX/drs/cwg1xx.cpp +++ b/clang/test/CXX/drs/cwg1xx.cpp @@ -1076,6 +1076,26 @@ namespace cwg169 { // cwg169: 3.4 }; } // namespace cwg169 +namespace cwg170 { // cwg170: 3.1 +#if __cplusplus >= 201103L +struct A {}; +struct B : A { int i; }; +struct C : A {}; +struct D : C {}; + +constexpr int f(int A::*) { return 0; } +constexpr int g(int C::*) { return 0; } +constexpr int h(int D::*) { return 0; } + +constexpr auto p = static_cast(&B::i); +constexpr auto q = f(p); +constexpr auto r = g(p); +// since-cxx11-error@-1 {{constexpr variable 'r' must be initialized by a constant expression}} +constexpr auto s = h(p); +// since-cxx11-error@-1 {{constexpr variable 's' must be initialized by a constant expression}} +#endif +} // namespace cwg170 + namespace { // cwg171: 3.4 int cwg171a; } diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html index f2716f1e4c653..564502c1f3e92 100755 --- a/clang/www/cxx_dr_status.html +++ b/clang/www/cxx_dr_status.html @@ -1065,7 +1065,7 @@

C++ defect report implementation status

170 DRWP Pointer-to-member conversions - Unknown + Clang 3.1 171 From 15dde26afbbafc0ab3cfe441ce94cafb2a6905fe Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Mon, 13 Jan 2025 16:20:00 +0000 Subject: [PATCH 079/102] IR: introduce ICmpInst::isImpliedByMatchingCmp (#122597) Create an abstraction over isImplied{True,False}ByMatchingCmp to faithfully communicate the result of both functions, cleaning up code at call sites. While at it, fix a bug in the implied-false version of the function, which was inadvertently dropping samesign information. --- llvm/include/llvm/IR/Instructions.h | 13 ++---- llvm/include/llvm/SandboxIR/Instruction.h | 10 ++--- llvm/lib/Analysis/ValueTracking.cpp | 19 ++------- llvm/lib/IR/Instructions.cpp | 41 +++++++++++-------- llvm/lib/Transforms/Scalar/NewGVN.cpp | 16 ++------ .../implied-condition-samesign.ll | 37 +++++++++++++++++ 6 files changed, 76 insertions(+), 60 deletions(-) diff --git a/llvm/include/llvm/IR/Instructions.h b/llvm/include/llvm/IR/Instructions.h index 59eb504098837..9a41971b63373 100644 --- a/llvm/include/llvm/IR/Instructions.h +++ b/llvm/include/llvm/IR/Instructions.h @@ -1266,15 +1266,10 @@ class ICmpInst: public CmpInst { return getFlippedSignednessPredicate(getPredicate()); } - /// Determine if Pred1 implies Pred2 is true when two compares have matching - /// operands. - static bool isImpliedTrueByMatchingCmp(CmpPredicate Pred1, - CmpPredicate Pred2); - - /// Determine if Pred1 implies Pred2 is false when two compares have matching - /// operands. - static bool isImpliedFalseByMatchingCmp(CmpPredicate Pred1, - CmpPredicate Pred2); + /// Determine if Pred1 implies Pred2 is true, false, or if nothing can be + /// inferred about the implication, when two compares have matching operands. + static std::optional isImpliedByMatchingCmp(CmpPredicate Pred1, + CmpPredicate Pred2); void setSameSign(bool B = true) { SubclassOptionalData = (SubclassOptionalData & ~SameSign) | (B * SameSign); diff --git a/llvm/include/llvm/SandboxIR/Instruction.h b/llvm/include/llvm/SandboxIR/Instruction.h index d7c1eda81c006..34a7feb63bec4 100644 --- a/llvm/include/llvm/SandboxIR/Instruction.h +++ b/llvm/include/llvm/SandboxIR/Instruction.h @@ -2547,13 +2547,9 @@ class ICmpInst : public CmpInst { WRAP_STATIC_PREDICATE(isGE); WRAP_STATIC_PREDICATE(isLE); - static bool isImpliedTrueByMatchingCmp(CmpPredicate Pred1, - CmpPredicate Pred2) { - return llvm::ICmpInst::isImpliedTrueByMatchingCmp(Pred1, Pred2); - } - static bool isImpliedFalseByMatchingCmp(CmpPredicate Pred1, - CmpPredicate Pred2) { - return llvm::ICmpInst::isImpliedFalseByMatchingCmp(Pred1, Pred2); + static std::optional isImpliedByMatchingCmp(CmpPredicate Pred1, + CmpPredicate Pred2) { + return llvm::ICmpInst::isImpliedByMatchingCmp(Pred1, Pred2); } static auto predicates() { return llvm::ICmpInst::predicates(); } diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 0e50fc60ce792..d03e6f5a5754d 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -9384,19 +9384,6 @@ isImpliedCondOperands(CmpInst::Predicate Pred, const Value *ALHS, } } -/// Return true if "icmp1 LPred X, Y" implies "icmp2 RPred X, Y" is true. -/// Return false if "icmp1 LPred X, Y" implies "icmp2 RPred X, Y" is false. -/// Otherwise, return std::nullopt if we can't infer anything.
-static std::optional isImpliedCondMatchingOperands(CmpPredicate LPred, - CmpPredicate RPred) { - if (ICmpInst::isImpliedTrueByMatchingCmp(LPred, RPred)) - return true; - if (ICmpInst::isImpliedFalseByMatchingCmp(LPred, RPred)) - return false; - - return std::nullopt; -} - /// Return true if "icmp LPred X, LCR" implies "icmp RPred X, RCR" is true. /// Return false if "icmp LPred X, LCR" implies "icmp RPred X, RCR" is false. /// Otherwise, return std::nullopt if we can't infer anything. @@ -9489,7 +9476,7 @@ isImpliedCondICmps(const ICmpInst *LHS, CmpPredicate RPred, const Value *R0, // Can we infer anything when the two compares have matching operands? if (L0 == R0 && L1 == R1) - return isImpliedCondMatchingOperands(LPred, RPred); + return ICmpInst::isImpliedByMatchingCmp(LPred, RPred); // It only really makes sense in the context of signed comparison for "X - Y // must be positive if X >= Y and no overflow". @@ -9499,7 +9486,7 @@ isImpliedCondICmps(const ICmpInst *LHS, CmpPredicate RPred, const Value *R0, CmpPredicate::getMatching(LPred, ICmpInst::ICMP_SGE)) && match(R0, m_NSWSub(m_Specific(L0), m_Specific(L1)))) { if (match(R1, m_NonPositive()) && - isImpliedCondMatchingOperands(LPred, RPred) == false) + ICmpInst::isImpliedByMatchingCmp(LPred, RPred) == false) return false; } @@ -9509,7 +9496,7 @@ isImpliedCondICmps(const ICmpInst *LHS, CmpPredicate RPred, const Value *R0, CmpPredicate::getMatching(LPred, ICmpInst::ICMP_SLE)) && match(R0, m_NSWSub(m_Specific(L0), m_Specific(L1)))) { if (match(R1, m_NonNegative()) && - isImpliedCondMatchingOperands(LPred, RPred) == true) + ICmpInst::isImpliedByMatchingCmp(LPred, RPred) == true) return true; } diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp index 49c148bb68a4d..b8b2c1d7f9a85 100644 --- a/llvm/lib/IR/Instructions.cpp +++ b/llvm/lib/IR/Instructions.cpp @@ -3886,8 +3886,7 @@ bool CmpInst::isFalseWhenEqual(Predicate predicate) { } } -bool ICmpInst::isImpliedTrueByMatchingCmp(CmpPredicate Pred1, - CmpPredicate Pred2) { +static bool isImpliedTrueByMatchingCmp(CmpPredicate Pred1, CmpPredicate Pred2) { // If the predicates match, then we know the first condition implies the // second is true. if (CmpPredicate::getMatching(Pred1, Pred2)) @@ -3901,25 +3900,35 @@ bool ICmpInst::isImpliedTrueByMatchingCmp(CmpPredicate Pred1, switch (Pred1) { default: break; - case ICMP_EQ: + case CmpInst::ICMP_EQ: // A == B implies A >=u B, A <=u B, A >=s B, and A <=s B are true. - return Pred2 == ICMP_UGE || Pred2 == ICMP_ULE || Pred2 == ICMP_SGE || - Pred2 == ICMP_SLE; - case ICMP_UGT: // A >u B implies A != B and A >=u B are true. - return Pred2 == ICMP_NE || Pred2 == ICMP_UGE; - case ICMP_ULT: // A s B implies A != B and A >=s B are true. - return Pred2 == ICMP_NE || Pred2 == ICMP_SGE; - case ICMP_SLT: // A u B implies A != B and A >=u B are true. + return Pred2 == CmpInst::ICMP_NE || Pred2 == CmpInst::ICMP_UGE; + case CmpInst::ICMP_ULT: // A s B implies A != B and A >=s B are true. 
+ return Pred2 == CmpInst::ICMP_NE || Pred2 == CmpInst::ICMP_SGE; + case CmpInst::ICMP_SLT: // A ICmpInst::isImpliedByMatchingCmp(CmpPredicate Pred1, + CmpPredicate Pred2) { + if (isImpliedTrueByMatchingCmp(Pred1, Pred2)) + return true; + if (isImpliedFalseByMatchingCmp(Pred1, Pred2)) + return false; + return std::nullopt; } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Transforms/Scalar/NewGVN.cpp b/llvm/lib/Transforms/Scalar/NewGVN.cpp index 3812e99508f73..b5ce860d73523 100644 --- a/llvm/lib/Transforms/Scalar/NewGVN.cpp +++ b/llvm/lib/Transforms/Scalar/NewGVN.cpp @@ -1964,18 +1964,10 @@ NewGVN::ExprResult NewGVN::performSymbolicCmpEvaluation(Instruction *I) const { if (PBranch->TrueEdge) { // If we know the previous predicate is true and we are in the true // edge then we may be implied true or false. - if (ICmpInst::isImpliedTrueByMatchingCmp(BranchPredicate, - OurPredicate)) { - return ExprResult::some( - createConstantExpression(ConstantInt::getTrue(CI->getType())), - PI); - } - - if (ICmpInst::isImpliedFalseByMatchingCmp(BranchPredicate, - OurPredicate)) { - return ExprResult::some( - createConstantExpression(ConstantInt::getFalse(CI->getType())), - PI); + if (auto R = ICmpInst::isImpliedByMatchingCmp(BranchPredicate, + OurPredicate)) { + auto *C = ConstantInt::getBool(CI->getType(), *R); + return ExprResult::some(createConstantExpression(C), PI); } } else { // Just handle the ne and eq cases, where if we have the same diff --git a/llvm/test/Analysis/ValueTracking/implied-condition-samesign.ll b/llvm/test/Analysis/ValueTracking/implied-condition-samesign.ll index 35cfadaa2965a..0e6db403512ae 100644 --- a/llvm/test/Analysis/ValueTracking/implied-condition-samesign.ll +++ b/llvm/test/Analysis/ValueTracking/implied-condition-samesign.ll @@ -126,6 +126,19 @@ define i1 @sgt_implies_ge_via_assume(i32 %i, i32 %j) { ret i1 %i.ge.j } +define i1 @sgt_implies_false_le_via_assume(i32 %i, i32 %j) { +; CHECK-LABEL: define i1 @sgt_implies_false_le_via_assume( +; CHECK-SAME: i32 [[I:%.*]], i32 [[J:%.*]]) { +; CHECK-NEXT: [[I_SGT_J:%.*]] = icmp sgt i32 [[I]], [[J]] +; CHECK-NEXT: call void @llvm.assume(i1 [[I_SGT_J]]) +; CHECK-NEXT: ret i1 false +; + %i.sgt.j = icmp sgt i32 %i, %j + call void @llvm.assume(i1 %i.sgt.j) + %i.le.j = icmp samesign ule i32 %i, %j + ret i1 %i.le.j +} + define i32 @gt_implies_sge_dominating(i32 %a, i32 %len) { ; CHECK-LABEL: define i32 @gt_implies_sge_dominating( ; CHECK-SAME: i32 [[A:%.*]], i32 [[LEN:%.*]]) { @@ -150,6 +163,30 @@ end: ret i32 -1 } +define i32 @gt_implies_false_sle_dominating(i32 %a, i32 %len) { +; CHECK-LABEL: define i32 @gt_implies_false_sle_dominating( +; CHECK-SAME: i32 [[A:%.*]], i32 [[LEN:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[A_GT_LEN:%.*]] = icmp samesign ugt i32 [[A]], [[LEN]] +; CHECK-NEXT: br i1 [[A_GT_LEN]], label %[[TAKEN:.*]], label %[[END:.*]] +; CHECK: [[TAKEN]]: +; CHECK-NEXT: ret i32 0 +; CHECK: [[END]]: +; CHECK-NEXT: ret i32 -1 +; +entry: + %a.gt.len = icmp samesign ugt i32 %a, %len + br i1 %a.gt.len, label %taken, label %end + +taken: + %a.sle.len = icmp sle i32 %a, %len + %res = select i1 %a.sle.len, i32 30, i32 0 + ret i32 %res + +end: + ret i32 -1 +} + define i32 @gt_implies_sge_dominating_cr(i32 %a, i32 %len) { ; CHECK-LABEL: define i32 @gt_implies_sge_dominating_cr( ; CHECK-SAME: i32 [[A:%.*]], i32 [[LEN:%.*]]) { From 0164480b4df13e09691fc273f0ae5ecd9416e6bc Mon Sep 17 00:00:00 2001 From: David Spickett Date: Mon, 13 Jan 2025 16:31:01 +0000 Subject: [PATCH 
080/102] [llvm][Docs] Add new LLDB Python guidance to release notes (#122719) As decided in https://discourse.llvm.org/t/rfc-lets-document-and-enforce-a-minimum-python-version-for-lldb/82731 and implemented by https://github.com/llvm/llvm-project/pull/114807. --- llvm/docs/ReleaseNotes.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md index d1032138a9db0..a9d9e5fc7ace4 100644 --- a/llvm/docs/ReleaseNotes.md +++ b/llvm/docs/ReleaseNotes.md @@ -379,6 +379,10 @@ Changes to the LLVM tools Changes to LLDB --------------------------------- +* It is now recommended that LLDB be built with Python >= 3.8, but no changes + have been made to the supported Python versions. The next release, LLDB 21, + will require Python >= 3.8. + * LLDB now supports inline diagnostics for the expression evaluator and command line parser. Old: From 86f6a941a44d512e1be313cc8429e659cbea5e8a Mon Sep 17 00:00:00 2001 From: CarolineConcatto Date: Mon, 13 Jan 2025 16:34:33 +0000 Subject: [PATCH 081/102] [Clang][LLVM][AArch64] Add new feature SSVE-BitPerm (#121947) The 2024-12 ISA update release adds a new feature: FEAT_SSVE_BitPerm, which allows the sve-bitperm instructions to run in streaming mode. It also removes the requirement of FEAT_SVE2 for FEAT_SVE_BitPerm. The sve2-bitperm feature is now an alias for sve-bitperm and sve2. A new feature flag sve-bitperm is added to reflect the change that the instructions under FEAT_SVE_BitPerm are supported: in non-streaming mode with FEAT_SVE2 and FEAT_SVE_BitPerm, or in streaming mode with FEAT_SME and FEAT_SSVE_BitPerm. --- clang/include/clang/Basic/arm_sve.td | 2 +- clang/lib/Basic/Targets/AArch64.cpp | 10 +- clang/lib/Basic/Targets/AArch64.h | 2 +- clang/test/CodeGen/AArch64/fmv-dependencies.c | 2 +- .../AArch64/sve2-intrinsics/acle_sve2_bdep.c | 8 +- .../AArch64/sve2-intrinsics/acle_sve2_bext.c | 8 +- .../AArch64/sve2-intrinsics/acle_sve2_bgrp.c | 8 +- clang/test/CodeGen/AArch64/targetattr.c | 2 +- .../Driver/aarch64-implied-sme-features.c | 5 +- .../Driver/aarch64-implied-sve-features.c | 14 ++- .../print-supported-extensions-aarch64.c | 4 +- .../Preprocessor/aarch64-target-features.c | 7 +- .../acle_sve2_aes_bitperm_sha3_sm4.cpp | 96 +++++++++---------- llvm/lib/Target/AArch64/AArch64.td | 4 +- llvm/lib/Target/AArch64/AArch64Features.td | 11 ++- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 8 +- llvm/lib/Target/AArch64/AArch64Processors.td | 36 +++---- .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 4 +- .../AArch64/AsmParser/AArch64AsmParser.cpp | 6 +- llvm/lib/TargetParser/AArch64TargetParser.cpp | 5 + llvm/test/MC/AArch64/SVE2/bdep-diagnostics.s | 2 +- llvm/test/MC/AArch64/SVE2/bdep.s | 18 ++-- llvm/test/MC/AArch64/SVE2/bext.s | 18 ++-- llvm/test/MC/AArch64/SVE2/bgrp.s | 18 ++-- .../MC/AArch64/SVE2/directive-arch-negative.s | 12 ++- llvm/test/MC/AArch64/SVE2/directive-arch.s | 6 +- .../SVE2/directive-arch_extension-negative.s | 8 +- .../AArch64/SVE2/directive-arch_extension.s | 2 +- .../MC/AArch64/SVE2/directive-cpu-negative.s | 12 ++- llvm/test/MC/AArch64/SVE2/directive-cpu.s | 6 +- .../TargetParser/TargetParserTest.cpp | 40 +++++--- 31 files changed, 232 insertions(+), 152 deletions(-) diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td index 1c6bdb8cad2d1..47f1754aeb629 100644 --- a/clang/include/clang/Basic/arm_sve.td +++ b/clang/include/clang/Basic/arm_sve.td @@ -1988,7 +1988,7 @@ def SVSM4E : SInst<"svsm4e[_{d}]", "ddd", "Ui", MergeNone, "aarch64_sve_sm def SVSM4EKEY
: SInst<"svsm4ekey[_{d}]", "ddd", "Ui", MergeNone, "aarch64_sve_sm4ekey", [IsOverloadNone]>; } -let SVETargetGuard = "sve2-bitperm", SMETargetGuard = InvalidMode in { +let SVETargetGuard = "sve2,sve-bitperm", SMETargetGuard = InvalidMode in { def SVBDEP : SInst<"svbdep[_{d}]", "ddd", "UcUsUiUl", MergeNone, "aarch64_sve_bdep_x">; def SVBDEP_N : SInst<"svbdep[_n_{d}]", "dda", "UcUsUiUl", MergeNone, "aarch64_sve_bdep_x">; def SVBEXT : SInst<"svbext[_{d}]", "ddd", "UcUsUiUl", MergeNone, "aarch64_sve_bext_x">; diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp index 1bf58661d0efc..4e211deb9faba 100644 --- a/clang/lib/Basic/Targets/AArch64.cpp +++ b/clang/lib/Basic/Targets/AArch64.cpp @@ -485,7 +485,7 @@ void AArch64TargetInfo::getTargetDefines(const LangOptions &Opts, if (HasSVE2 && HasSVEAES) Builder.defineMacro("__ARM_FEATURE_SVE2_AES", "1"); - if (HasSVE2 && HasSVE2BitPerm) + if (HasSVE2 && HasSVEBitPerm) Builder.defineMacro("__ARM_FEATURE_SVE2_BITPERM", "1"); if (HasSVE2 && HasSVE2SHA3) @@ -769,7 +769,7 @@ bool AArch64TargetInfo::hasFeature(StringRef Feature) const { .Case("f64mm", FPU & SveMode && HasMatmulFP64) .Case("sve2", FPU & SveMode && HasSVE2) .Case("sve-aes", HasSVEAES) - .Case("sve2-bitperm", FPU & SveMode && HasSVE2BitPerm) + .Case("sve-bitperm", FPU & HasSVEBitPerm) .Case("sve2-sha3", FPU & SveMode && HasSVE2SHA3) .Case("sve2-sm4", FPU & SveMode && HasSVE2SM4) .Case("sve2p1", FPU & SveMode && HasSVE2p1) @@ -881,12 +881,10 @@ bool AArch64TargetInfo::handleTargetFeatures(std::vector &Features, } if (Feature == "+sve-b16b16") HasSVEB16B16 = true; - if (Feature == "+sve2-bitperm") { + if (Feature == "+sve-bitperm") { FPU |= NeonMode; - FPU |= SveMode; HasFullFP16 = true; - HasSVE2 = true; - HasSVE2BitPerm = true; + HasSVEBitPerm = true; } if (Feature == "+f32mm") { FPU |= NeonMode; diff --git a/clang/lib/Basic/Targets/AArch64.h b/clang/lib/Basic/Targets/AArch64.h index cedf3286806ac..ecf80b23a508c 100644 --- a/clang/lib/Basic/Targets/AArch64.h +++ b/clang/lib/Basic/Targets/AArch64.h @@ -82,7 +82,7 @@ class LLVM_LIBRARY_VISIBILITY AArch64TargetInfo : public TargetInfo { bool HasSVE2SHA3 = false; bool HasSVE2SM4 = false; bool HasSVEB16B16 = false; - bool HasSVE2BitPerm = false; + bool HasSVEBitPerm = false; bool HasMatmulFP64 = false; bool HasMatmulFP32 = false; bool HasLSE = false; diff --git a/clang/test/CodeGen/AArch64/fmv-dependencies.c b/clang/test/CodeGen/AArch64/fmv-dependencies.c index 097b85e989d86..8dda3b647fcd0 100644 --- a/clang/test/CodeGen/AArch64/fmv-dependencies.c +++ b/clang/test/CodeGen/AArch64/fmv-dependencies.c @@ -192,7 +192,7 @@ int caller() { // CHECK: attributes #[[sve]] = { {{.*}} "target-features"="+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+v8a" // CHECK: attributes #[[sve2]] = { {{.*}} "target-features"="+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+sve2,+v8a" // CHECK: attributes #[[sve2_aes]] = { {{.*}} "target-features"="+aes,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+sve-aes,+sve2,+sve2-aes,+v8a" -// CHECK: attributes #[[sve2_bitperm]] = { {{.*}} "target-features"="+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+sve2,+sve2-bitperm,+v8a" +// CHECK: attributes #[[sve2_bitperm]] = { {{.*}} "target-features"="+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+sve-bitperm,+sve2,+sve2-bitperm,+v8a" // CHECK: attributes #[[sve2_sha3]] = { {{.*}} "target-features"="+fp-armv8,+fullfp16,+neon,+outline-atomics,+sha2,+sha3,+sve,+sve2,+sve2-sha3,+v8a" // CHECK: attributes #[[sve2_sm4]] = { {{.*}} 
"target-features"="+fp-armv8,+fullfp16,+neon,+outline-atomics,+sm4,+sve,+sve2,+sve2-sm4,+v8a" // CHECK: attributes #[[wfxt]] = { {{.*}} "target-features"="+fp-armv8,+neon,+outline-atomics,+v8a,+wfxt" diff --git a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_bdep.c b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_bdep.c index d7c070d412a8f..d4681394a0508 100644 --- a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_bdep.c +++ b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_bdep.c @@ -1,10 +1,10 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-bitperm -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-bitperm -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-bitperm -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-bitperm -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-bitperm -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-bitperm -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-bitperm -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-bitperm -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK #include diff --git a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_bext.c b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_bext.c index 30b798e21f7a1..6d654b9353e7a 100644 --- a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_bext.c +++ b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_bext.c @@ -1,10 +1,10 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-bitperm -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-bitperm -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-bitperm -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 
-target-feature +sve2-bitperm -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-bitperm -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-bitperm -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-bitperm -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-bitperm -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK #include diff --git a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_bgrp.c b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_bgrp.c index 58445c6b810c7..a98d8e8a2b37c 100644 --- a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_bgrp.c +++ b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_bgrp.c @@ -1,10 +1,10 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-bitperm -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-bitperm -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-bitperm -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-bitperm -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-bitperm -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-bitperm -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-bitperm -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-bitperm -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK #include diff --git a/clang/test/CodeGen/AArch64/targetattr.c b/clang/test/CodeGen/AArch64/targetattr.c index ee7a07244ef9a..f8d5f9912c0d7 100644 --- a/clang/test/CodeGen/AArch64/targetattr.c +++ b/clang/test/CodeGen/AArch64/targetattr.c @@ -204,7 +204,7 @@ void applem4() {} // CHECK: attributes #[[ATTR1]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" 
"target-features"="+crc,+fp-armv8,+fullfp16,+lse,+neon,+ras,+rdm,+sve,+v8.1a,+v8.2a,+v8a" } // CHECK: attributes #[[ATTR2]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+crc,+fp-armv8,+fullfp16,+lse,+neon,+ras,+rdm,+sve,+sve2,+v8.1a,+v8.2a,+v8a" } // CHECK: attributes #[[ATTR3]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+bti,+ccidx,+complxnum,+crc,+dit,+dotprod,+flagm,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+neon,+pauth,+predres,+ras,+rcpc,+rdm,+sb,+ssbs,+sve,+sve2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8.6a,+v8a" } -// CHECK: attributes #[[ATTR4]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a710" "target-features"="+bf16,+bti,+ccidx,+complxnum,+crc,+dit,+dotprod,+ete,+flagm,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+mte,+neon,+pauth,+perfmon,+predres,+ras,+rcpc,+rdm,+sb,+ssbs,+sve,+sve2,+sve2-bitperm,+trbe,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a,+v9a" } +// CHECK: attributes #[[ATTR4]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a710" "target-features"="+bf16,+bti,+ccidx,+complxnum,+crc,+dit,+dotprod,+ete,+flagm,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+mte,+neon,+pauth,+perfmon,+predres,+ras,+rcpc,+rdm,+sb,+ssbs,+sve,+sve-bitperm,+sve2,+trbe,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a,+v9a" } // CHECK: attributes #[[ATTR5]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "tune-cpu"="cortex-a710" } // CHECK: attributes #[[ATTR6]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+ete,+fp-armv8,+neon,+trbe,+v8a" } // CHECK: attributes #[[ATTR7]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "tune-cpu"="generic" } diff --git a/clang/test/Driver/aarch64-implied-sme-features.c b/clang/test/Driver/aarch64-implied-sme-features.c index 4d507c0e99dd9..23ec27ff1aaff 100644 --- a/clang/test/Driver/aarch64-implied-sme-features.c +++ b/clang/test/Driver/aarch64-implied-sme-features.c @@ -51,4 +51,7 @@ // SME-SUBFEATURE-CONFLICT-REV: "-target-feature" "+bf16"{{.*}} "-target-feature" "+sme" "-target-feature" "+sme-i16i64" // RUN: %clang --target=aarch64-linux-gnu -march=armv8-a+ssve-aes %s -### 2>&1 | FileCheck %s --check-prefix=SVE-AES -// SVE-AES: "-target-feature" "+sme" "-target-feature" "+sme2" "-target-feature" "+ssve-aes" "-target-feature" "+sve-aes" \ No newline at end of file +// SVE-AES: "-target-feature" "+sme" "-target-feature" "+sme2" "-target-feature" "+ssve-aes" "-target-feature" "+sve-aes" + ++// RUN: %clang --target=aarch64-linux-gnu -march=armv8-a+ssve-bitperm %s -### 2>&1 | FileCheck %s --check-prefix=SVE-BITPERM ++// SVE-BITPERM: "-target-feature" "+sme" "-target-feature" "+sme2" "-target-feature" "+ssve-bitperm" "-target-feature" "+sve-bitperm" diff --git a/clang/test/Driver/aarch64-implied-sve-features.c b/clang/test/Driver/aarch64-implied-sve-features.c index e5f1e55345414..ecc1e9500b667 100644 --- a/clang/test/Driver/aarch64-implied-sve-features.c +++ b/clang/test/Driver/aarch64-implied-sve-features.c @@ -23,17 +23,24 @@ // RUN: %clang --target=aarch64-linux-gnu -march=armv8-a+sve+sve2 %s -### 2>&1 | FileCheck %s --check-prefix=SVE-SVE2 // SVE-SVE2: "-target-feature" "+sve" "-target-feature" "+sve2" +// RUN: %clang 
--target=aarch64-linux-gnu -march=armv8-a+sve-bitperm %s -### 2>&1 | FileCheck %s --check-prefix=SVE-BITPERM +// SVE-BITPERM: "-target-feature" "+sve-bitperm" + // RUN: %clang --target=aarch64-linux-gnu -march=armv8-a+sve2-bitperm %s -### 2>&1 | FileCheck %s --check-prefix=SVE2-BITPERM -// SVE2-BITPERM: "-target-feature" "+sve" "-target-feature" "+sve2" "-target-feature" "+sve2-bitperm" +// SVE2-BITPERM: "-target-feature" "+sve" "-target-feature" "+sve-bitperm" "-target-feature" "+sve2" "-target-feature" "+sve2-bitperm" // RUN: %clang --target=aarch64-linux-gnu -march=armv8-a+nosve2-bitperm %s -### 2>&1 | FileCheck %s --check-prefix=NOSVE2-BITPERM +// NOSVE2-BITPERM-NOT: "-target-feature" "+sve-bitperm" // NOSVE2-BITPERM-NOT: "-target-feature" "+sve2-bitperm" // NOSVE2-BITPERM-NOT: "-target-feature" "+sve2" // NOSVE2-BITPERM-NOT: "-target-feature" "+sve" // NOSVE2-BITPERM-NOT: sve2-bitperm" +// RUN: %clang --target=aarch64-linux-gnu -march=armv8-a+sve-bitperm+nosve-bitperm %s -### 2>&1 | FileCheck %s --check-prefix=SVE-BITPERM-REVERT +// SVE-BITPERM-REVERT: "-target-feature" "-sve-bitperm" + // RUN: %clang --target=aarch64-linux-gnu -march=armv8-a+sve2-bitperm+nosve2-bitperm %s -### 2>&1 | FileCheck %s --check-prefix=SVE2-BITPERM-REVERT -// SVE2-BITPERM-REVERT: "-target-feature" "+sve" "-target-feature" "+sve2" "-target-feature" "-sve2-bitperm" +// SVE2-BITPERM-REVERT: "-target-feature" "+sve" "-target-feature" "-sve-bitperm" "-target-feature" "-sve2" "-target-feature" "-sve2-bitperm" // RUN: %clang --target=aarch64-linux-gnu -march=armv8-a+sve2-aes+nosve2-aes %s -### 2>&1 | FileCheck %s --check-prefix=SVE2-AES-REVERT // SVE2-AES-REVERT: "-target-feature" "+sve" "-target-feature" "-sve-aes" "-target-feature" "+sve2" "-target-feature" "-sve2-aes" @@ -57,7 +64,7 @@ // SVE2-SM4: "-target-feature" "+sve" "-target-feature" "+sve2" "-target-feature" "+sve2-sm4" // RUN: %clang --target=aarch64-linux-gnu -march=armv8-a+sve2-bitperm+nosve2-aes %s -### 2>&1 | FileCheck %s --check-prefix=SVE2-SUBFEATURE-MIX -// SVE2-SUBFEATURE-MIX: "-target-feature" "+sve" "-target-feature" "+sve2" "-target-feature" "+sve2-bitperm" +// SVE2-SUBFEATURE-MIX: "-target-feature" "+sve" "-target-feature" "+sve-bitperm" "-target-feature" "+sve2" "-target-feature" "+sve2-bitperm" // SVE2-SUBFEATURE-NOT: sve2-aes // RUN: %clang --target=aarch64-linux-gnu -march=armv8-a+sve2-sm4+nosve2 %s -### 2>&1 | FileCheck %s --check-prefix=SVE2-SUBFEATURE-CONFLICT @@ -72,6 +79,7 @@ // SVE-SUBFEATURE-CONFLICT-REV: "-target-feature" "+sve" "-target-feature" "+sve-aes" "-target-feature" "+sve2" "-target-feature" "+sve2-aes" // RUN: %clang --target=aarch64-linux-gnu -mcpu=neoverse-n2+nosve2 %s -### 2>&1 | FileCheck %s --check-prefix=SVE-MCPU-FEATURES +// SVE-MCPU-FEATURES-NOT: "-target-feature" "+sve-bitperm" // SVE-MCPU-FEATURES-NOT: "-target-feature" "+sve2-bitperm" // SVE-MCPU-FEATURES-NOT: "-target-feature" "+sve2" // SVE-MCPU-FEATURES: "-target-feature" "+sve" diff --git a/clang/test/Driver/print-supported-extensions-aarch64.c b/clang/test/Driver/print-supported-extensions-aarch64.c index 09d499548aa56..75aa1a3aeecdd 100644 --- a/clang/test/Driver/print-supported-extensions-aarch64.c +++ b/clang/test/Driver/print-supported-extensions-aarch64.c @@ -78,6 +78,7 @@ // CHECK-NEXT: predres2 FEAT_SPECRES2 Enable Speculation Restriction Instruction // CHECK-NEXT: ssbs FEAT_SSBS, FEAT_SSBS2 Enable Speculative Store Bypass Safe bit // CHECK-NEXT: ssve-aes FEAT_SSVE_AES Enable Armv9.6-A SVE AES support in streaming SVE mode +// CHECK-NEXT: 
ssve-bitperm FEAT_SSVE_BitPerm Enable Armv9.6-A SVE BitPerm support in streaming SVE mode // CHECK-NEXT: ssve-fp8dot2 FEAT_SSVE_FP8DOT2 Enable SVE2 FP8 2-way dot product instructions // CHECK-NEXT: ssve-fp8dot4 FEAT_SSVE_FP8DOT4 Enable SVE2 FP8 4-way dot product instructions // CHECK-NEXT: ssve-fp8fma FEAT_SSVE_FP8FMA Enable SVE2 FP8 multiply-add instructions @@ -86,10 +87,11 @@ // CHECK-NEXT: sve-aes2 FEAT_SVE_AES2 Enable Armv9.6-A SVE multi-vector AES and multi-vector quadword polynomial multiply instructions // CHECK-NEXT: sve-b16b16 FEAT_SVE_B16B16 Enable SVE2 non-widening and SME2 Z-targeting non-widening BFloat16 instructions // CHECK-NEXT: sve-bfscale FEAT_SVE_BFSCALE Enable Armv9.6-A SVE BFloat16 scaling instructions +// CHECK-NEXT: sve-bitperm FEAT_SVE_BitPerm Enable bit permutation SVE2 instructions // CHECK-NEXT: sve-f16f32mm FEAT_SVE_F16F32MM Enable Armv9.6-A FP16 to FP32 Matrix Multiply // CHECK-NEXT: sve2 FEAT_SVE2 Enable Scalable Vector Extension 2 (SVE2) instructions // CHECK-NEXT: sve2-aes Shorthand for +sve2+sve-aes -// CHECK-NEXT: sve2-bitperm FEAT_SVE_BitPerm Enable bit permutation SVE2 instructions +// CHECK-NEXT: sve2-bitperm Shorthand for +sve2+sve-bitperm // CHECK-NEXT: sve2-sha3 FEAT_SVE_SHA3 Enable SHA3 SVE2 instructions // CHECK-NEXT: sve2-sm4 FEAT_SVE_SM4 Enable SM4 SVE2 instructions // CHECK-NEXT: sve2p1 FEAT_SVE2p1 Enable Scalable Vector Extension 2.1 instructions diff --git a/clang/test/Preprocessor/aarch64-target-features.c b/clang/test/Preprocessor/aarch64-target-features.c index 86265f630296c..b10c55447d9af 100644 --- a/clang/test/Preprocessor/aarch64-target-features.c +++ b/clang/test/Preprocessor/aarch64-target-features.c @@ -246,7 +246,12 @@ // CHECK-SVE2SHA3: __ARM_FEATURE_SVE2_SHA3 1 // RUN: %clang -target aarch64-none-linux-gnu -march=armv9-a+sve2-sm4 -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SVE2SM4 %s // CHECK-SVE2SM4: __ARM_FEATURE_SVE2_SM4 1 -// RUN: %clang -target aarch64-none-linux-gnu -march=armv9-a+sve2-bitperm -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SVE2BITPERM %s +// RUN: %clang -target aarch64-none-linux-gnu -march=armv9-a+sve-bitperm -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SVEBITPERM %s +// CHECK-SVEBITPERM: __ARM_FEATURE_SVE2_BITPERM 1 + +// RUN: %clang -target aarch64-none-linux-gnu -march=armv8-a+sve2-bitperm -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SVE2BITPERM %s +// RUN: %clang -target aarch64-none-linux-gnu -march=armv8-a+sve-bitperm+sve2 -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SVE2BITPERM %s +// CHECK-SVE2BITPERM: __ARM_FEATURE_SVE2 1 // CHECK-SVE2BITPERM: __ARM_FEATURE_SVE2_BITPERM 1 // RUN: %clang -target aarch64-none-linux-gnu -march=armv9-a+sve2p1 -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SVE2p1 %s diff --git a/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_aes_bitperm_sha3_sm4.cpp b/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_aes_bitperm_sha3_sm4.cpp index 93d4b00701693..985ea15ac2a4e 100644 --- a/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_aes_bitperm_sha3_sm4.cpp +++ b/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_aes_bitperm_sha3_sm4.cpp @@ -26,61 +26,61 @@ void test(uint8_t u8, uint16_t u16, uint32_t u32, uint64_t u64) // expected-error@+2 {{'svaesmc_u8' needs target feature sve,sve2,sve-aes}} // overload-error@+1 {{'svaesmc' needs target feature sve,sve2,sve-aes}} SVE_ACLE_FUNC(svaesmc,_u8,,)(svundef_u8()); - // expected-error@+2 {{'svbdep_u8' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbdep' needs target 
feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbdep_u8' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbdep' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbdep,_u8,,)(svundef_u8(), svundef_u8()); - // expected-error@+2 {{'svbdep_n_u8' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbdep' needs target feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbdep_n_u8' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbdep' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbdep,_n_u8,,)(svundef_u8(), u8); - // expected-error@+2 {{'svbext_u8' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbext' needs target feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbext_u8' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbext' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbext,_u8,,)(svundef_u8(), svundef_u8()); - // expected-error@+2 {{'svbext_n_u8' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbext' needs target feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbext_n_u8' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbext' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbext,_n_u8,,)(svundef_u8(), u8); - // expected-error@+2 {{'svbgrp_u8' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbgrp' needs target feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbgrp_u8' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbgrp' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbgrp,_u8,,)(svundef_u8(), svundef_u8()); - // expected-error@+2 {{'svbgrp_n_u8' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbgrp' needs target feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbgrp_n_u8' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbgrp' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbgrp,_n_u8,,)(svundef_u8(), u8); - // expected-error@+2 {{'svbdep_u16' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbdep' needs target feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbdep_u16' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbdep' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbdep,_u16,,)(svundef_u16(), svundef_u16()); - // expected-error@+2 {{'svbdep_n_u16' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbdep' needs target feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbdep_n_u16' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbdep' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbdep,_n_u16,,)(svundef_u16(), u16); - // expected-error@+2 {{'svbext_u16' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbext' needs target feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbext_u16' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbext' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbext,_u16,,)(svundef_u16(), svundef_u16()); - // expected-error@+2 {{'svbext_n_u16' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbext' needs target feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbext_n_u16' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbext' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbext,_n_u16,,)(svundef_u16(), u16); - // 
expected-error@+2 {{'svbgrp_u16' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbgrp' needs target feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbgrp_u16' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbgrp' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbgrp,_u16,,)(svundef_u16(), svundef_u16()); - // expected-error@+2 {{'svbgrp_n_u16' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbgrp' needs target feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbgrp_n_u16' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbgrp' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbgrp,_n_u16,,)(svundef_u16(), u16); - // expected-error@+2 {{'svbdep_u32' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbdep' needs target feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbdep_u32' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbdep' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbdep,_u32,,)(svundef_u32(), svundef_u32()); - // expected-error@+2 {{'svbdep_n_u32' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbdep' needs target feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbdep_n_u32' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbdep' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbdep,_n_u32,,)(svundef_u32(), u32); - // expected-error@+2 {{'svbext_u32' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbext' needs target feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbext_u32' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbext' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbext,_u32,,)(svundef_u32(), svundef_u32()); - // expected-error@+2 {{'svbext_n_u32' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbext' needs target feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbext_n_u32' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbext' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbext,_n_u32,,)(svundef_u32(), u32); - // expected-error@+2 {{'svbgrp_u32' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbgrp' needs target feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbgrp_u32' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbgrp' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbgrp,_u32,,)(svundef_u32(), svundef_u32()); - // expected-error@+2 {{'svbgrp_n_u32' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbgrp' needs target feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbgrp_n_u32' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbgrp' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbgrp,_n_u32,,)(svundef_u32(), u32); // expected-error@+2 {{'svsm4e_u32' needs target feature sve,sve2-sm4}} // overload-error@+1 {{'svsm4e' needs target feature sve,sve2-sm4}} @@ -89,23 +89,23 @@ void test(uint8_t u8, uint16_t u16, uint32_t u32, uint64_t u64) // overload-error@+1 {{'svsm4ekey' needs target feature sve,sve2-sm4}} SVE_ACLE_FUNC(svsm4ekey,_u32,,)(svundef_u32(), svundef_u32()); - // expected-error@+2 {{'svbdep_u64' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbdep' needs target feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbdep_u64' needs target feature sve,sve2,sve-bitperm}} + // 
overload-error@+1 {{'svbdep' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbdep,_u64,,)(svundef_u64(), svundef_u64()); - // expected-error@+2 {{'svbdep_n_u64' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbdep' needs target feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbdep_n_u64' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbdep' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbdep,_n_u64,,)(svundef_u64(), u64); - // expected-error@+2 {{'svbext_u64' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbext' needs target feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbext_u64' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbext' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbext,_u64,,)(svundef_u64(), svundef_u64()); - // expected-error@+2 {{'svbext_n_u64' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbext' needs target feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbext_n_u64' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbext' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbext,_n_u64,,)(svundef_u64(), u64); - // expected-error@+2 {{'svbgrp_u64' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbgrp' needs target feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbgrp_u64' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbgrp' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbgrp,_u64,,)(svundef_u64(), svundef_u64()); - // expected-error@+2 {{'svbgrp_n_u64' needs target feature sve,sve2-bitperm}} - // overload-error@+1 {{'svbgrp' needs target feature sve,sve2-bitperm}} + // expected-error@+2 {{'svbgrp_n_u64' needs target feature sve,sve2,sve-bitperm}} + // overload-error@+1 {{'svbgrp' needs target feature sve,sve2,sve-bitperm}} SVE_ACLE_FUNC(svbgrp,_n_u64,,)(svundef_u64(), u64); // expected-error@+2 {{'svpmullb_pair_u64' needs target feature sve,sve2,sve-aes}} // overload-error@+1 {{'svpmullb_pair' needs target feature sve,sve2,sve-aes}} diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td index e3dd334e7b098..20e77b3be2a27 100644 --- a/llvm/lib/Target/AArch64/AArch64.td +++ b/llvm/lib/Target/AArch64/AArch64.td @@ -63,7 +63,7 @@ def SVE2p1Unsupported : AArch64Unsupported; def SVE2Unsupported : AArch64Unsupported { let F = !listconcat([HasSVE2, HasSVE2orSME, HasSVE2orSME2, HasSSVE_FP8FMA, HasSMEF8F16, - HasSMEF8F32, HasSVEAES, HasSVE2SHA3, HasSVE2SM4, HasSVE2BitPerm, + HasSMEF8F32, HasSVEAES, HasSVE2SHA3, HasSVE2SM4, HasSVEBitPerm, HasSVEB16B16], SVE2p1Unsupported.F); } @@ -74,7 +74,7 @@ def SVEUnsupported : AArch64Unsupported { } let F = [HasSME2p2, HasSVE2p2orSME2p2, HasNonStreamingSVEorSME2p2, - HasNonStreamingSVE2p2orSME2p2] in + HasNonStreamingSVE2p2orSME2p2, HasNonStreamingSVE2orSSVE_BitPerm] in def SME2p2Unsupported : AArch64Unsupported; def SME2p1Unsupported : AArch64Unsupported { diff --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td index 5a233e2d870b3..76405750db640 100644 --- a/llvm/lib/Target/AArch64/AArch64Features.td +++ b/llvm/lib/Target/AArch64/AArch64Features.td @@ -376,9 +376,11 @@ def FeatureSVE2SM4 : ExtensionWithMArch<"sve2-sm4", "SVE2SM4", "FEAT_SVE_SM4", def FeatureSVE2SHA3 : ExtensionWithMArch<"sve2-sha3", "SVE2SHA3", "FEAT_SVE_SHA3", "Enable SHA3 SVE2 instructions", [FeatureSVE2, FeatureSHA3]>; -def FeatureSVE2BitPerm : 
ExtensionWithMArch<"sve2-bitperm", "SVE2BitPerm", - "FEAT_SVE_BitPerm", - "Enable bit permutation SVE2 instructions", [FeatureSVE2]>; +def FeatureSVEBitPerm : ExtensionWithMArch<"sve-bitperm", "SVEBitPerm", + "FEAT_SVE_BitPerm", "Enable bit permutation SVE2 instructions">; + +def FeatureAliasSVE2BitPerm : ExtensionWithMArch<"sve2-bitperm", "SVE2BitPerm", + "", "Shorthand for +sve2+sve-bitperm", [FeatureSVE2, FeatureSVEBitPerm]>; def FeatureTRBE : Extension<"trbe", "TRBE", "FEAT_TRBE", "Enable Trace Buffer Extension">; @@ -565,6 +567,9 @@ def FeaturePCDPHINT: ExtensionWithMArch<"pcdphint", "PCDPHINT", "FEAT_PCDPHINT", def FeaturePoPS: ExtensionWithMArch<"pops", "PoPS", "FEAT_PoPS", "Enable Armv9.6-A Point Of Physical Storage (PoPS) DC instructions">; +def FeatureSSVE_BitPerm : ExtensionWithMArch<"ssve-bitperm", "SSVE_BitPerm", "FEAT_SSVE_BitPerm", + "Enable Armv9.6-A SVE BitPerm support in streaming SVE mode", [FeatureSME2, FeatureSVEBitPerm]>; + //===----------------------------------------------------------------------===// // Other Features //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index c6f5cdcd1d5fe..948701f897855 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -155,8 +155,8 @@ def HasSVE2SM4 : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasS AssemblerPredicateWithAll<(all_of FeatureSVE2SM4), "sve2-sm4">; def HasSVE2SHA3 : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVE2SHA3()">, AssemblerPredicateWithAll<(all_of FeatureSVE2SHA3), "sve2-sha3">; -def HasSVE2BitPerm : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVE2BitPerm()">, - AssemblerPredicateWithAll<(all_of FeatureSVE2BitPerm), "sve2-bitperm">; +def HasSVEBitPerm : Predicate<"Subtarget->hasSVEBitPerm()">, + AssemblerPredicateWithAll<(all_of FeatureSVEBitPerm), "sve-bitperm">; def HasSMEandIsNonStreamingSafe : Predicate<"Subtarget->hasSME()">, AssemblerPredicateWithAll<(all_of FeatureSME), "sme">; @@ -286,6 +286,10 @@ def HasNonStreamingSVE2p2orSME2p2 "(Subtarget->isSVEorStreamingSVEAvailable() && Subtarget->hasSME2p2())">, AssemblerPredicateWithAll<(any_of FeatureSVE2p2, FeatureSME2p2), "sme2p2 or sve2p2">; +def HasNonStreamingSVE2orSSVE_BitPerm + : Predicate<"(Subtarget->isSVEAvailable() && Subtarget->hasSVE2()) ||" + "(Subtarget->isSVEorStreamingSVEAvailable() && Subtarget->hasSSVE_BitPerm())">, + AssemblerPredicateWithAll<(any_of FeatureSVE2, FeatureSSVE_BitPerm), "sve2 or ssve-bitperm">; // A subset of NEON instructions are legal in Streaming SVE execution mode, // so don't need the additional check for 'isNeonAvailable'. 
diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td index 2da67126a1753..364ab0d82bf88 100644 --- a/llvm/lib/Target/AArch64/AArch64Processors.td +++ b/llvm/lib/Target/AArch64/AArch64Processors.td @@ -694,7 +694,7 @@ def ProcessorFeatures { FeatureLSE, FeatureRAS, FeatureRDM]; list A510 = [HasV9_0aOps, FeatureNEON, FeaturePerfMon, FeatureMatMulInt8, FeatureBF16, FeatureAM, - FeatureMTE, FeatureETE, FeatureSVE2BitPerm, + FeatureMTE, FeatureETE, FeatureSVEBitPerm, FeatureFP16FML, FeatureCCIDX, FeatureSB, FeaturePAuth, FeatureSSBS, FeatureSVE, FeatureSVE2, @@ -702,7 +702,7 @@ def ProcessorFeatures { FeatureFPARMv8,FeatureFullFP16, FeatureJS, FeatureLSE, FeatureRAS, FeatureRCPC, FeatureRDM]; list A520 = [HasV9_2aOps, FeaturePerfMon, FeatureAM, - FeatureMTE, FeatureETE, FeatureSVE2BitPerm, + FeatureMTE, FeatureETE, FeatureSVEBitPerm, FeatureFP16FML, FeatureCCIDX, FeatureSB, FeatureSSBS, FeaturePAuth, FeatureFlagM, FeaturePredRes, @@ -711,7 +711,7 @@ def ProcessorFeatures { FeatureNEON, FeatureLSE, FeatureRAS, FeatureRCPC, FeatureRDM, FeatureDotProd]; list A520AE = [HasV9_2aOps, FeaturePerfMon, FeatureAM, - FeatureMTE, FeatureETE, FeatureSVE2BitPerm, + FeatureMTE, FeatureETE, FeatureSVEBitPerm, FeatureFP16FML, FeatureCCIDX, FeatureSB, FeatureSSBS, FeaturePAuth, FeatureFlagM, FeaturePredRes, @@ -747,14 +747,14 @@ def ProcessorFeatures { list A710 = [HasV9_0aOps, FeatureNEON, FeaturePerfMon, FeatureCCIDX, FeatureSSBS, FeatureETE, FeatureMTE, FeatureFP16FML, - FeatureSVE2BitPerm, FeatureBF16, FeatureMatMulInt8, + FeatureSVEBitPerm, FeatureBF16, FeatureMatMulInt8, FeaturePAuth, FeatureFlagM, FeatureSB, FeatureSVE, FeatureSVE2, FeatureComplxNum, FeatureCRC, FeatureDotProd, FeatureFPARMv8, FeatureFullFP16, FeatureJS, FeatureLSE, FeatureRAS, FeatureRCPC, FeatureRDM]; list A715 = [HasV9_0aOps, FeatureNEON, FeatureMTE, FeatureCCIDX, FeatureFP16FML, FeatureSVE, FeatureTRBE, - FeatureSVE2BitPerm, FeatureBF16, FeatureETE, + FeatureSVEBitPerm, FeatureBF16, FeatureETE, FeaturePerfMon, FeatureMatMulInt8, FeatureSPE, FeatureSB, FeatureSSBS, FeatureFullFP16, FeaturePAuth, FeaturePredRes, FeatureFlagM, FeatureSVE2, FeatureComplxNum, FeatureCRC, @@ -763,7 +763,7 @@ def ProcessorFeatures { FeatureRCPC, FeatureRDM]; list A720 = [HasV9_2aOps, FeatureMTE, FeatureFP16FML, FeatureCCIDX, - FeatureTRBE, FeatureSVE2BitPerm, FeatureETE, + FeatureTRBE, FeatureSVEBitPerm, FeatureETE, FeaturePerfMon, FeatureSPE, FeatureSPE_EEF, FeatureSB, FeatureSSBS, FeaturePAuth, FeatureFlagM, FeaturePredRes, FeatureSVE, FeatureSVE2, FeatureBF16, FeatureComplxNum, FeatureCRC, @@ -772,7 +772,7 @@ def ProcessorFeatures { FeatureRCPC, FeatureRDM]; list A720AE = [HasV9_2aOps, FeatureMTE, FeatureFP16FML, FeatureCCIDX, - FeatureTRBE, FeatureSVE2BitPerm, FeatureETE, + FeatureTRBE, FeatureSVEBitPerm, FeatureETE, FeaturePerfMon, FeatureSPE, FeatureSPE_EEF, FeatureSB, FeatureSSBS, FeaturePAuth, FeatureFlagM, FeaturePredRes, FeatureSVE, FeatureSVE2, FeatureBF16, FeatureComplxNum, FeatureCRC, @@ -782,7 +782,7 @@ def ProcessorFeatures { list A725 = [HasV9_2aOps, FeatureMTE, FeatureFP16FML, FeatureCCIDX, FeatureETE, FeaturePerfMon, FeatureSPE, - FeatureSVE2BitPerm, FeatureSPE_EEF, FeatureTRBE, + FeatureSVEBitPerm, FeatureSPE_EEF, FeatureTRBE, FeatureFlagM, FeaturePredRes, FeatureSB, FeatureSSBS, FeatureSVE, FeatureSVE2, FeatureBF16, FeatureComplxNum, FeatureCRC, FeatureDotProd, FeatureFPARMv8, FeatureFullFP16, FeatureMatMulInt8, @@ -814,7 +814,7 @@ def ProcessorFeatures { FeatureRCPC, FeatureCRC, 
FeatureLSE, FeatureRAS, FeatureRDM]; list X2 = [HasV9_0aOps, FeatureNEON, FeaturePerfMon, FeatureMatMulInt8, FeatureBF16, FeatureAM, - FeatureMTE, FeatureETE, FeatureSVE2BitPerm, + FeatureMTE, FeatureETE, FeatureSVEBitPerm, FeatureFP16FML, FeatureCCIDX, FeaturePAuth, FeatureSSBS, FeatureSB, FeatureSVE, FeatureSVE2, FeatureFlagM, @@ -823,7 +823,7 @@ def ProcessorFeatures { list X3 = [HasV9_0aOps, FeatureSVE, FeatureNEON, FeaturePerfMon, FeatureETE, FeatureTRBE, FeatureSPE, FeatureBF16, FeatureMatMulInt8, - FeatureMTE, FeatureSVE2BitPerm, FeatureFullFP16, + FeatureMTE, FeatureSVEBitPerm, FeatureFullFP16, FeatureFP16FML, FeatureCCIDX, FeatureSB, FeaturePAuth, FeaturePredRes, FeatureFlagM, FeatureSSBS, @@ -831,7 +831,7 @@ def ProcessorFeatures { FeatureLSE, FeatureRAS, FeatureRCPC, FeatureRDM, FeatureDotProd]; list X4 = [HasV9_2aOps, FeaturePerfMon, FeatureETE, FeatureTRBE, - FeatureSPE, FeatureMTE, FeatureSVE2BitPerm, + FeatureSPE, FeatureMTE, FeatureSVEBitPerm, FeatureFP16FML, FeatureSPE_EEF, FeatureCCIDX, FeatureSB, FeatureSSBS, FeaturePAuth, FeatureFlagM, FeaturePredRes, @@ -841,7 +841,7 @@ def ProcessorFeatures { list X925 = [HasV9_2aOps, FeatureMTE, FeatureFP16FML, FeatureCCIDX, FeatureETE, FeaturePerfMon, FeatureSPE, - FeatureSVE2BitPerm, FeatureSPE_EEF, FeatureTRBE, + FeatureSVEBitPerm, FeatureSPE_EEF, FeatureTRBE, FeatureFlagM, FeaturePredRes, FeatureSB, FeatureSSBS, FeatureSVE, FeatureSVE2, FeatureBF16, FeatureComplxNum, FeatureCRC, FeatureDotProd, FeatureFPARMv8, FeatureFullFP16, FeatureMatMulInt8, @@ -855,7 +855,7 @@ def ProcessorFeatures { FeatureFPAC, FeatureFP16FML, FeatureRandGen, FeatureSSBS, FeatureLS64, FeatureCLRBHB, FeatureSPECRES2, FeatureSVEAES, FeatureSVE2SM4, - FeatureSVE2SHA3, FeatureSVE2BitPerm, FeatureETE, + FeatureSVE2SHA3, FeatureSVE2, FeatureSVEBitPerm, FeatureETE, FeatureMEC, FeatureFP8DOT2]; list Carmel = [HasV8_2aOps, FeatureNEON, FeatureSHA2, FeatureAES, FeatureFullFP16, FeatureCRC, FeatureLSE, FeatureRAS, FeatureRDM, @@ -942,7 +942,7 @@ def ProcessorFeatures { FeaturePerfMon, FeatureCRC, FeatureLSE, FeatureRAS, FeatureRDM]; list NeoverseN2 = [HasV9_0aOps, FeatureBF16, FeatureETE, FeatureFP16FML, FeatureMatMulInt8, FeatureMTE, FeatureSVE2, - FeatureSVE2BitPerm, FeatureTRBE, + FeatureSVEBitPerm, FeatureTRBE, FeaturePerfMon, FeatureCCIDX, FeatureDotProd, FeatureFullFP16, FeatureSB, FeatureSSBS, FeatureSVE, @@ -951,7 +951,7 @@ def ProcessorFeatures { list NeoverseN3 = [HasV9_2aOps, FeatureETE, FeatureFP16FML, FeatureFullFP16, FeatureMTE, FeaturePerfMon, FeatureRandGen, FeatureSPE, FeatureSPE_EEF, - FeatureSVE2BitPerm, + FeatureSVEBitPerm, FeatureCCIDX, FeatureSSBS, FeatureSB, FeaturePredRes, FeaturePAuth, FeatureFlagM, FeatureSVE, FeatureSVE2, FeatureBF16, FeatureComplxNum, @@ -978,7 +978,7 @@ def ProcessorFeatures { FeatureRCPC, FeatureRDM]; list NeoverseV2 = [HasV9_0aOps, FeatureBF16, FeatureSPE, FeaturePerfMon, FeatureETE, FeatureMatMulInt8, - FeatureNEON, FeatureSVE2BitPerm, FeatureFP16FML, + FeatureNEON, FeatureSVEBitPerm, FeatureFP16FML, FeatureMTE, FeatureRandGen, FeatureCCIDX, FeatureSVE, FeatureSVE2, FeatureSSBS, FeatureFullFP16, FeatureDotProd, @@ -988,7 +988,7 @@ def ProcessorFeatures { FeatureFullFP16, FeatureLS64, FeatureMTE, FeaturePerfMon, FeatureRandGen, FeatureSPE, FeatureCCIDX, - FeatureSPE_EEF, FeatureSVE2BitPerm, FeatureBRBE, + FeatureSPE_EEF, FeatureSVEBitPerm, FeatureBRBE, FeatureSSBS, FeatureSB, FeaturePredRes, FeaturePAuth, FeatureFlagM, FeatureSVE, FeatureSVE2, FeatureBF16, FeatureComplxNum, FeatureCRC, FeatureDotProd, FeatureFPARMv8, 
FeatureMatMulInt8, FeatureJS, FeatureLSE, @@ -996,7 +996,7 @@ def ProcessorFeatures { list NeoverseV3AE = [HasV9_2aOps, FeatureETE, FeatureFP16FML, FeatureFullFP16, FeatureLS64, FeatureMTE, FeaturePerfMon, FeatureRandGen, FeatureSPE, - FeatureSPE_EEF, FeatureSVE2BitPerm, FeatureBRBE, + FeatureSPE_EEF, FeatureSVEBitPerm, FeatureBRBE, FeatureSSBS, FeatureSB, FeaturePredRes, FeaturePAuth, FeatureFlagM, FeatureCCIDX, FeatureSVE, FeatureSVE2, FeatureBF16, FeatureComplxNum, FeatureCRC, diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 7dd6d49bf2022..22715c61126d1 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -3946,12 +3946,12 @@ let Predicates = [HasSVE2SHA3] in { defm RAX1_ZZZ_D : sve2_crypto_cons_bin_op<0b1, "rax1", ZPR64, int_aarch64_sve_rax1, nxv2i64>; } // End HasSVE2SHA3 -let Predicates = [HasSVE2BitPerm] in { +let Predicates = [HasSVEBitPerm, HasNonStreamingSVE2orSSVE_BitPerm] in { // SVE2 bitwise permute defm BEXT_ZZZ : sve2_misc_bitwise<0b1100, "bext", int_aarch64_sve_bext_x>; defm BDEP_ZZZ : sve2_misc_bitwise<0b1101, "bdep", int_aarch64_sve_bdep_x>; defm BGRP_ZZZ : sve2_misc_bitwise<0b1110, "bgrp", int_aarch64_sve_bgrp_x>; -} // End HasSVE2BitPerm +} let Predicates = [HasSVEAES2, HasNonStreamingSVE2p1orSSVE_AES] in { // SVE_AES2 multi-vector instructions (x2) diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index f44afd804c2bd..c37c57590f906 100644 --- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -3755,7 +3755,10 @@ static const struct Extension { {"sve2-aes", {AArch64::FeatureAliasSVE2AES, AArch64::FeatureSVEAES}}, {"sve2-sm4", {AArch64::FeatureSVE2SM4}}, {"sve2-sha3", {AArch64::FeatureSVE2SHA3}}, - {"sve2-bitperm", {AArch64::FeatureSVE2BitPerm}}, + {"sve-bitperm", {AArch64::FeatureSVEBitPerm}}, + {"sve2-bitperm", + {AArch64::FeatureAliasSVE2BitPerm, AArch64::FeatureSVEBitPerm, + AArch64::FeatureSVE2}}, {"sve2p1", {AArch64::FeatureSVE2p1}}, {"ls64", {AArch64::FeatureLS64}}, {"xs", {AArch64::FeatureXS}}, @@ -3827,6 +3830,7 @@ static const struct Extension { {"lsui", {AArch64::FeatureLSUI}}, {"occmo", {AArch64::FeatureOCCMO}}, {"pcdphint", {AArch64::FeaturePCDPHINT}}, + {"ssve-bitperm", {AArch64::FeatureSSVE_BitPerm}}, }; static void setRequiredFeatureString(FeatureBitset FBS, std::string &Str) { diff --git a/llvm/lib/TargetParser/AArch64TargetParser.cpp b/llvm/lib/TargetParser/AArch64TargetParser.cpp index 7d0b8c333f72f..34ca03a47e0a4 100644 --- a/llvm/lib/TargetParser/AArch64TargetParser.cpp +++ b/llvm/lib/TargetParser/AArch64TargetParser.cpp @@ -276,6 +276,11 @@ void AArch64::ExtensionSet::disable(ArchExtKind E) { if (E == AEK_SVE2AES) disable(AEK_SVEAES); + if (E == AEK_SVE2BITPERM){ + disable(AEK_SVEBITPERM); + disable(AEK_SVE2); + } + if (!Enabled.test(E)) return; diff --git a/llvm/test/MC/AArch64/SVE2/bdep-diagnostics.s b/llvm/test/MC/AArch64/SVE2/bdep-diagnostics.s index 08a589e1f963f..9e40830882c87 100644 --- a/llvm/test/MC/AArch64/SVE2/bdep-diagnostics.s +++ b/llvm/test/MC/AArch64/SVE2/bdep-diagnostics.s @@ -1,4 +1,4 @@ -// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2-bitperm 2>&1 < %s| FileCheck %s +// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2,+sve-bitperm 2>&1 < %s| FileCheck %s // ------------------------------------------------------------------------- // 
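The AArch64TargetParser.cpp change above makes disabling the sve2-bitperm alias strip
the features it implies as well, which is what the updated MC RUN lines below rely on
when they spell the feature as +sve2,+sve-bitperm. A short usage sketch against
llvm::AArch64::ExtensionSet; the enable() call is an assumption for illustration, since
only disable() appears in the hunk:

    #include "llvm/TargetParser/AArch64TargetParser.h"

    // Illustrative only: "+sve2-bitperm ... +nosve2-bitperm" now round-trips
    // cleanly, leaving no stray +sve2 or +sve-bitperm in the feature list.
    void toggleBitPermAlias(llvm::AArch64::ExtensionSet &Exts) {
      Exts.enable(llvm::AArch64::AEK_SVE2BITPERM);  // implies AEK_SVE2, AEK_SVEBITPERM
      Exts.disable(llvm::AArch64::AEK_SVE2BITPERM); // now clears both as well
    }
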
diff --git a/llvm/test/MC/AArch64/SVE2/bdep.s b/llvm/test/MC/AArch64/SVE2/bdep.s index a6ef95d9f2619..44c848d0b3b59 100644 --- a/llvm/test/MC/AArch64/SVE2/bdep.s +++ b/llvm/test/MC/AArch64/SVE2/bdep.s @@ -1,34 +1,36 @@ -// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2-bitperm < %s \ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2,+sve-bitperm < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+ssve-bitperm < %s \ // RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST // RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ // RUN: | FileCheck %s --check-prefix=CHECK-ERROR // RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme < %s 2>&1 \ // RUN: | FileCheck %s --check-prefix=CHECK-ERROR -// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2-bitperm < %s \ -// RUN: | llvm-objdump -d --mattr=+sve2-bitperm - | FileCheck %s --check-prefix=CHECK-INST -// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2-bitperm < %s \ +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2,+sve-bitperm < %s \ +// RUN: | llvm-objdump -d --mattr=+sve2,+sve-bitperm - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2,+sve-bitperm < %s \ // RUN: | llvm-objdump -d --mattr=-sve2 - | FileCheck %s --check-prefix=CHECK-UNKNOWN bdep z0.b, z1.b, z31.b // CHECK-INST: bdep z0.b, z1.b, z31.b // CHECK-ENCODING: [0x20,0xb4,0x1f,0x45] -// CHECK-ERROR: instruction requires: sve2-bitperm +// CHECK-ERROR: instruction requires: sve2 or ssve-bitperm sve-bitperm // CHECK-UNKNOWN: 451fb420 bdep z0.h, z1.h, z31.h // CHECK-INST: bdep z0.h, z1.h, z31.h // CHECK-ENCODING: [0x20,0xb4,0x5f,0x45] -// CHECK-ERROR: instruction requires: sve2-bitperm +// CHECK-ERROR: instruction requires: sve2 or ssve-bitperm sve-bitperm // CHECK-UNKNOWN: 455fb420 bdep z0.s, z1.s, z31.s // CHECK-INST: bdep z0.s, z1.s, z31.s // CHECK-ENCODING: [0x20,0xb4,0x9f,0x45] -// CHECK-ERROR: instruction requires: sve2-bitperm +// CHECK-ERROR: instruction requires: sve2 or ssve-bitperm sve-bitperm // CHECK-UNKNOWN: 459fb420 bdep z0.d, z1.d, z31.d // CHECK-INST: bdep z0.d, z1.d, z31.d // CHECK-ENCODING: [0x20,0xb4,0xdf,0x45] -// CHECK-ERROR: instruction requires: sve2-bitperm +// CHECK-ERROR: instruction requires: sve2 or ssve-bitperm sve-bitperm // CHECK-UNKNOWN: 45dfb420 diff --git a/llvm/test/MC/AArch64/SVE2/bext.s b/llvm/test/MC/AArch64/SVE2/bext.s index 43272205ab897..ea519c22cceb5 100644 --- a/llvm/test/MC/AArch64/SVE2/bext.s +++ b/llvm/test/MC/AArch64/SVE2/bext.s @@ -1,34 +1,36 @@ -// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2-bitperm < %s \ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2,+sve-bitperm < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+ssve-bitperm < %s \ // RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST // RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ // RUN: | FileCheck %s --check-prefix=CHECK-ERROR // RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme < %s 2>&1 \ // RUN: | FileCheck %s --check-prefix=CHECK-ERROR -// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2-bitperm < %s \ -// RUN: | llvm-objdump -d --mattr=+sve2-bitperm - | FileCheck %s --check-prefix=CHECK-INST -// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2-bitperm < %s \ +// RUN: llvm-mc -triple=aarch64 -filetype=obj 
-mattr=+sve2,+sve-bitperm < %s \ +// RUN: | llvm-objdump -d --mattr=+sve2,+sve-bitperm - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2,+sve-bitperm < %s \ // RUN: | llvm-objdump -d --mattr=-sve2 - | FileCheck %s --check-prefix=CHECK-UNKNOWN bext z0.b, z1.b, z31.b // CHECK-INST: bext z0.b, z1.b, z31.b // CHECK-ENCODING: [0x20,0xb0,0x1f,0x45] -// CHECK-ERROR: instruction requires: sve2-bitperm +// CHECK-ERROR: instruction requires: sve2 or ssve-bitperm sve-bitperm // CHECK-UNKNOWN: 451fb020 bext z0.h, z1.h, z31.h // CHECK-INST: bext z0.h, z1.h, z31.h // CHECK-ENCODING: [0x20,0xb0,0x5f,0x45] -// CHECK-ERROR: instruction requires: sve2-bitperm +// CHECK-ERROR: instruction requires: sve2 or ssve-bitperm sve-bitperm // CHECK-UNKNOWN: 455fb020 bext z0.s, z1.s, z31.s // CHECK-INST: bext z0.s, z1.s, z31.s // CHECK-ENCODING: [0x20,0xb0,0x9f,0x45] -// CHECK-ERROR: instruction requires: sve2-bitperm +// CHECK-ERROR: instruction requires: sve2 or ssve-bitperm sve-bitperm // CHECK-UNKNOWN: 459fb020 bext z0.d, z1.d, z31.d // CHECK-INST: bext z0.d, z1.d, z31.d // CHECK-ENCODING: [0x20,0xb0,0xdf,0x45] -// CHECK-ERROR: instruction requires: sve2-bitperm +// CHECK-ERROR: instruction requires: sve2 or ssve-bitperm sve-bitperm // CHECK-UNKNOWN: 45dfb020 diff --git a/llvm/test/MC/AArch64/SVE2/bgrp.s b/llvm/test/MC/AArch64/SVE2/bgrp.s index fb96946dc3c53..eb58d13511583 100644 --- a/llvm/test/MC/AArch64/SVE2/bgrp.s +++ b/llvm/test/MC/AArch64/SVE2/bgrp.s @@ -1,34 +1,36 @@ -// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2-bitperm < %s \ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2,+sve-bitperm < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+ssve-bitperm < %s \ // RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST // RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ // RUN: | FileCheck %s --check-prefix=CHECK-ERROR // RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme < %s 2>&1 \ // RUN: | FileCheck %s --check-prefix=CHECK-ERROR -// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2-bitperm < %s \ -// RUN: | llvm-objdump -d --mattr=+sve2-bitperm - | FileCheck %s --check-prefix=CHECK-INST -// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2-bitperm < %s \ +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2,+sve-bitperm < %s \ +// RUN: | llvm-objdump -d --mattr=+sve2,+sve-bitperm - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2,+sve-bitperm < %s \ // RUN: | llvm-objdump -d --mattr=-sve2 - | FileCheck %s --check-prefix=CHECK-UNKNOWN bgrp z0.b, z1.b, z31.b // CHECK-INST: bgrp z0.b, z1.b, z31.b // CHECK-ENCODING: [0x20,0xb8,0x1f,0x45] -// CHECK-ERROR: instruction requires: sve2-bitperm +// CHECK-ERROR: instruction requires: sve2 or ssve-bitperm sve-bitperm // CHECK-UNKNOWN: 451fb820 bgrp z0.h, z1.h, z31.h // CHECK-INST: bgrp z0.h, z1.h, z31.h // CHECK-ENCODING: [0x20,0xb8,0x5f,0x45] -// CHECK-ERROR: instruction requires: sve2-bitperm +// CHECK-ERROR: instruction requires: sve2 or ssve-bitperm sve-bitperm // CHECK-UNKNOWN: 455fb820 bgrp z0.s, z1.s, z31.s // CHECK-INST: bgrp z0.s, z1.s, z31.s // CHECK-ENCODING: [0x20,0xb8,0x9f,0x45] -// CHECK-ERROR: instruction requires: sve2-bitperm +// CHECK-ERROR: instruction requires: sve2 or ssve-bitperm sve-bitperm // CHECK-UNKNOWN: 459fb820 bgrp z0.d, z1.d, z31.d // CHECK-INST: bgrp z0.d, z1.d, z31.d // CHECK-ENCODING: 
[0x20,0xb8,0xdf,0x45] -// CHECK-ERROR: instruction requires: sve2-bitperm +// CHECK-ERROR: instruction requires: sve2 or ssve-bitperm sve-bitperm // CHECK-UNKNOWN: 45dfb820 diff --git a/llvm/test/MC/AArch64/SVE2/directive-arch-negative.s b/llvm/test/MC/AArch64/SVE2/directive-arch-negative.s index 090d8af85825a..2cfce3b232ffc 100644 --- a/llvm/test/MC/AArch64/SVE2/directive-arch-negative.s +++ b/llvm/test/MC/AArch64/SVE2/directive-arch-negative.s @@ -29,10 +29,16 @@ rax1 z0.d, z0.d, z0.d // CHECK: error: instruction requires: sve2-sha3 // CHECK-NEXT: rax1 z0.d, z0.d, z0.d -.arch armv9-a+sve2-bitperm -.arch armv9-a+nosve2-bitperm +.arch armv9-a+ssve-bitperm +.arch armv9-a+nossve-bitperm bgrp z21.s, z10.s, z21.s -// CHECK: error: instruction requires: sve2-bitperm +// CHECK: error: instruction requires: sve-bitperm +// CHECK-NEXT: bgrp z21.s, z10.s, z21.s + +.arch armv9-a+sve2+sve-bitperm +.arch armv9-a+sve2+nosve-bitperm +bgrp z21.s, z10.s, z21.s +// CHECK: error: instruction requires: sve-bitperm // CHECK-NEXT: bgrp z21.s, z10.s, z21.s .arch armv9-a+f8f16mm diff --git a/llvm/test/MC/AArch64/SVE2/directive-arch.s b/llvm/test/MC/AArch64/SVE2/directive-arch.s index 1319a8a186971..203541a09ad37 100644 --- a/llvm/test/MC/AArch64/SVE2/directive-arch.s +++ b/llvm/test/MC/AArch64/SVE2/directive-arch.s @@ -20,7 +20,11 @@ sm4e z0.s, z0.s, z0.s rax1 z0.d, z0.d, z0.d // CHECK: rax1 z0.d, z0.d, z0.d -.arch armv9-a+sve2-bitperm +.arch armv9-a+sve2+sve-bitperm +bgrp z21.s, z10.s, z21.s +// CHECK: bgrp z21.s, z10.s, z21.s + +.arch armv9-a+ssve-bitperm bgrp z21.s, z10.s, z21.s // CHECK: bgrp z21.s, z10.s, z21.s diff --git a/llvm/test/MC/AArch64/SVE2/directive-arch_extension-negative.s b/llvm/test/MC/AArch64/SVE2/directive-arch_extension-negative.s index 2eb22ebf7428c..2fab61597576f 100644 --- a/llvm/test/MC/AArch64/SVE2/directive-arch_extension-negative.s +++ b/llvm/test/MC/AArch64/SVE2/directive-arch_extension-negative.s @@ -34,7 +34,13 @@ rax1 z0.d, z0.d, z0.d .arch_extension sve2-bitperm .arch_extension nosve2-bitperm bgrp z21.s, z10.s, z21.s -// CHECK: error: instruction requires: sve2-bitperm +// CHECK: error: instruction requires: sve2 or ssve-bitperm sve-bitperm +// CHECK-NEXT: bgrp z21.s, z10.s, z21.s + +.arch_extension sve2-bitperm +.arch_extension nosve2 +bgrp z21.s, z10.s, z21.s +// CHECK: error: instruction requires: sve2 or ssve-bitperm // CHECK-NEXT: bgrp z21.s, z10.s, z21.s .arch_extension f8f16mm diff --git a/llvm/test/MC/AArch64/SVE2/directive-arch_extension.s b/llvm/test/MC/AArch64/SVE2/directive-arch_extension.s index ce56127ca93b1..e45e1f9881422 100644 --- a/llvm/test/MC/AArch64/SVE2/directive-arch_extension.s +++ b/llvm/test/MC/AArch64/SVE2/directive-arch_extension.s @@ -20,7 +20,7 @@ sm4e z0.s, z0.s, z0.s rax1 z0.d, z0.d, z0.d // CHECK: rax1 z0.d, z0.d, z0.d -.arch_extension sve2-bitperm +.arch_extension ssve-bitperm bgrp z21.s, z10.s, z21.s // CHECK: bgrp z21.s, z10.s, z21.s diff --git a/llvm/test/MC/AArch64/SVE2/directive-cpu-negative.s b/llvm/test/MC/AArch64/SVE2/directive-cpu-negative.s index 461b9298df621..a50b990949424 100644 --- a/llvm/test/MC/AArch64/SVE2/directive-cpu-negative.s +++ b/llvm/test/MC/AArch64/SVE2/directive-cpu-negative.s @@ -29,10 +29,16 @@ rax1 z0.d, z0.d, z0.d // CHECK: error: instruction requires: sve2-sha3 // CHECK-NEXT: rax1 z0.d, z0.d, z0.d -.cpu generic+sve2-bitperm -.cpu generic+nosve2-bitperm +.cpu generic+sve2+sve-bitperm +.cpu generic+sve2+nosve-bitperm bgrp z21.s, z10.s, z21.s -// CHECK: error: instruction requires: sve2-bitperm +// CHECK: error: 
instruction requires: sve-bitperm +// CHECK-NEXT: bgrp z21.s, z10.s, z21.s + +.cpu generic+ssve-bitperm +.cpu generic+nossve-bitperm +bgrp z21.s, z10.s, z21.s +// CHECK: error: instruction requires: sve2 or ssve-bitperm sve-bitperm // CHECK-NEXT: bgrp z21.s, z10.s, z21.s .cpu generic+sve2+f8f16mm diff --git a/llvm/test/MC/AArch64/SVE2/directive-cpu.s b/llvm/test/MC/AArch64/SVE2/directive-cpu.s index c54a3a9f272c3..0d873dd9b53f1 100644 --- a/llvm/test/MC/AArch64/SVE2/directive-cpu.s +++ b/llvm/test/MC/AArch64/SVE2/directive-cpu.s @@ -20,7 +20,11 @@ sm4e z0.s, z0.s, z0.s rax1 z0.d, z0.d, z0.d // CHECK: rax1 z0.d, z0.d, z0.d -.cpu generic+sve2-bitperm +.cpu generic+sve2+sve-bitperm +bgrp z21.s, z10.s, z21.s +// CHECK: bgrp z21.s, z10.s, z21.s + +.cpu generic+ssve-bitperm bgrp z21.s, z10.s, z21.s // CHECK: bgrp z21.s, z10.s, z21.s diff --git a/llvm/unittests/TargetParser/TargetParserTest.cpp b/llvm/unittests/TargetParser/TargetParserTest.cpp index c03d3e8575d81..84d9af0ec48f2 100644 --- a/llvm/unittests/TargetParser/TargetParserTest.cpp +++ b/llvm/unittests/TargetParser/TargetParserTest.cpp @@ -1343,7 +1343,9 @@ TEST(TargetParserTest, AArch64ExtensionFeatures) { AArch64::AEK_FPRCVT, AArch64::AEK_CMPBR, AArch64::AEK_LSUI, AArch64::AEK_OCCMO, AArch64::AEK_PCDPHINT, AArch64::AEK_POPS, - AArch64::AEK_SVEAES}; + AArch64::AEK_SVEAES, AArch64::AEK_SVEBITPERM, + AArch64::AEK_SSVE_BITPERM, + }; std::vector Features; @@ -1382,7 +1384,9 @@ TEST(TargetParserTest, AArch64ExtensionFeatures) { EXPECT_TRUE(llvm::is_contained(Features, "+sve2-aes")); EXPECT_TRUE(llvm::is_contained(Features, "+sve2-sm4")); EXPECT_TRUE(llvm::is_contained(Features, "+sve2-sha3")); + EXPECT_TRUE(llvm::is_contained(Features, "+sve-bitperm")); EXPECT_TRUE(llvm::is_contained(Features, "+sve2-bitperm")); + EXPECT_TRUE(llvm::is_contained(Features, "+ssve-bitperm")); EXPECT_TRUE(llvm::is_contained(Features, "+sve-aes2")); EXPECT_TRUE(llvm::is_contained(Features, "+ssve-aes")); EXPECT_TRUE(llvm::is_contained(Features, "+sve2p1")); @@ -1554,6 +1558,8 @@ TEST(TargetParserTest, AArch64ArchExtFeature) { {"sve2-sha3", "nosve2-sha3", "+sve2-sha3", "-sve2-sha3"}, {"sve2p1", "nosve2p1", "+sve2p1", "-sve2p1"}, {"sve2p2", "nosve2p2", "+sve2p2", "-sve2p2"}, + {"sve-bitperm", "nosve-bitperm", "+sve-bitperm", "-sve-bitperm"}, + {"ssve-bitperm", "nossve-bitperm", "+ssve-bitperm", "-ssve-bitperm"}, {"sve2-bitperm", "nosve2-bitperm", "+sve2-bitperm", "-sve2-bitperm"}, {"sve-aes2", "nosve-aes2", "+sve-aes2", "-sve-aes2"}, {"ssve-aes", "nossve-aes", "+ssve-aes", "-ssve-aes"}, @@ -1754,13 +1760,13 @@ AArch64ExtensionDependenciesBaseArchTestParams // Long dependency chains: sve2-bitperm -> sve2 -> sve -> fp16 -> fp {AArch64::ARMV8A, - {"nofp", "sve2-bitperm"}, - {"fp-armv8", "fullfp16", "sve", "sve2", "sve2-bitperm"}, + {"nofp", "sve2", "sve-bitperm"}, + {"fp-armv8", "fullfp16", "sve", "sve2", "sve-bitperm"}, {}}, {AArch64::ARMV8A, - {"sve2-bitperm", "nofp16"}, + {"sve2", "sve-bitperm", "nofp16"}, {"fp-armv8"}, - {"full-fp16", "sve", "sve2", "sve2-bitperm"}}, + {"full-fp16", "sve", "sve2", "sve-bitperm"}}, // Meaning of +crypto varies with base architecture. 
{AArch64::ARMV8A, {"crypto"}, {"aes", "sha2"}, {}}, @@ -1864,12 +1870,20 @@ AArch64ExtensionDependenciesBaseArchTestParams {AArch64::ARMV8A, {"sve2p1", "nosve2"}, {}, {"sve2", "sve2p1"}}, {AArch64::ARMV8A, {"nosve2", "sve2-bitperm"}, - {"sve2", "sve2-bitperm"}, + {"sve2", "sve-bitperm"}, {}}, {AArch64::ARMV8A, {"sve2-bitperm", "nosve2"}, - {}, - {"sve2", "sve2-bitperm"}}, + {"sve"}, + {"sve-bitperm", "sve2", "sve2-bitperm"}}, + {AArch64::ARMV8A, + {"ssve-bitperm", "nosve-bitperm"}, + {"sme"}, + {"ssve-bitperm", "sve-bitperm"}}, + {AArch64::ARMV8A, + {"nosve-bitperm", "ssve-bitperm"}, + {"sve-bitperm", "sve-bitperm"}, + {""}}, {AArch64::ARMV8A, {"nosve2", "sve2-sha3"}, {"sve2", "sve2-sha3"}, {}}, {AArch64::ARMV8A, {"sve2-sha3", "nosve2"}, {}, {"sve2", "sve2-sha3"}}, {AArch64::ARMV8A, {"nosve2", "sve2-sm4"}, {"sve2", "sve2-sm4"}, {}}, @@ -2040,10 +2054,10 @@ AArch64ExtensionDependenciesBaseCPUTestParams {}}, {"cortex-a520", {}, - {"v9.2a", "bf16", "crc", "dotprod", "flagm", "fp-armv8", - "fullfp16", "fp16fml", "i8mm", "lse", "mte", "pauth", - "perfmon", "predres", "ras", "rcpc", "rdm", "sb", - "neon", "ssbs", "sve", "sve2-bitperm", "sve2"}, + {"v9.2a", "bf16", "crc", "dotprod", "flagm", "fp-armv8", + "fullfp16", "fp16fml", "i8mm", "lse", "mte", "pauth", + "perfmon", "predres", "ras", "rcpc", "rdm", "sb", + "neon", "ssbs", "sve", "sve-bitperm", "sve2"}, {}}, // Negative modifiers @@ -2058,4 +2072,4 @@ INSTANTIATE_TEST_SUITE_P( AArch64ExtensionDependenciesBaseCPUTestFixture, ::testing::ValuesIn(AArch64ExtensionDependenciesCPUData)); -} // namespace \ No newline at end of file +} // namespace From 413aa5cebc23569ea864de040c764329b0cfdd2d Mon Sep 17 00:00:00 2001 From: William Moses Date: Mon, 13 Jan 2025 11:00:04 -0600 Subject: [PATCH 082/102] [MLIR][NVVM] Enable inlining of func's calling nvvm intrinsics (#122650) --- .../LLVMIR/Transforms/InlinerInterfaceImpl.h | 7 +++++++ mlir/include/mlir/InitAllDialects.h | 1 + .../LLVMIR/Transforms/InlinerInterfaceImpl.cpp | 7 +++++++ mlir/test/Dialect/LLVMIR/inlining-nvvm.mlir | 16 ++++++++++++++++ 4 files changed, 31 insertions(+) create mode 100644 mlir/test/Dialect/LLVMIR/inlining-nvvm.mlir diff --git a/mlir/include/mlir/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.h b/mlir/include/mlir/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.h index e99b0476a6b10..69cc2e32285b6 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.h +++ b/mlir/include/mlir/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.h @@ -23,6 +23,13 @@ namespace LLVM { void registerInlinerInterface(DialectRegistry ®istry); } // namespace LLVM + +namespace NVVM { +/// Register the `NVVMInlinerInterface` implementation of +/// `DialectInlinerInterface` with the NVVM dialect. 
+void registerInlinerInterface(DialectRegistry ®istry); +} // namespace NVVM + } // namespace mlir #endif // MLIR_DIALECT_LLVMIR_TRANSFORMS_INLINERINTERFACEIMPL_H diff --git a/mlir/include/mlir/InitAllDialects.h b/mlir/include/mlir/InitAllDialects.h index c102f811cce4b..0da82825c8287 100644 --- a/mlir/include/mlir/InitAllDialects.h +++ b/mlir/include/mlir/InitAllDialects.h @@ -167,6 +167,7 @@ inline void registerAllDialects(DialectRegistry ®istry) { gpu::registerBufferDeallocationOpInterfaceExternalModels(registry); gpu::registerValueBoundsOpInterfaceExternalModels(registry); LLVM::registerInlinerInterface(registry); + NVVM::registerInlinerInterface(registry); linalg::registerAllDialectInterfaceImplementations(registry); linalg::registerRuntimeVerifiableOpInterfaceExternalModels(registry); memref::registerAllocationOpInterfaceExternalModels(registry); diff --git a/mlir/lib/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.cpp b/mlir/lib/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.cpp index b3bed5ab5f412..233cadebeec02 100644 --- a/mlir/lib/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.cpp +++ b/mlir/lib/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.cpp @@ -14,6 +14,7 @@ #include "mlir/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.h" #include "mlir/Analysis/SliceWalk.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/LLVMIR/NVVMDialect.h" #include "mlir/IR/Matchers.h" #include "mlir/Interfaces/DataLayoutInterfaces.h" #include "mlir/Interfaces/ViewLikeInterface.h" @@ -815,3 +816,9 @@ void mlir::LLVM::registerInlinerInterface(DialectRegistry ®istry) { dialect->addInterfaces(); }); } + +void mlir::NVVM::registerInlinerInterface(DialectRegistry ®istry) { + registry.addExtension(+[](MLIRContext *ctx, NVVM::NVVMDialect *dialect) { + dialect->addInterfaces(); + }); +} diff --git a/mlir/test/Dialect/LLVMIR/inlining-nvvm.mlir b/mlir/test/Dialect/LLVMIR/inlining-nvvm.mlir new file mode 100644 index 0000000000000..6dc8ebb431508 --- /dev/null +++ b/mlir/test/Dialect/LLVMIR/inlining-nvvm.mlir @@ -0,0 +1,16 @@ +// RUN: mlir-opt %s -inline -split-input-file | FileCheck %s + +// UNSUPPORTED: system-windows + +llvm.func @threadidx() -> i32 { + %tid = nvvm.read.ptx.sreg.tid.x : i32 + llvm.return %tid : i32 +} + +// CHECK-LABEL: func @caller +llvm.func @caller() -> i32 { + // CHECK-NOT: llvm.call @threadidx + // CHECK: nvvm.read.ptx.sreg.tid.x + %z = llvm.call @threadidx() : () -> (i32) + llvm.return %z : i32 +} From db5063c17437d5abe736035b40f2fbde82768a80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20G=C3=B3rny?= Date: Mon, 13 Jan 2025 18:18:12 +0100 Subject: [PATCH 083/102] [flang] Support discovering LLVM/Clang/MLIR without explicit *_DIR (#122639) Support discovering LLVM, Clang and MLIR via the standard CMake logic in addition to explicitly specified `LLVM_DIR`, etc. To prevent breaking anyone's workflow the way #120914 did, this change explicitly introduces two possible code paths based on variables provided: 1. If `LLVM_DIR`, etc. are defined, the current logic is used as-is. 2. 
If they are not defined, `find_package()` is called normally to discover the packages using the standard CMake logic, and the discovered paths are added --------- Co-authored-by: Slava Zakharin --- flang/CMakeLists.txt | 48 ++++++++++++++++++++++++++++---------------- 1 file changed, 31 insertions(+), 17 deletions(-) diff --git a/flang/CMakeLists.txt b/flang/CMakeLists.txt index 68947eaa9c9bd..b619553ef8302 100644 --- a/flang/CMakeLists.txt +++ b/flang/CMakeLists.txt @@ -91,28 +91,37 @@ if (FLANG_STANDALONE_BUILD) # If the user specifies a relative path to LLVM_DIR, the calls to include # LLVM modules fail. Append the absolute path to LLVM_DIR instead. - get_filename_component(LLVM_DIR_ABSOLUTE ${LLVM_DIR} - REALPATH BASE_DIR ${CMAKE_CURRENT_BINARY_DIR}) - list(APPEND CMAKE_MODULE_PATH ${LLVM_DIR_ABSOLUTE}) + if (LLVM_DIR) + get_filename_component(LLVM_DIR_ABSOLUTE ${LLVM_DIR} + REALPATH BASE_DIR ${CMAKE_CURRENT_BINARY_DIR}) + list(APPEND CMAKE_MODULE_PATH ${LLVM_DIR_ABSOLUTE}) + endif() # We need a pre-built/installed version of LLVM. find_package(LLVM REQUIRED HINTS "${LLVM_DIR_ABSOLUTE}") + if (NOT LLVM_DIR_ABSOLUTE) + # If the user did not specify a LLVM_DIR (and therefore LLVM_DIR_ABSOLUTE + # was not set), append the discovered path to CMAKE_MODULE_PATH. + list(APPEND CMAKE_MODULE_PATH ${LLVM_DIR}) + endif() # Users might specify a path to CLANG_DIR that's: # * a full path, or # * a path relative to the path of this script. # Append the absolute path to CLANG_DIR so that find_package works in both # cases. - get_filename_component( - CLANG_DIR_ABSOLUTE - ${CLANG_DIR} - REALPATH - BASE_DIR ${CMAKE_CURRENT_BINARY_DIR}) - list(APPEND CMAKE_MODULE_PATH ${CLANG_DIR_ABSOLUTE}) - - # TODO: Remove when libclangDriver is lifted out of Clang - find_package(Clang REQUIRED PATHS "${CLANG_DIR_ABSOLUTE}" NO_DEFAULT_PATH) - if (NOT Clang_FOUND) - message(FATAL_ERROR "Failed to find Clang") + if (CLANG_DIR) + get_filename_component( + CLANG_DIR_ABSOLUTE + ${CLANG_DIR} + REALPATH + BASE_DIR ${CMAKE_CURRENT_BINARY_DIR}) + list(APPEND CMAKE_MODULE_PATH ${CLANG_DIR_ABSOLUTE}) + + # TODO: Remove when libclangDriver is lifted out of Clang + find_package(Clang REQUIRED PATHS "${CLANG_DIR_ABSOLUTE}" NO_DEFAULT_PATH) + else() + find_package(Clang REQUIRED) + list(APPEND CMAKE_MODULE_PATH ${Clang_DIR}) endif() # If LLVM links to zlib we need the imported targets so we can too. @@ -134,10 +143,15 @@ if (FLANG_STANDALONE_BUILD) include(TableGen) # If the user specifies a relative path to MLIR_DIR, the calls to include # MLIR modules fail. Append the absolute path to MLIR_DIR instead. 
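# Illustrative configure invocations, not part of this hunk; the install
# prefix /opt/llvm below is hypothetical. With explicit package dirs the
# pre-existing code path is taken unchanged:
#   cmake -DLLVM_DIR=/opt/llvm/lib/cmake/llvm \
#         -DCLANG_DIR=/opt/llvm/lib/cmake/clang \
#         -DMLIR_DIR=/opt/llvm/lib/cmake/mlir ../flang
# Without them, standard find_package() discovery is expected to work, e.g.:
#   cmake -DCMAKE_PREFIX_PATH=/opt/llvm ../flang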
- get_filename_component(MLIR_DIR_ABSOLUTE ${MLIR_DIR} - REALPATH BASE_DIR ${CMAKE_CURRENT_BINARY_DIR}) - list(APPEND CMAKE_MODULE_PATH ${MLIR_DIR_ABSOLUTE}) + if (MLIR_DIR) + get_filename_component(MLIR_DIR_ABSOLUTE ${MLIR_DIR} + REALPATH BASE_DIR ${CMAKE_CURRENT_BINARY_DIR}) + list(APPEND CMAKE_MODULE_PATH ${MLIR_DIR_ABSOLUTE}) + endif() find_package(MLIR REQUIRED CONFIG HINTS ${MLIR_DIR_ABSOLUTE}) + if (NOT MLIR_DIR_ABSOLUTE) + list(APPEND CMAKE_MODULE_PATH ${MLIR_DIR}) + endif() # Use SYSTEM for the same reasons as for LLVM includes include_directories(SYSTEM ${MLIR_INCLUDE_DIRS}) include(AddMLIR) From fa28ba35b1a01506a0dcda3f9ab0258b302b7cda Mon Sep 17 00:00:00 2001 From: Brox Chen Date: Mon, 13 Jan 2025 12:26:36 -0500 Subject: [PATCH 084/102] [AMDGPU][True16][CodeGen] true16 codegen pattern for v_pack_b32_f16 (#121988) true16 codegen pattern for v_pack_b32_f16 --- llvm/lib/Target/AMDGPU/SIInstructions.td | 3 + llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll | 9 +- llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll | 9 +- llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll | 59 ++- llvm/test/CodeGen/AMDGPU/sitofp.f16.ll | 23 +- llvm/test/CodeGen/AMDGPU/uitofp.f16.ll | 23 +- llvm/test/CodeGen/AMDGPU/v_pack.ll | 408 +++++++++++++++++++++ 7 files changed, 450 insertions(+), 84 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 4325ab448e581..cdc1132579d8d 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -3390,6 +3390,9 @@ let SubtargetPredicate = isGFX9Plus in { let True16Predicate = NotHasTrue16BitInsts in def : PackB32Pat; +let True16Predicate = UseRealTrue16Insts in + def : PackB32Pat; + let True16Predicate = UseFakeTrue16Insts in def : PackB32Pat; } // End SubtargetPredicate = isGFX9Plus diff --git a/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll index 84a3a3e88d238..32d8aa18d9713 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll @@ -160,14 +160,9 @@ define amdgpu_kernel void @ceil_v2f16( ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: v_ceil_f16_e32 v0.l, v0.l -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_ceil_f16_e32 v0.h, v1.l -; GFX11-NEXT: v_mov_b16_e32 v1.l, v0.l -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-NEXT: v_pack_b32_f16 v0, v0.l, v0.h ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; GFX11-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll index 9909cfd32b11f..f6a9fadb33865 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll @@ -161,14 +161,9 @@ define amdgpu_kernel void @floor_v2f16( ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: v_floor_f16_e32 v0.l, v0.l -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_floor_f16_e32 v0.h, v1.l -; GFX11-NEXT: v_mov_b16_e32 v1.l, v0.l -; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-NEXT: v_pack_b32_f16 v0, v0.l, v0.h ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; GFX11-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll index 53c26cadbf75a..ff1c3da1d5fe5 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll @@ -480,9 +480,8 @@ define <2 x half> @test_ldexp_v2f16_v2i32(<2 x half> %a, <2 x i32> %b) { ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v3.l, v2.l ; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v1.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h -; GFX11-SDAG-TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v2f16_v2i32: @@ -610,9 +609,7 @@ define <2 x half> @test_ldexp_v2f16_v2i16(<2 x half> %a, <2 x i16> %b) { ; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v1.l ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v3.l, v2.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v2f16_v2i16: @@ -737,15 +734,13 @@ define <3 x half> @test_ldexp_v3f16_v3i32(<3 x half> %a, <3 x i32> %b) { ; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v3, v3, s0, 0x7fff ; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v2, v2, s0, 0x7fff -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v4, v4, s0, 0x7fff +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v5.l, v3.l ; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v2.l -; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v2, v4, s0, 0x7fff ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.h -; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v2.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_perm_b32 v0, v3, v0, 0x5040100 +; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v4.l +; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v3f16_v3i32: @@ -891,12 +886,9 @@ define <3 x half> @test_ldexp_v3f16_v3i16(<3 x half> %a, <3 x i16> %b) { ; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v2.l ; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v3.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | 
instid1(VALU_DEP_3) +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v5.l, v4.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-SDAG-TRUE16-NEXT: v_perm_b32 v0, v0, v2, 0x5040100 +; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v3f16_v3i16: @@ -1036,24 +1028,21 @@ define <4 x half> @test_ldexp_v4f16_v4i32(<4 x half> %a, <4 x i32> %b) { ; GFX11-SDAG-TRUE16: ; %bb.0: ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-TRUE16-NEXT: s_movk_i32 s0, 0x8000 -; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v5, v5, s0, 0x7fff ; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v3, v3, s0, 0x7fff -; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v0 ; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v2, v2, s0, 0x7fff ; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v4, v4, s0, 0x7fff +; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.h, v6.l, v5.l ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v6.l, v3.l -; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.h, v7.l, v5.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v7.l, v3.l ; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v2.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v4.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 -; GFX11-SDAG-TRUE16-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 +; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v1, v1.l, v1.h ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v4f16_v4i32: @@ -1238,20 +1227,14 @@ define <4 x half> @test_ldexp_v4f16_v4i16(<4 x half> %a, <4 x i16> %b) { ; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v3.l ; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v2.l -; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v1.l, v3.l ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v6.l, v5.l +; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v6.l, v5.l ; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.h, v7.l, v4.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.h -; 
GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_perm_b32 v0, v0, v2, 0x5040100 -; GFX11-SDAG-TRUE16-NEXT: v_perm_b32 v1, v1, v3, 0x5040100 +; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h +; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v1, v1.l, v1.h ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v4f16_v4i16: diff --git a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll index 245df6684384c..94b22b79f6632 100644 --- a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll @@ -237,14 +237,9 @@ define amdgpu_kernel void @sitofp_v2i16_to_v2f16( ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-TRUE16-NEXT: v_cvt_f16_i16_e32 v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cvt_f16_i16_e32 v0.h, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; GFX11-TRUE16-NEXT: s_endpgm ; @@ -338,17 +333,13 @@ define amdgpu_kernel void @sitofp_v2i32_to_v2f16( ; GFX11-TRUE16-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX11-TRUE16-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX11-TRUE16-NEXT: v_cvt_f32_i32_e32 v2, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 -; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v1 +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v0.h, v0.l ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; GFX11-TRUE16-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll index bc1b102d33de1..2a2fd93bc2d0b 100644 --- a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll @@ -237,14 +237,9 @@ define amdgpu_kernel void @uitofp_v2i16_to_v2f16( ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-TRUE16-NEXT: v_cvt_f16_u16_e32 v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cvt_f16_u16_e32 v0.h, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; GFX11-TRUE16-NEXT: s_endpgm ; @@ -338,17 +333,13 @@ define amdgpu_kernel void @uitofp_v2i32_to_v2f16( ; GFX11-TRUE16-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX11-TRUE16-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX11-TRUE16-NEXT: v_cvt_f32_u32_e32 v2, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 -; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v1 +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v0.h, v0.l ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; GFX11-TRUE16-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/v_pack.ll b/llvm/test/CodeGen/AMDGPU/v_pack.ll index 2eba67b06bae1..072151dd6f5a0 100644 --- a/llvm/test/CodeGen/AMDGPU/v_pack.ll +++ b/llvm/test/CodeGen/AMDGPU/v_pack.ll @@ -1,6 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s ; RUN: llc -global-isel -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GISEL %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-GCN-FAKE16 %s +; RUN: llc -global-isel -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-GISEL-FAKE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-GCN-REAL16 %s +; RUN: llc -global-isel -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-GISEL-REAL16 %s declare i32 @llvm.amdgcn.workitem.id.x() #1 @@ -38,6 +42,89 @@ define amdgpu_kernel void @v_pack_b32_v2f16(ptr addrspace(1) %in0, ptr addrspace ; GISEL-NEXT: ; use v0 ; GISEL-NEXT: 
;;#ASMEND ; GISEL-NEXT: s_endpgm +; +; GFX11-GCN-FAKE16-LABEL: v_pack_b32_v2f16: +; GFX11-GCN-FAKE16: ; %bb.0: +; GFX11-GCN-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GCN-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GCN-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GCN-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GCN-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-FAKE16-NEXT: global_load_u16 v0, v0, s[2:3] glc dlc +; GFX11-GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-FAKE16-NEXT: v_add_f16_e32 v1, 2.0, v1 +; GFX11-GCN-FAKE16-NEXT: v_add_f16_e32 v0, 2.0, v0 +; GFX11-GCN-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-FAKE16-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX11-GCN-FAKE16-NEXT: ;;#ASMSTART +; GFX11-GCN-FAKE16-NEXT: ; use v0 +; GFX11-GCN-FAKE16-NEXT: ;;#ASMEND +; GFX11-GCN-FAKE16-NEXT: s_endpgm +; +; GFX11-GISEL-FAKE16-LABEL: v_pack_b32_v2f16: +; GFX11-GISEL-FAKE16: ; %bb.0: +; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: global_load_u16 v0, v0, s[2:3] glc dlc +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_add_f16_e32 v1, 2.0, v1 +; GFX11-GISEL-FAKE16-NEXT: v_add_f16_e32 v0, 2.0, v0 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX11-GISEL-FAKE16-NEXT: ;;#ASMSTART +; GFX11-GISEL-FAKE16-NEXT: ; use v0 +; GFX11-GISEL-FAKE16-NEXT: ;;#ASMEND +; GFX11-GISEL-FAKE16-NEXT: s_endpgm +; +; GFX11-GCN-REAL16-LABEL: v_pack_b32_v2f16: +; GFX11-GCN-REAL16: ; %bb.0: +; GFX11-GCN-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GCN-REAL16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-REAL16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GCN-REAL16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GCN-REAL16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-GCN-REAL16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-REAL16-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc +; GFX11-GCN-REAL16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-REAL16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX11-GCN-REAL16-NEXT: v_mov_b16_e32 v0.h, v2.l +; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GCN-REAL16-NEXT: v_add_f16_e32 v0.l, 2.0, v0.l +; GFX11-GCN-REAL16-NEXT: v_add_f16_e32 v0.h, 2.0, v0.h +; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-REAL16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h +; GFX11-GCN-REAL16-NEXT: ;;#ASMSTART +; GFX11-GCN-REAL16-NEXT: ; use v0 +; GFX11-GCN-REAL16-NEXT: ;;#ASMEND +; GFX11-GCN-REAL16-NEXT: s_endpgm +; +; GFX11-GISEL-REAL16-LABEL: v_pack_b32_v2f16: +; GFX11-GISEL-REAL16: ; %bb.0: +; GFX11-GISEL-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-REAL16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-REAL16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GISEL-REAL16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-REAL16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-GISEL-REAL16-NEXT: s_waitcnt vmcnt(0) +; 
GFX11-GISEL-REAL16-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc +; GFX11-GISEL-REAL16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-REAL16-NEXT: v_add_f16_e32 v0.l, 2.0, v1.l +; GFX11-GISEL-REAL16-NEXT: v_add_f16_e32 v0.h, 2.0, v2.l +; GFX11-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-REAL16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h +; GFX11-GISEL-REAL16-NEXT: ;;#ASMSTART +; GFX11-GISEL-REAL16-NEXT: ; use v0 +; GFX11-GISEL-REAL16-NEXT: ;;#ASMEND +; GFX11-GISEL-REAL16-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in0.gep = getelementptr inbounds half, ptr addrspace(1) %in0, i64 %tid.ext @@ -87,6 +174,89 @@ define amdgpu_kernel void @v_pack_b32_v2f16_sub(ptr addrspace(1) %in0, ptr addrs ; GISEL-NEXT: ; use v0 ; GISEL-NEXT: ;;#ASMEND ; GISEL-NEXT: s_endpgm +; +; GFX11-GCN-FAKE16-LABEL: v_pack_b32_v2f16_sub: +; GFX11-GCN-FAKE16: ; %bb.0: +; GFX11-GCN-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GCN-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GCN-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GCN-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GCN-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-FAKE16-NEXT: global_load_u16 v0, v0, s[2:3] glc dlc +; GFX11-GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-FAKE16-NEXT: v_subrev_f16_e32 v1, 2.0, v1 +; GFX11-GCN-FAKE16-NEXT: v_add_f16_e32 v0, 2.0, v0 +; GFX11-GCN-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-FAKE16-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX11-GCN-FAKE16-NEXT: ;;#ASMSTART +; GFX11-GCN-FAKE16-NEXT: ; use v0 +; GFX11-GCN-FAKE16-NEXT: ;;#ASMEND +; GFX11-GCN-FAKE16-NEXT: s_endpgm +; +; GFX11-GISEL-FAKE16-LABEL: v_pack_b32_v2f16_sub: +; GFX11-GISEL-FAKE16: ; %bb.0: +; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: global_load_u16 v0, v0, s[2:3] glc dlc +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_subrev_f16_e32 v1, 2.0, v1 +; GFX11-GISEL-FAKE16-NEXT: v_add_f16_e32 v0, 2.0, v0 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX11-GISEL-FAKE16-NEXT: ;;#ASMSTART +; GFX11-GISEL-FAKE16-NEXT: ; use v0 +; GFX11-GISEL-FAKE16-NEXT: ;;#ASMEND +; GFX11-GISEL-FAKE16-NEXT: s_endpgm +; +; GFX11-GCN-REAL16-LABEL: v_pack_b32_v2f16_sub: +; GFX11-GCN-REAL16: ; %bb.0: +; GFX11-GCN-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GCN-REAL16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-REAL16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GCN-REAL16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GCN-REAL16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-GCN-REAL16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-REAL16-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc +; GFX11-GCN-REAL16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-REAL16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX11-GCN-REAL16-NEXT: v_mov_b16_e32 v0.h, v2.l +; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GCN-REAL16-NEXT: v_subrev_f16_e32 v0.l, 
2.0, v0.l +; GFX11-GCN-REAL16-NEXT: v_add_f16_e32 v0.h, 2.0, v0.h +; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-REAL16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h +; GFX11-GCN-REAL16-NEXT: ;;#ASMSTART +; GFX11-GCN-REAL16-NEXT: ; use v0 +; GFX11-GCN-REAL16-NEXT: ;;#ASMEND +; GFX11-GCN-REAL16-NEXT: s_endpgm +; +; GFX11-GISEL-REAL16-LABEL: v_pack_b32_v2f16_sub: +; GFX11-GISEL-REAL16: ; %bb.0: +; GFX11-GISEL-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-REAL16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-REAL16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GISEL-REAL16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-REAL16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-GISEL-REAL16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-REAL16-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc +; GFX11-GISEL-REAL16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-REAL16-NEXT: v_subrev_f16_e32 v0.l, 2.0, v1.l +; GFX11-GISEL-REAL16-NEXT: v_add_f16_e32 v0.h, 2.0, v2.l +; GFX11-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-REAL16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h +; GFX11-GISEL-REAL16-NEXT: ;;#ASMSTART +; GFX11-GISEL-REAL16-NEXT: ; use v0 +; GFX11-GISEL-REAL16-NEXT: ;;#ASMEND +; GFX11-GISEL-REAL16-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in0.gep = getelementptr inbounds half, ptr addrspace(1) %in0, i64 %tid.ext @@ -136,6 +306,78 @@ define amdgpu_kernel void @fptrunc( ; GISEL-NEXT: v_pack_b32_f16 v0, v0, v1 ; GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GISEL-NEXT: s_endpgm +; +; GFX11-GCN-FAKE16-LABEL: fptrunc: +; GFX11-GCN-FAKE16: ; %bb.0: +; GFX11-GCN-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GCN-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-GCN-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-GCN-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-GCN-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-GCN-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GCN-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-GCN-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-GCN-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-GCN-FAKE16-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0 +; GFX11-GCN-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX11-GCN-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-GCN-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-GCN-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-GCN-FAKE16-NEXT: s_endpgm +; +; GFX11-GISEL-FAKE16-LABEL: fptrunc: +; GFX11-GISEL-FAKE16: ; %bb.0: +; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, s2 +; GFX11-GISEL-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, s3 +; GFX11-GISEL-FAKE16-NEXT: s_mov_b32 s2, -1 +; GFX11-GISEL-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-GISEL-FAKE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-GISEL-FAKE16-NEXT: s_endpgm +; +; GFX11-GCN-REAL16-LABEL: fptrunc: +; GFX11-GCN-REAL16: ; %bb.0: +; GFX11-GCN-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GCN-REAL16-NEXT: s_mov_b32 s6, -1 +; GFX11-GCN-REAL16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-GCN-REAL16-NEXT: s_mov_b32 
s10, s6 +; GFX11-GCN-REAL16-NEXT: s_mov_b32 s11, s7 +; GFX11-GCN-REAL16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GCN-REAL16-NEXT: s_mov_b32 s8, s2 +; GFX11-GCN-REAL16-NEXT: s_mov_b32 s9, s3 +; GFX11-GCN-REAL16-NEXT: s_mov_b32 s4, s0 +; GFX11-GCN-REAL16-NEXT: buffer_load_b64 v[1:2], off, s[8:11], 0 +; GFX11-GCN-REAL16-NEXT: s_mov_b32 s5, s1 +; GFX11-GCN-REAL16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-REAL16-NEXT: v_cvt_f16_f32_e32 v0.l, v2 +; GFX11-GCN-REAL16-NEXT: v_cvt_f16_f32_e32 v0.h, v1 +; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-REAL16-NEXT: v_pack_b32_f16 v0, v0.h, v0.l +; GFX11-GCN-REAL16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-GCN-REAL16-NEXT: s_endpgm +; +; GFX11-GISEL-REAL16-LABEL: fptrunc: +; GFX11-GISEL-REAL16: ; %bb.0: +; GFX11-GISEL-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-REAL16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-REAL16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-GISEL-REAL16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-REAL16-NEXT: v_cvt_f16_f32_e32 v0.l, s2 +; GFX11-GISEL-REAL16-NEXT: v_cvt_f16_f32_e32 v0.h, s3 +; GFX11-GISEL-REAL16-NEXT: s_mov_b32 s2, -1 +; GFX11-GISEL-REAL16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-REAL16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h +; GFX11-GISEL-REAL16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-GISEL-REAL16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { %a.val = load <2 x float>, ptr addrspace(1) %a @@ -178,6 +420,89 @@ define amdgpu_kernel void @v_pack_b32.fabs(ptr addrspace(1) %in0, ptr addrspace( ; GISEL-NEXT: ; use v0 ; GISEL-NEXT: ;;#ASMEND ; GISEL-NEXT: s_endpgm +; +; GFX11-GCN-FAKE16-LABEL: v_pack_b32.fabs: +; GFX11-GCN-FAKE16: ; %bb.0: +; GFX11-GCN-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GCN-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GCN-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GCN-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GCN-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-FAKE16-NEXT: global_load_u16 v0, v0, s[2:3] glc dlc +; GFX11-GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-FAKE16-NEXT: v_add_f16_e32 v1, 2.0, v1 +; GFX11-GCN-FAKE16-NEXT: v_add_f16_e32 v0, 2.0, v0 +; GFX11-GCN-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-FAKE16-NEXT: v_pack_b32_f16 v0, |v1|, |v0| +; GFX11-GCN-FAKE16-NEXT: ;;#ASMSTART +; GFX11-GCN-FAKE16-NEXT: ; use v0 +; GFX11-GCN-FAKE16-NEXT: ;;#ASMEND +; GFX11-GCN-FAKE16-NEXT: s_endpgm +; +; GFX11-GISEL-FAKE16-LABEL: v_pack_b32.fabs: +; GFX11-GISEL-FAKE16: ; %bb.0: +; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: global_load_u16 v0, v0, s[2:3] glc dlc +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_add_f16_e32 v1, 2.0, v1 +; GFX11-GISEL-FAKE16-NEXT: v_add_f16_e32 v0, 2.0, v0 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_pack_b32_f16 v0, |v1|, |v0| +; GFX11-GISEL-FAKE16-NEXT: ;;#ASMSTART +; GFX11-GISEL-FAKE16-NEXT: ; use v0 +; GFX11-GISEL-FAKE16-NEXT: ;;#ASMEND +; 
GFX11-GISEL-FAKE16-NEXT: s_endpgm +; +; GFX11-GCN-REAL16-LABEL: v_pack_b32.fabs: +; GFX11-GCN-REAL16: ; %bb.0: +; GFX11-GCN-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GCN-REAL16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-REAL16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GCN-REAL16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GCN-REAL16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-GCN-REAL16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-REAL16-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc +; GFX11-GCN-REAL16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-REAL16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX11-GCN-REAL16-NEXT: v_mov_b16_e32 v0.h, v2.l +; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GCN-REAL16-NEXT: v_add_f16_e32 v0.l, 2.0, v0.l +; GFX11-GCN-REAL16-NEXT: v_add_f16_e32 v0.h, 2.0, v0.h +; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-REAL16-NEXT: v_pack_b32_f16 v0, |v0.l|, |v0.h| +; GFX11-GCN-REAL16-NEXT: ;;#ASMSTART +; GFX11-GCN-REAL16-NEXT: ; use v0 +; GFX11-GCN-REAL16-NEXT: ;;#ASMEND +; GFX11-GCN-REAL16-NEXT: s_endpgm +; +; GFX11-GISEL-REAL16-LABEL: v_pack_b32.fabs: +; GFX11-GISEL-REAL16: ; %bb.0: +; GFX11-GISEL-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-REAL16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-REAL16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GISEL-REAL16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-REAL16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-GISEL-REAL16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-REAL16-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc +; GFX11-GISEL-REAL16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-REAL16-NEXT: v_add_f16_e32 v0.l, 2.0, v1.l +; GFX11-GISEL-REAL16-NEXT: v_add_f16_e32 v0.h, 2.0, v2.l +; GFX11-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-REAL16-NEXT: v_pack_b32_f16 v0, |v0.l|, |v0.h| +; GFX11-GISEL-REAL16-NEXT: ;;#ASMSTART +; GFX11-GISEL-REAL16-NEXT: ; use v0 +; GFX11-GISEL-REAL16-NEXT: ;;#ASMEND +; GFX11-GISEL-REAL16-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in0.gep = getelementptr inbounds half, ptr addrspace(1) %in0, i64 %tid.ext @@ -229,6 +554,89 @@ define amdgpu_kernel void @v_pack_b32.fneg(ptr addrspace(1) %in0, ptr addrspace( ; GISEL-NEXT: ; use v0 ; GISEL-NEXT: ;;#ASMEND ; GISEL-NEXT: s_endpgm +; +; GFX11-GCN-FAKE16-LABEL: v_pack_b32.fneg: +; GFX11-GCN-FAKE16: ; %bb.0: +; GFX11-GCN-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GCN-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GCN-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GCN-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GCN-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-FAKE16-NEXT: global_load_u16 v0, v0, s[2:3] glc dlc +; GFX11-GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-FAKE16-NEXT: v_add_f16_e32 v1, 2.0, v1 +; GFX11-GCN-FAKE16-NEXT: v_add_f16_e32 v0, 2.0, v0 +; GFX11-GCN-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-FAKE16-NEXT: v_pack_b32_f16 v0, -v1, -v0 +; GFX11-GCN-FAKE16-NEXT: ;;#ASMSTART +; GFX11-GCN-FAKE16-NEXT: ; use v0 +; GFX11-GCN-FAKE16-NEXT: ;;#ASMEND +; GFX11-GCN-FAKE16-NEXT: s_endpgm +; +; GFX11-GISEL-FAKE16-LABEL: v_pack_b32.fneg: +; GFX11-GISEL-FAKE16: ; %bb.0: +; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 
+; GFX11-GISEL-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: global_load_u16 v0, v0, s[2:3] glc dlc +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_add_f16_e32 v1, 2.0, v1 +; GFX11-GISEL-FAKE16-NEXT: v_add_f16_e32 v0, 2.0, v0 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_pack_b32_f16 v0, -v1, -v0 +; GFX11-GISEL-FAKE16-NEXT: ;;#ASMSTART +; GFX11-GISEL-FAKE16-NEXT: ; use v0 +; GFX11-GISEL-FAKE16-NEXT: ;;#ASMEND +; GFX11-GISEL-FAKE16-NEXT: s_endpgm +; +; GFX11-GCN-REAL16-LABEL: v_pack_b32.fneg: +; GFX11-GCN-REAL16: ; %bb.0: +; GFX11-GCN-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GCN-REAL16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-REAL16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GCN-REAL16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GCN-REAL16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-GCN-REAL16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-REAL16-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc +; GFX11-GCN-REAL16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-REAL16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX11-GCN-REAL16-NEXT: v_mov_b16_e32 v0.h, v2.l +; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GCN-REAL16-NEXT: v_add_f16_e32 v0.l, 2.0, v0.l +; GFX11-GCN-REAL16-NEXT: v_add_f16_e32 v0.h, 2.0, v0.h +; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-REAL16-NEXT: v_pack_b32_f16 v0, -v0.l, -v0.h +; GFX11-GCN-REAL16-NEXT: ;;#ASMSTART +; GFX11-GCN-REAL16-NEXT: ; use v0 +; GFX11-GCN-REAL16-NEXT: ;;#ASMEND +; GFX11-GCN-REAL16-NEXT: s_endpgm +; +; GFX11-GISEL-REAL16-LABEL: v_pack_b32.fneg: +; GFX11-GISEL-REAL16: ; %bb.0: +; GFX11-GISEL-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-REAL16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-REAL16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GISEL-REAL16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-REAL16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-GISEL-REAL16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-REAL16-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc +; GFX11-GISEL-REAL16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-REAL16-NEXT: v_add_f16_e32 v0.l, 2.0, v1.l +; GFX11-GISEL-REAL16-NEXT: v_add_f16_e32 v0.h, 2.0, v2.l +; GFX11-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-REAL16-NEXT: v_pack_b32_f16 v0, -v0.l, -v0.h +; GFX11-GISEL-REAL16-NEXT: ;;#ASMSTART +; GFX11-GISEL-REAL16-NEXT: ; use v0 +; GFX11-GISEL-REAL16-NEXT: ;;#ASMEND +; GFX11-GISEL-REAL16-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in0.gep = getelementptr inbounds half, ptr addrspace(1) %in0, i64 %tid.ext From 13a4fbf6248735aa4ff0760329dc30bdcd04512d Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Mon, 13 Jan 2025 09:02:56 -0800 Subject: [PATCH 085/102] [SLP]Check for div/rem instructions before extending with poisons Need to check if the instructions can be safely extended with poison before actually doing this to avoid incorrect transformations. 
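As a minimal IR sketch of the problem (illustrative only, not taken
from the patch; the function and constants are made up), padding a
3-wide udiv bundle with poison to reach the vectorization factor would
materialize:

  define <4 x i64> @padded_udiv_bundle(<4 x i64> %x) {
    ; The poison lane in the divisor may be zero, so this udiv is
    ; immediate UB. Bundles of adds or shuffles can be padded this
    ; way, but div/rem cannot, which is what the getSameOpcode()
    ; check below now rejects.
    %r = udiv <4 x i64> %x, <i64 2, i64 3, i64 5, i64 poison>
    ret <4 x i64> %r
  }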
Fixes #122691 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 9 ++- .../X86/div-possibly-extended-with-poisons.ll | 71 +++++++++++++++++++ 2 files changed, 79 insertions(+), 1 deletion(-) create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/div-possibly-extended-with-poisons.ll diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index df46c69ff3ab4..4b0ed5b30179b 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -8091,6 +8091,13 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, NonUniqueValueVL.append( PWSz - UniqueValues.size(), PoisonValue::get(UniqueValues.front()->getType())); + // Check that extended with poisons operations are still valid for + // vectorization (div/rem are not allowed). + if (!getSameOpcode(NonUniqueValueVL, *TLI).valid()) { + LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); + newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx); + return false; + } VL = NonUniqueValueVL; } return true; @@ -17818,7 +17825,7 @@ bool BoUpSLP::collectValuesToDemote( }; if (E.isGather() || !Visited.insert(&E).second || any_of(E.Scalars, [&](Value *V) { - return all_of(V->users(), [&](User *U) { + return !isa(V) && all_of(V->users(), [&](User *U) { return isa(U) && !getTreeEntry(U); }); })) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/div-possibly-extended-with-poisons.ll b/llvm/test/Transforms/SLPVectorizer/X86/div-possibly-extended-with-poisons.ll new file mode 100644 index 0000000000000..07ee8f840721f --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/div-possibly-extended-with-poisons.ll @@ -0,0 +1,71 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-100 < %s | FileCheck %s + +define i8 @test(ptr %g_127, i32 %0, i16 %1) { +; CHECK-LABEL: define i8 @test( +; CHECK-SAME: ptr [[G_127:%.*]], i32 [[TMP0:%.*]], i16 [[TMP1:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[FOR_INC434_I:.*]] +; CHECK: [[FOR_COND166_PREHEADER_I:.*]]: +; CHECK-NEXT: br label %[[FOR_INC434_I]] +; CHECK: [[FOR_INC434_I]]: +; CHECK-NEXT: [[TMP2:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 60, %[[FOR_COND166_PREHEADER_I]] ] +; CHECK-NEXT: [[CONV8_I_I:%.*]] = zext nneg i32 [[TMP0]] to i64 +; CHECK-NEXT: [[DIV_I_I_1:%.*]] = udiv i64 [[CONV8_I_I]], [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = trunc i64 [[DIV_I_I_1]] to i16 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i16> poison, i16 [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i64> poison, i64 [[CONV8_I_I]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i64> poison, i64 [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = udiv <4 x i64> [[TMP6]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = trunc <4 x i64> [[TMP9]] to <4 x i16> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i16> [[TMP10]], <4 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP11]], <8 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <8 x i16> [[TMP12]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> [[TMP13]]) +; CHECK-NEXT: [[TMP15:%.*]] 
= call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> [[TMP14]])
+; CHECK-NEXT: [[OP_RDX:%.*]] = and i16 [[TMP15]], [[TMP1]]
+; CHECK-NEXT: [[AND14_I_2_I_5:%.*]] = zext i16 [[OP_RDX]] to i32
+; CHECK-NEXT: store i32 [[AND14_I_2_I_5]], ptr [[G_127]], align 4
+; CHECK-NEXT: ret i8 0
+;
+entry:
+  br label %for.inc434.i
+
+for.cond166.preheader.i:
+  br label %for.inc434.i
+
+for.inc434.i:
+  %2 = phi i64 [ 0, %entry ], [ 60, %for.cond166.preheader.i ]
+  %conv8.i.i = zext nneg i32 %0 to i64
+  %div.i.i.1 = udiv i64 %conv8.i.i, %2
+  %3 = trunc i64 %div.i.i.1 to i16
+  %call12.i.2.i.1 = tail call i16 @llvm.bswap.i16(i16 %3)
+  %and14.i.2.i.118 = and i16 %1, %call12.i.2.i.1
+  %div.i.i.2 = udiv i64 %conv8.i.i, %2
+  %4 = trunc i64 %div.i.i.2 to i16
+  %call12.i.i.2 = tail call i16 @llvm.bswap.i16(i16 %4)
+  %and14.i.i.219 = and i16 %and14.i.2.i.118, %call12.i.i.2
+  %call12.i.2.i.2 = tail call i16 @llvm.bswap.i16(i16 %4)
+  %and14.i.2.i.220 = and i16 %and14.i.i.219, %call12.i.2.i.2
+  %div.i.i.3 = udiv i64 %conv8.i.i, %2
+  %5 = trunc i64 %div.i.i.3 to i16
+  %call12.i.2.i.3 = tail call i16 @llvm.bswap.i16(i16 %5)
+  %and14.i.2.i.322 = and i16 %and14.i.2.i.220, %call12.i.2.i.3
+  %div.i.i.4 = udiv i64 %conv8.i.i, %2
+  %6 = trunc i64 %div.i.i.4 to i16
+  %call12.i.i.4 = tail call i16 @llvm.bswap.i16(i16 %6)
+  %and14.i.i.423 = and i16 %and14.i.2.i.322, %call12.i.i.4
+  %call12.i.2.i.4 = tail call i16 @llvm.bswap.i16(i16 %6)
+  %and14.i.2.i.424 = and i16 %and14.i.i.423, %call12.i.2.i.4
+  %div.i.i.5 = udiv i64 %conv8.i.i, %2
+  %7 = trunc i64 %div.i.i.5 to i16
+  %call12.i.i.5 = tail call i16 @llvm.bswap.i16(i16 %7)
+  %and14.i.i.525 = and i16 %and14.i.2.i.424, %call12.i.i.5
+  %call12.i.2.i.5 = tail call i16 @llvm.bswap.i16(i16 %7)
+  %and14.i.2.i.51 = and i16 %and14.i.i.525, %call12.i.2.i.5
+  %and14.i.2.i.5 = zext i16 %and14.i.2.i.51 to i32
+  store i32 %and14.i.2.i.5, ptr %g_127, align 4
+  ret i8 0
+}

From 54c1de20c1943a3afb55bd33d7b0334bfb4bcc22 Mon Sep 17 00:00:00 2001
From: Vitaly Buka
Date: Mon, 13 Jan 2025 09:36:41 -0800
Subject: [PATCH 086/102] [BoundsChecking] Add guard= pass parameter (#122575)

And use that as an argument for allow_ubsan_check when needed.

Other UBSan checks use SanitizerKind, but those are known only to
Clang, so make it a parameter in LLVM.
---
 .../Instrumentation/BoundsChecking.h          |  1 +
 llvm/lib/Passes/PassBuilder.cpp               | 16 ++++--
 .../Instrumentation/BoundsChecking.cpp        | 11 +++-
 .../BoundsChecking/runtimes.ll                | 51 +++++++++++++++++++
 4 files changed, 74 insertions(+), 5 deletions(-)

diff --git a/llvm/include/llvm/Transforms/Instrumentation/BoundsChecking.h b/llvm/include/llvm/Transforms/Instrumentation/BoundsChecking.h
index 836fc907375d3..ab2dcee06551e 100644
--- a/llvm/include/llvm/Transforms/Instrumentation/BoundsChecking.h
+++ b/llvm/include/llvm/Transforms/Instrumentation/BoundsChecking.h
@@ -29,6 +29,7 @@ class BoundsCheckingPass : public PassInfoMixin<BoundsCheckingPass> {
   };
   std::optional<Runtime> Rt; // Trap if empty.
   bool Merge = false;
+  std::optional<int8_t> GuardKind; // `allow_ubsan_check` argument.
 };

   BoundsCheckingPass(Options Opts) : Opts(Opts) {}
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index aac4407740055..f923d5aabe0a0 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -1315,10 +1315,18 @@ parseBoundsCheckingOptions(StringRef Params) {
     } else if (ParamName == "merge") {
       Options.Merge = true;
     } else {
-      return make_error<StringError>(
-          formatv("invalid BoundsChecking pass parameter '{0}' ", ParamName)
-              .str(),
-          inconvertibleErrorCode());
+      StringRef ParamEQ;
+      StringRef Val;
+      std::tie(ParamEQ, Val) = ParamName.split('=');
+      int8_t Id = 0;
+      if (ParamEQ == "guard" && !Val.getAsInteger(0, Id)) {
+        Options.GuardKind = Id;
+      } else {
+        return make_error<StringError>(
+            formatv("invalid BoundsChecking pass parameter '{0}' ", ParamName)
+                .str(),
+            inconvertibleErrorCode());
+      }
     }
   }
   return Options;
diff --git a/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp b/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp
index 8004552250b47..609678f9979c6 100644
--- a/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp
+++ b/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp
@@ -214,8 +214,15 @@ static bool addBoundsChecking(Function &F, TargetLibraryInfo &TLI,
       Or = getBoundsCheckCond(AI->getPointerOperand(), AI->getValOperand(),
                               DL, TLI, ObjSizeEval, IRB, SE);
     }
-    if (Or)
+    if (Or) {
+      if (Opts.GuardKind) {
+        llvm::Value *Allow = IRB.CreateIntrinsic(
+            IRB.getInt1Ty(), Intrinsic::allow_ubsan_check,
+            {llvm::ConstantInt::getSigned(IRB.getInt8Ty(), *Opts.GuardKind)});
+        Or = IRB.CreateAnd(Or, Allow);
+      }
       TrapInfo.push_back(std::make_pair(&I, Or));
+    }
   }

   std::string Name;
@@ -299,5 +306,7 @@ void BoundsCheckingPass::printPipeline(
   }
   if (Opts.Merge)
     OS << ";merge";
+  if (Opts.GuardKind)
+    OS << ";guard=" << static_cast<int>(*Opts.GuardKind);
   OS << ">";
 }
diff --git a/llvm/test/Instrumentation/BoundsChecking/runtimes.ll b/llvm/test/Instrumentation/BoundsChecking/runtimes.ll
index ccc7e93615fed..7cf78a5d54e71 100644
--- a/llvm/test/Instrumentation/BoundsChecking/runtimes.ll
+++ b/llvm/test/Instrumentation/BoundsChecking/runtimes.ll
@@ -9,6 +9,8 @@
 ; RUN: opt < %s -passes='bounds-checking' -S | FileCheck %s --check-prefixes=MINRT-NOMERGE
 ; RUN: opt < %s -passes='bounds-checking' -S | FileCheck %s --check-prefixes=MINRTABORT-NOMERGE
 ;
+; RUN: opt < %s -passes='bounds-checking<trap;guard=3>' -S | FileCheck %s --check-prefixes=TR-GUARD
+; RUN: opt < %s -passes='bounds-checking<rt;guard=-5>' -S | FileCheck %s --check-prefixes=RT-GUARD
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"

 define void @f1(i64 %x) nounwind {
@@ -123,6 +125,42 @@ define void @f1(i64 %x) nounwind {
 ; MINRTABORT-NOMERGE: [[TRAP]]:
 ; MINRTABORT-NOMERGE-NEXT: call void @__ubsan_handle_local_out_of_bounds_minimal_abort() #[[ATTR2:[0-9]+]], !nosanitize [[META0]]
 ; MINRTABORT-NOMERGE-NEXT: unreachable, !nosanitize [[META0]]
+;
+; TR-GUARD-LABEL: define void @f1(
+; TR-GUARD-SAME: i64 [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+; TR-GUARD-NEXT: [[TMP1:%.*]] = mul i64 16, [[X]]
+; TR-GUARD-NEXT: [[TMP2:%.*]] = alloca i128, i64 [[X]], align 8
+; TR-GUARD-NEXT: [[TMP3:%.*]] = sub i64 [[TMP1]], 0, !nosanitize [[META0:![0-9]+]]
+; TR-GUARD-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 16, !nosanitize [[META0]]
+; TR-GUARD-NEXT: [[TMP5:%.*]] = or i1 false, [[TMP4]], !nosanitize [[META0]]
+; TR-GUARD-NEXT: [[TMP6:%.*]] = or i1 false, [[TMP5]], !nosanitize [[META0]]
+; TR-GUARD-NEXT: [[TMP7:%.*]] = call i1
@llvm.allow.ubsan.check(i8 3), !nosanitize [[META0]] +; TR-GUARD-NEXT: [[TMP8:%.*]] = and i1 [[TMP6]], [[TMP7]], !nosanitize [[META0]] +; TR-GUARD-NEXT: br i1 [[TMP8]], label %[[TRAP:.*]], label %[[BB9:.*]] +; TR-GUARD: [[BB9]]: +; TR-GUARD-NEXT: [[TMP10:%.*]] = load i128, ptr [[TMP2]], align 4 +; TR-GUARD-NEXT: ret void +; TR-GUARD: [[TRAP]]: +; TR-GUARD-NEXT: call void @llvm.ubsantrap(i8 3) #[[ATTR3:[0-9]+]], !nosanitize [[META0]] +; TR-GUARD-NEXT: unreachable, !nosanitize [[META0]] +; +; RT-GUARD-LABEL: define void @f1( +; RT-GUARD-SAME: i64 [[X:%.*]]) #[[ATTR0:[0-9]+]] { +; RT-GUARD-NEXT: [[TMP1:%.*]] = mul i64 16, [[X]] +; RT-GUARD-NEXT: [[TMP2:%.*]] = alloca i128, i64 [[X]], align 8 +; RT-GUARD-NEXT: [[TMP3:%.*]] = sub i64 [[TMP1]], 0, !nosanitize [[META0:![0-9]+]] +; RT-GUARD-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 16, !nosanitize [[META0]] +; RT-GUARD-NEXT: [[TMP5:%.*]] = or i1 false, [[TMP4]], !nosanitize [[META0]] +; RT-GUARD-NEXT: [[TMP6:%.*]] = or i1 false, [[TMP5]], !nosanitize [[META0]] +; RT-GUARD-NEXT: [[TMP7:%.*]] = call i1 @llvm.allow.ubsan.check(i8 -5), !nosanitize [[META0]] +; RT-GUARD-NEXT: [[TMP8:%.*]] = and i1 [[TMP6]], [[TMP7]], !nosanitize [[META0]] +; RT-GUARD-NEXT: br i1 [[TMP8]], label %[[TRAP:.*]], label %[[BB9:.*]] +; RT-GUARD: [[BB9]]: +; RT-GUARD-NEXT: [[TMP10:%.*]] = load i128, ptr [[TMP2]], align 4 +; RT-GUARD-NEXT: ret void +; RT-GUARD: [[TRAP]]: +; RT-GUARD-NEXT: call void @__ubsan_handle_local_out_of_bounds() #[[ATTR2:[0-9]+]], !nosanitize [[META0]] +; RT-GUARD-NEXT: br label %[[BB9]], !nosanitize [[META0]] ; %1 = alloca i128, i64 %x %3 = load i128, ptr %1, align 4 @@ -154,6 +192,15 @@ define void @f1(i64 %x) nounwind { ; MINRTABORT-NOMERGE: attributes #[[ATTR1:[0-9]+]] = { noreturn nounwind } ; MINRTABORT-NOMERGE: attributes #[[ATTR2]] = { nomerge noreturn nounwind } ;. +; TR-GUARD: attributes #[[ATTR0]] = { nounwind } +; TR-GUARD: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) } +; TR-GUARD: attributes #[[ATTR2:[0-9]+]] = { cold noreturn nounwind } +; TR-GUARD: attributes #[[ATTR3]] = { nomerge noreturn nounwind } +;. +; RT-GUARD: attributes #[[ATTR0]] = { nounwind } +; RT-GUARD: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) } +; RT-GUARD: attributes #[[ATTR2]] = { nomerge nounwind } +;. ; TR: [[META0]] = !{} ;. ; RT: [[META0]] = !{} @@ -168,3 +215,7 @@ define void @f1(i64 %x) nounwind { ;. ; MINRTABORT-NOMERGE: [[META0]] = !{} ;. +; TR-GUARD: [[META0]] = !{} +;. +; RT-GUARD: [[META0]] = !{} +;. 
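A hypothetical usage sketch for the new parameter (not part of this
series; the guard value 71 matches SanitizerKind::SO_LocalBounds, the
ordinal the clang change later in the series passes for local bounds):

  ; RUN: opt < %s -passes='bounds-checking<rt;guard=71>' -S | FileCheck %s
  define void @guarded(i64 %x) nounwind {
    %p = alloca i128, i64 %x
    ; The bounds condition is gated on llvm.allow.ubsan.check before
    ; branching to the runtime handler:
    ; CHECK: call i1 @llvm.allow.ubsan.check(i8 71)
    ; CHECK: call void @__ubsan_handle_local_out_of_bounds()
    %v = load i128, ptr %p, align 4
    ret void
  }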
From ea135e10b3ab67e4912ae131576d27d85deced8a Mon Sep 17 00:00:00 2001
From: Andreas Jonson
Date: Mon, 13 Jan 2025 18:48:10 +0100
Subject: [PATCH 087/102] [InstCombine] Test for trunc in align assume (NFC)

---
 llvm/test/Transforms/InstCombine/assume.ll | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/llvm/test/Transforms/InstCombine/assume.ll b/llvm/test/Transforms/InstCombine/assume.ll
index 52f0adf02a396..2d7bc49b6dcae 100644
--- a/llvm/test/Transforms/InstCombine/assume.ll
+++ b/llvm/test/Transforms/InstCombine/assume.ll
@@ -34,6 +34,23 @@ define i32 @foo1(ptr %a) #0 {
   ret i32 %t0
 }

+define i32 @align_assume_trunc_cond(ptr %a) #0 {
+; CHECK-LABEL: @align_assume_trunc_cond(
+; CHECK-NEXT: [[T0:%.*]] = load i32, ptr [[A:%.*]], align 4
+; CHECK-NEXT: [[PTRINT:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT: [[TRUNC:%.*]] = trunc i64 [[PTRINT]] to i1
+; CHECK-NEXT: [[MASKCOND:%.*]] = xor i1 [[TRUNC]], true
+; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]])
+; CHECK-NEXT: ret i32 [[T0]]
+;
+  %t0 = load i32, ptr %a, align 4
+  %ptrint = ptrtoint ptr %a to i64
+  %trunc = trunc i64 %ptrint to i1
+  %maskcond = xor i1 %trunc, true
+  tail call void @llvm.assume(i1 %maskcond)
+  ret i32 %t0
+}
+
 ; Same check as in @foo1, but make sure it works if the assume is first too.

 define i32 @foo2(ptr %a) #0 {

From fec04dc16715899dd137a0463b49bb00a1b74f33 Mon Sep 17 00:00:00 2001
From: Vitaly Buka
Date: Mon, 13 Jan 2025 09:55:44 -0800
Subject: [PATCH 088/102] [ubsan] Pass fsanitize-skip-hot-cutoff into
 -fsanitize=bounds (#122576)

---
 clang/lib/CodeGen/BackendUtil.cpp      | 12 ++++++++
 clang/lib/CodeGen/CGExpr.cpp           |  4 ++-
 clang/test/CodeGen/allow-ubsan-check.c | 38 +++++++++++++++-----------
 3 files changed, 37 insertions(+), 17 deletions(-)

diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
index bcf6db1467ffc..79e6bf3d24dff 100644
--- a/clang/lib/CodeGen/BackendUtil.cpp
+++ b/clang/lib/CodeGen/BackendUtil.cpp
@@ -85,6 +85,7 @@
 #include "llvm/Transforms/Scalar/JumpThreading.h"
 #include "llvm/Transforms/Utils/Debugify.h"
 #include "llvm/Transforms/Utils/ModuleUtils.h"
+#include <limits>
 #include <memory>
 #include <optional>
 using namespace clang;
@@ -119,6 +120,9 @@ static cl::opt<PGOOptions::ColdFuncOpt> ClPGOColdFuncAttr(
 extern cl::opt<InstrProfCorrelator::ProfCorrelatorKind> ProfileCorrelate;
 } // namespace llvm
+namespace clang {
+extern llvm::cl::opt<bool> ClSanitizeGuardChecks;
+}

 namespace {

@@ -1023,6 +1027,14 @@ void EmitAssemblyHelper::RunOptimizationPipeline(
     PB.registerScalarOptimizerLateEPCallback([this](FunctionPassManager &FPM,
                                                     OptimizationLevel Level) {
       BoundsCheckingPass::Options Options;
+      if (CodeGenOpts.SanitizeSkipHotCutoffs[SanitizerKind::SO_LocalBounds] ||
+          ClSanitizeGuardChecks) {
+        static_assert(SanitizerKind::SO_LocalBounds <=
+                          std::numeric_limits<
+                              decltype(Options.GuardKind)::value_type>::max(),
+                      "Update type of llvm.allow.ubsan.check.");
+        Options.GuardKind = SanitizerKind::SO_LocalBounds;
+      }
       Options.Merge =
           CodeGenOpts.SanitizeMergeHandlers.has(SanitizerKind::LocalBounds);
       if (!CodeGenOpts.SanitizeTrap.has(SanitizerKind::LocalBounds)) {
diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
index 060d02b7f1487..6e5a21c8f01e7 100644
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -52,11 +52,13 @@
 using namespace clang;
 using namespace CodeGen;

+namespace clang {
 // TODO: Introduce frontend options to enabled per sanitizers, similar to
 // `fsanitize-trap`.
-static llvm::cl::opt ClSanitizeGuardChecks( +llvm::cl::opt ClSanitizeGuardChecks( "ubsan-guard-checks", llvm::cl::Optional, llvm::cl::desc("Guard UBSAN checks with `llvm.allow.ubsan.check()`.")); +} // namespace clang //===--------------------------------------------------------------------===// // Defines for metadata diff --git a/clang/test/CodeGen/allow-ubsan-check.c b/clang/test/CodeGen/allow-ubsan-check.c index fb264ce32ab99..38b4848c1edc1 100644 --- a/clang/test/CodeGen/allow-ubsan-check.c +++ b/clang/test/CodeGen/allow-ubsan-check.c @@ -174,12 +174,14 @@ void use(double*); // CHECK-NEXT: [[VLA:%.*]] = alloca double, i64 [[TMP0]], align 16 // CHECK-NEXT: call void @use(ptr noundef nonnull [[VLA]]) #[[ATTR7:[0-9]+]] // CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[I]] to i64 -// CHECK-NEXT: [[DOTNOT:%.*]] = icmp ugt i64 [[TMP0]], [[IDXPROM]] -// CHECK-NEXT: br i1 [[DOTNOT]], label %[[BB1:.*]], label %[[TRAP:.*]] -// CHECK: [[BB1]]: +// CHECK-NEXT: [[TMP1:%.*]] = icmp ule i64 [[TMP0]], [[IDXPROM]] +// CHECK-NEXT: [[TMP2:%.*]] = call i1 @llvm.allow.ubsan.check(i8 71), !nosanitize [[META2]] +// CHECK-NEXT: [[TMP3:%.*]] = and i1 [[TMP1]], [[TMP2]], !nosanitize [[META2]] +// CHECK-NEXT: br i1 [[TMP3]], label %[[TRAP:.*]], label %[[BB4:.*]] +// CHECK: [[BB4]]: // CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[VLA]], i64 [[IDXPROM]] -// CHECK-NEXT: [[TMP2:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA8:![0-9]+]] -// CHECK-NEXT: ret double [[TMP2]] +// CHECK-NEXT: [[TMP5:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA8:![0-9]+]] +// CHECK-NEXT: ret double [[TMP5]] // CHECK: [[TRAP]]: // CHECK-NEXT: call void @__ubsan_handle_local_out_of_bounds_abort() #[[ATTR6]], !nosanitize [[META2]] // CHECK-NEXT: unreachable, !nosanitize [[META2]] @@ -191,12 +193,14 @@ void use(double*); // TR-NEXT: [[VLA:%.*]] = alloca double, i64 [[TMP0]], align 16 // TR-NEXT: call void @use(ptr noundef nonnull [[VLA]]) #[[ATTR6:[0-9]+]] // TR-NEXT: [[IDXPROM:%.*]] = sext i32 [[I]] to i64 -// TR-NEXT: [[DOTNOT:%.*]] = icmp ugt i64 [[TMP0]], [[IDXPROM]] -// TR-NEXT: br i1 [[DOTNOT]], label %[[BB1:.*]], label %[[TRAP:.*]] -// TR: [[BB1]]: +// TR-NEXT: [[TMP1:%.*]] = icmp ule i64 [[TMP0]], [[IDXPROM]] +// TR-NEXT: [[TMP2:%.*]] = call i1 @llvm.allow.ubsan.check(i8 71), !nosanitize [[META2]] +// TR-NEXT: [[TMP3:%.*]] = and i1 [[TMP1]], [[TMP2]], !nosanitize [[META2]] +// TR-NEXT: br i1 [[TMP3]], label %[[TRAP:.*]], label %[[BB4:.*]] +// TR: [[BB4]]: // TR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[VLA]], i64 [[IDXPROM]] -// TR-NEXT: [[TMP2:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA7:![0-9]+]] -// TR-NEXT: ret double [[TMP2]] +// TR-NEXT: [[TMP5:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA7:![0-9]+]] +// TR-NEXT: ret double [[TMP5]] // TR: [[TRAP]]: // TR-NEXT: call void @llvm.ubsantrap(i8 3) #[[ATTR5]], !nosanitize [[META2]] // TR-NEXT: unreachable, !nosanitize [[META2]] @@ -208,15 +212,17 @@ void use(double*); // REC-NEXT: [[VLA:%.*]] = alloca double, i64 [[TMP0]], align 16 // REC-NEXT: call void @use(ptr noundef nonnull [[VLA]]) #[[ATTR5:[0-9]+]] // REC-NEXT: [[IDXPROM:%.*]] = sext i32 [[I]] to i64 -// REC-NEXT: [[DOTNOT:%.*]] = icmp ugt i64 [[TMP0]], [[IDXPROM]] -// REC-NEXT: br i1 [[DOTNOT]], label %[[BB1:.*]], label %[[TRAP:.*]] -// REC: [[BB1]]: +// REC-NEXT: [[TMP1:%.*]] = icmp ule i64 [[TMP0]], [[IDXPROM]] +// REC-NEXT: [[TMP2:%.*]] = call i1 @llvm.allow.ubsan.check(i8 71), !nosanitize [[META2]] +// REC-NEXT: [[TMP3:%.*]] 
= and i1 [[TMP1]], [[TMP2]], !nosanitize [[META2]] +// REC-NEXT: br i1 [[TMP3]], label %[[TRAP:.*]], label %[[BB4:.*]] +// REC: [[BB4]]: // REC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[VLA]], i64 [[IDXPROM]] -// REC-NEXT: [[TMP2:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA8:![0-9]+]] -// REC-NEXT: ret double [[TMP2]] +// REC-NEXT: [[TMP5:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA8:![0-9]+]] +// REC-NEXT: ret double [[TMP5]] // REC: [[TRAP]]: // REC-NEXT: call void @__ubsan_handle_local_out_of_bounds() #[[ATTR6]], !nosanitize [[META2]] -// REC-NEXT: br label %[[BB1]], !nosanitize [[META2]] +// REC-NEXT: br label %[[BB4]], !nosanitize [[META2]] // double lbounds(int b, int i) { double a[b]; From 6abba988de374e4948c1edd920b6ec886b40b47a Mon Sep 17 00:00:00 2001 From: jimingham Date: Mon, 13 Jan 2025 10:08:50 -0800 Subject: [PATCH 089/102] The _code field in an NSError is signed, not unsigned. (#119764) The NSError summary provider was fetching and printing the `_code` field as an unsigned integer, but it's defined to be an NSInteger, which is signed. --- lldb/source/Plugins/Language/ObjC/NSError.cpp | 10 +++++----- .../TestDataFormatterObjCNSError.py | 6 ++++-- .../data-formatter/data-formatter-objc/main.m | 2 +- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/lldb/source/Plugins/Language/ObjC/NSError.cpp b/lldb/source/Plugins/Language/ObjC/NSError.cpp index 2356bc4ef4bab..bb54044ae1d61 100644 --- a/lldb/source/Plugins/Language/ObjC/NSError.cpp +++ b/lldb/source/Plugins/Language/ObjC/NSError.cpp @@ -66,8 +66,8 @@ bool lldb_private::formatters::NSError_SummaryProvider( lldb::addr_t domain_location = ptr_value + 3 * ptr_size; Status error; - uint64_t code = process_sp->ReadUnsignedIntegerFromMemory(code_location, - ptr_size, 0, error); + int64_t code = process_sp->ReadSignedIntegerFromMemory(code_location, + ptr_size, 0, error); if (error.Fail()) return false; @@ -77,7 +77,7 @@ bool lldb_private::formatters::NSError_SummaryProvider( return false; if (!domain_str_value) { - stream.Printf("domain: nil - code: %" PRIu64, code); + stream.Printf("domain: nil - code: %" PRIi64, code); return true; } @@ -98,11 +98,11 @@ bool lldb_private::formatters::NSError_SummaryProvider( StreamString domain_str_summary; if (NSStringSummaryProvider(*domain_str_sp, domain_str_summary, options) && !domain_str_summary.Empty()) { - stream.Printf("domain: %s - code: %" PRIu64, domain_str_summary.GetData(), + stream.Printf("domain: %s - code: %" PRIi64, domain_str_summary.GetData(), code); return true; } else { - stream.Printf("domain: nil - code: %" PRIu64, code); + stream.Printf("domain: nil - code: %" PRIi64, code); return true; } } diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-objc/TestDataFormatterObjCNSError.py b/lldb/test/API/functionalities/data-formatter/data-formatter-objc/TestDataFormatterObjCNSError.py index 8a052cf84ef0e..de15e5915750b 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-objc/TestDataFormatterObjCNSError.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-objc/TestDataFormatterObjCNSError.py @@ -23,10 +23,12 @@ def test_nserror_with_run_command_no_const(self): self.appkit_tester_impl(self.nserror_data_formatter_commands, False) def nserror_data_formatter_commands(self): - self.expect("frame variable nserror", substrs=['domain: @"Foobar" - code: 12']) + self.expect( + "frame variable nserror", substrs=['domain: @"Foobar" - code: -1234'] + ) self.expect( - 
"frame variable nserrorptr", substrs=['domain: @"Foobar" - code: 12'] + "frame variable nserrorptr", substrs=['domain: @"Foobar" - code: -1234'] ) self.expect("frame variable nserror->_userInfo", substrs=["2 key/value pairs"]) diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-objc/main.m b/lldb/test/API/functionalities/data-formatter/data-formatter-objc/main.m index 0ca5cf98bd3a5..314bada49303d 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-objc/main.m +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-objc/main.m @@ -618,7 +618,7 @@ int main(int argc, const char *argv[]) { NSDictionary *error_userInfo = @{@"a" : @1, @"b" : @2}; NSError *nserror = [[NSError alloc] initWithDomain:@"Foobar" - code:12 + code:-1234 userInfo:error_userInfo]; NSError **nserrorptr = &nserror; From 9c4286b08ad6b8d70d0c0f0a965e2484a7f0c236 Mon Sep 17 00:00:00 2001 From: joaosaffran <126493771+joaosaffran@users.noreply.github.com> Date: Mon, 13 Jan 2025 10:31:25 -0800 Subject: [PATCH 090/102] [HLSL] Adding Flatten and Branch if attributes with test fixes (#122157) - Adding the changes from PRs: - #116331 - #121852 - Fixes test `tools/dxil-dis/debug-info.ll` - Address some missed comments in the previous PR --------- Co-authored-by: joaosaffran --- clang/include/clang/Basic/Attr.td | 10 ++ clang/lib/CodeGen/CGStmt.cpp | 6 ++ clang/lib/CodeGen/CodeGenFunction.cpp | 26 ++++- clang/lib/CodeGen/CodeGenFunction.h | 4 + clang/lib/Sema/SemaStmtAttr.cpp | 8 ++ clang/test/AST/HLSL/HLSLControlFlowHint.hlsl | 43 ++++++++ .../test/CodeGenHLSL/HLSLControlFlowHint.hlsl | 48 +++++++++ llvm/include/llvm/IR/IntrinsicsSPIRV.td | 2 +- .../Target/DirectX/DXILTranslateMetadata.cpp | 36 +++++++ .../Target/SPIRV/SPIRVInstructionSelector.cpp | 29 ++++-- llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp | 44 ++++++--- .../CodeGen/DirectX/HLSLControlFlowHint.ll | 98 +++++++++++++++++++ .../HLSLControlFlowHint-pass-check.ll | 90 +++++++++++++++++ .../SPIRV/structurizer/HLSLControlFlowHint.ll | 91 +++++++++++++++++ 14 files changed, 516 insertions(+), 19 deletions(-) create mode 100644 clang/test/AST/HLSL/HLSLControlFlowHint.hlsl create mode 100644 clang/test/CodeGenHLSL/HLSLControlFlowHint.hlsl create mode 100644 llvm/test/CodeGen/DirectX/HLSLControlFlowHint.ll create mode 100644 llvm/test/CodeGen/SPIRV/structurizer/HLSLControlFlowHint-pass-check.ll create mode 100644 llvm/test/CodeGen/SPIRV/structurizer/HLSLControlFlowHint.ll diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index c0632aaa51625..a752d94b06fad 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -4353,6 +4353,16 @@ def HLSLLoopHint: StmtAttr { let Documentation = [HLSLLoopHintDocs, HLSLUnrollHintDocs]; } +def HLSLControlFlowHint: StmtAttr { + /// [branch] + /// [flatten] + let Spellings = [Microsoft<"branch">, Microsoft<"flatten">]; + let Subjects = SubjectList<[IfStmt], + ErrorDiag, "'if' statements">; + let LangOpts = [HLSL]; + let Documentation = [InternalOnly]; +} + def CapturedRecord : InheritableAttr { // This attribute has no spellings as it is only ever created implicitly. 
let Spellings = []; diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp index f9258a396b7d0..4ba8ee1ca17d4 100644 --- a/clang/lib/CodeGen/CGStmt.cpp +++ b/clang/lib/CodeGen/CGStmt.cpp @@ -760,6 +760,8 @@ void CodeGenFunction::EmitAttributedStmt(const AttributedStmt &S) { bool noinline = false; bool alwaysinline = false; bool noconvergent = false; + HLSLControlFlowHintAttr::Spelling flattenOrBranch = + HLSLControlFlowHintAttr::SpellingNotCalculated; const CallExpr *musttail = nullptr; for (const auto *A : S.getAttrs()) { @@ -791,6 +793,9 @@ void CodeGenFunction::EmitAttributedStmt(const AttributedStmt &S) { Builder.CreateAssumption(AssumptionVal); } } break; + case attr::HLSLControlFlowHint: { + flattenOrBranch = cast(A)->getSemanticSpelling(); + } break; } } SaveAndRestore save_nomerge(InNoMergeAttributedStmt, nomerge); @@ -798,6 +803,7 @@ void CodeGenFunction::EmitAttributedStmt(const AttributedStmt &S) { SaveAndRestore save_alwaysinline(InAlwaysInlineAttributedStmt, alwaysinline); SaveAndRestore save_noconvergent(InNoConvergentAttributedStmt, noconvergent); SaveAndRestore save_musttail(MustTailCall, musttail); + SaveAndRestore save_flattenOrBranch(HLSLControlFlowAttr, flattenOrBranch); EmitStmt(S.getSubStmt(), S.getAttrs()); } diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp index d6f3716afabdf..11fdddba1144b 100644 --- a/clang/lib/CodeGen/CodeGenFunction.cpp +++ b/clang/lib/CodeGen/CodeGenFunction.cpp @@ -40,6 +40,7 @@ #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/FPEnv.h" +#include "llvm/IR/Instruction.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/MDBuilder.h" @@ -2086,7 +2087,30 @@ void CodeGenFunction::EmitBranchOnBoolExpr( Weights = createProfileWeights(TrueCount, CurrentCount - TrueCount); } - Builder.CreateCondBr(CondV, TrueBlock, FalseBlock, Weights, Unpredictable); + llvm::Instruction *BrInst = Builder.CreateCondBr(CondV, TrueBlock, FalseBlock, + Weights, Unpredictable); + switch (HLSLControlFlowAttr) { + case HLSLControlFlowHintAttr::Microsoft_branch: + case HLSLControlFlowHintAttr::Microsoft_flatten: { + llvm::MDBuilder MDHelper(CGM.getLLVMContext()); + + llvm::ConstantInt *BranchHintConstant = + HLSLControlFlowAttr == + HLSLControlFlowHintAttr::Spelling::Microsoft_branch + ? llvm::ConstantInt::get(CGM.Int32Ty, 1) + : llvm::ConstantInt::get(CGM.Int32Ty, 2); + + SmallVector Vals( + {MDHelper.createString("hlsl.controlflow.hint"), + MDHelper.createConstant(BranchHintConstant)}); + BrInst->setMetadata("hlsl.controlflow.hint", + llvm::MDNode::get(CGM.getLLVMContext(), Vals)); + break; + } + // This is required to avoid warnings during compilation + case HLSLControlFlowHintAttr::SpellingNotCalculated: + break; + } } /// ErrorUnsupported - Print out an error that codegen doesn't support the diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index 311f2ae94d046..b115c15bf01a9 100644 --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -615,6 +615,10 @@ class CodeGenFunction : public CodeGenTypeCache { /// True if the current statement has noconvergent attribute. bool InNoConvergentAttributedStmt = false; + /// HLSL Branch attribute. + HLSLControlFlowHintAttr::Spelling HLSLControlFlowAttr = + HLSLControlFlowHintAttr::SpellingNotCalculated; + // The CallExpr within the current statement that the musttail attribute // applies to. 
nullptr if there is no 'musttail' on the current statement. const CallExpr *MustTailCall = nullptr; diff --git a/clang/lib/Sema/SemaStmtAttr.cpp b/clang/lib/Sema/SemaStmtAttr.cpp index 106e2430de901..422d8abc1028a 100644 --- a/clang/lib/Sema/SemaStmtAttr.cpp +++ b/clang/lib/Sema/SemaStmtAttr.cpp @@ -619,6 +619,12 @@ static Attr *handleHLSLLoopHintAttr(Sema &S, Stmt *St, const ParsedAttr &A, return ::new (S.Context) HLSLLoopHintAttr(S.Context, A, UnrollFactor); } +static Attr *handleHLSLControlFlowHint(Sema &S, Stmt *St, const ParsedAttr &A, + SourceRange Range) { + + return ::new (S.Context) HLSLControlFlowHintAttr(S.Context, A); +} + static Attr *ProcessStmtAttribute(Sema &S, Stmt *St, const ParsedAttr &A, SourceRange Range) { if (A.isInvalid() || A.getKind() == ParsedAttr::IgnoredAttribute) @@ -655,6 +661,8 @@ static Attr *ProcessStmtAttribute(Sema &S, Stmt *St, const ParsedAttr &A, return handleLoopHintAttr(S, St, A, Range); case ParsedAttr::AT_HLSLLoopHint: return handleHLSLLoopHintAttr(S, St, A, Range); + case ParsedAttr::AT_HLSLControlFlowHint: + return handleHLSLControlFlowHint(S, St, A, Range); case ParsedAttr::AT_OpenCLUnrollHint: return handleOpenCLUnrollHint(S, St, A, Range); case ParsedAttr::AT_Suppress: diff --git a/clang/test/AST/HLSL/HLSLControlFlowHint.hlsl b/clang/test/AST/HLSL/HLSLControlFlowHint.hlsl new file mode 100644 index 0000000000000..a36779c05fbc9 --- /dev/null +++ b/clang/test/AST/HLSL/HLSLControlFlowHint.hlsl @@ -0,0 +1,43 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-compute -ast-dump %s | FileCheck %s + +// CHECK: FunctionDecl 0x{{[0-9A-Fa-f]+}} <{{.*}}> {{.*}} used branch 'int (int)' +// CHECK: AttributedStmt 0x{{[0-9A-Fa-f]+}} < +// CHECK-NEXT: -HLSLControlFlowHintAttr 0x{{[0-9A-Fa-f]+}} <{{.*}}> branch +export int branch(int X){ + int resp; + [branch] if (X > 0) { + resp = -X; + } else { + resp = X * 2; + } + + return resp; +} + +// CHECK: FunctionDecl 0x{{[0-9A-Fa-f]+}} <{{.*}}> {{.*}} used flatten 'int (int)' +// CHECK: AttributedStmt 0x{{[0-9A-Fa-f]+}} < +// CHECK-NEXT: -HLSLControlFlowHintAttr 0x{{[0-9A-Fa-f]+}} <{{.*}}> flatten +export int flatten(int X){ + int resp; + [flatten] if (X > 0) { + resp = -X; + } else { + resp = X * 2; + } + + return resp; +} + +// CHECK: FunctionDecl 0x{{[0-9A-Fa-f]+}} <{{.*}}> {{.*}} used no_attr 'int (int)' +// CHECK-NOT: AttributedStmt 0x{{[0-9A-Fa-f]+}} < +// CHECK-NOT: -HLSLControlFlowHintAttr +export int no_attr(int X){ + int resp; + if (X > 0) { + resp = -X; + } else { + resp = X * 2; + } + + return resp; +} diff --git a/clang/test/CodeGenHLSL/HLSLControlFlowHint.hlsl b/clang/test/CodeGenHLSL/HLSLControlFlowHint.hlsl new file mode 100644 index 0000000000000..aa13b27581850 --- /dev/null +++ b/clang/test/CodeGenHLSL/HLSLControlFlowHint.hlsl @@ -0,0 +1,48 @@ +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple spirv-vulkan-library %s -fnative-half-type -emit-llvm -o - | FileCheck %s + +// CHECK: define {{.*}} i32 {{.*}}test_branch{{.*}}(i32 {{.*}} [[VALD:%.*]]) +// CHECK: [[PARAM:%.*]] = load i32, ptr [[VALD]].addr, align 4 +// CHECK: [[CMP:%.*]] = icmp sgt i32 [[PARAM]], 0 +// CHECK: br i1 [[CMP]], label %if.then, label %if.else, !hlsl.controlflow.hint [[HINT_BRANCH:![0-9]+]] +export int test_branch(int X){ + int resp; + [branch] if (X > 0) { + resp = -X; + } else { + resp = X * 2; + } + + return resp; +} + +// CHECK: define {{.*}} i32 
{{.*}}test_flatten{{.*}}(i32 {{.*}} [[VALD:%.*]]) +// CHECK: [[PARAM:%.*]] = load i32, ptr [[VALD]].addr, align 4 +// CHECK: [[CMP:%.*]] = icmp sgt i32 [[PARAM]], 0 +// CHECK: br i1 [[CMP]], label %if.then, label %if.else, !hlsl.controlflow.hint [[HINT_FLATTEN:![0-9]+]] +export int test_flatten(int X){ + int resp; + [flatten] if (X > 0) { + resp = -X; + } else { + resp = X * 2; + } + + return resp; +} + +// CHECK: define {{.*}} i32 {{.*}}test_no_attr{{.*}}(i32 {{.*}} [[VALD:%.*]]) +// CHECK-NOT: !hlsl.controlflow.hint +export int test_no_attr(int X){ + int resp; + if (X > 0) { + resp = -X; + } else { + resp = X * 2; + } + + return resp; +} + +//CHECK: [[HINT_BRANCH]] = !{!"hlsl.controlflow.hint", i32 1} +//CHECK: [[HINT_FLATTEN]] = !{!"hlsl.controlflow.hint", i32 2} diff --git a/llvm/include/llvm/IR/IntrinsicsSPIRV.td b/llvm/include/llvm/IR/IntrinsicsSPIRV.td index b4d2dce66a6f0..37057271b6c28 100644 --- a/llvm/include/llvm/IR/IntrinsicsSPIRV.td +++ b/llvm/include/llvm/IR/IntrinsicsSPIRV.td @@ -33,7 +33,7 @@ let TargetPrefix = "spv" in { def int_spv_ptrcast : Intrinsic<[llvm_any_ty], [llvm_any_ty, llvm_metadata_ty, llvm_i32_ty], [ImmArg<ArgIndex<2>>]>; def int_spv_switch : Intrinsic<[], [llvm_any_ty, llvm_vararg_ty]>; def int_spv_loop_merge : Intrinsic<[], [llvm_vararg_ty]>; - def int_spv_selection_merge : Intrinsic<[], [llvm_vararg_ty]>; + def int_spv_selection_merge : Intrinsic<[], [llvm_any_ty, llvm_i32_ty], [ImmArg<ArgIndex<1>>]>; def int_spv_cmpxchg : Intrinsic<[llvm_i32_ty], [llvm_any_ty, llvm_vararg_ty]>; def int_spv_unreachable : Intrinsic<[], []>; def int_spv_alloca : Intrinsic<[llvm_any_ty], [llvm_i8_ty], [ImmArg<ArgIndex<0>>]>; diff --git a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp index 5afe6b2d2883d..5fd5c226eef89 100644 --- a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp +++ b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp @@ -15,12 +15,14 @@ #include "llvm/ADT/Twine.h" #include "llvm/Analysis/DXILMetadataAnalysis.h" #include "llvm/Analysis/DXILResource.h" +#include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/DiagnosticPrinter.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/LLVMContext.h" +#include "llvm/IR/MDBuilder.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/InitializePasses.h" @@ -300,6 +302,38 @@ static MDTuple *emitTopLevelLibraryNode(Module &M, MDNode *RMD, return constructEntryMetadata(nullptr, nullptr, RMD, Properties, Ctx); } +// TODO: We might need to refactor this to be more generic, +// in case we need more metadata to be replaced.
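Before the function itself, a minimal sketch of the producer side of this metadata contract, mirroring the clang CodeGen hunk earlier in the patch. The free-standing helper name attachControlFlowHint is an illustrative assumption, not upstream API; the node shape it builds (an MDString tag plus an i32 payload of 1 for [branch] or 2 for [flatten]) is what translateBranchMetadata below consumes and rewrites to dx.controlflow.hints.

    #include "llvm/IR/Constants.h"
    #include "llvm/IR/Instructions.h"
    #include "llvm/IR/MDBuilder.h"
    #include "llvm/IR/Metadata.h"
    #include "llvm/IR/Type.h"

    // Illustrative sketch (not upstream API): tag a conditional branch with
    // the two-operand hint node described above. Hint is 1 for [branch] and
    // 2 for [flatten], matching the constants emitted by clang CodeGen.
    static void attachControlFlowHint(llvm::BranchInst *Br, unsigned Hint) {
      llvm::LLVMContext &Ctx = Br->getContext();
      llvm::MDBuilder MDHelper(Ctx);
      llvm::Metadata *Vals[] = {
          MDHelper.createString("hlsl.controlflow.hint"),
          MDHelper.createConstant(
              llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), Hint))};
      Br->setMetadata("hlsl.controlflow.hint", llvm::MDNode::get(Ctx, Vals));
    }

Keeping the hint as string-keyed instruction metadata means ordinary IR passes carry it along untouched, and each backend can rename or drop it on its own terms, as the DXIL and SPIR-V hunks in this patch do.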
+static void translateBranchMetadata(Module &M) { + for (Function &F : M) { + for (BasicBlock &BB : F) { + Instruction *BBTerminatorInst = BB.getTerminator(); + + MDNode *HlslControlFlowMD = + BBTerminatorInst->getMetadata("hlsl.controlflow.hint"); + + if (!HlslControlFlowMD) + continue; + + assert(HlslControlFlowMD->getNumOperands() == 2 && + "invalid operands for hlsl.controlflow.hint"); + + MDBuilder MDHelper(M.getContext()); + ConstantInt *Op1 = + mdconst::extract<ConstantInt>(HlslControlFlowMD->getOperand(1)); + + SmallVector<Metadata *> Vals( + ArrayRef<Metadata *>{MDHelper.createString("dx.controlflow.hints"), + MDHelper.createConstant(Op1)}); + + MDNode *MDNode = llvm::MDNode::get(M.getContext(), Vals); + + BBTerminatorInst->setMetadata("dx.controlflow.hints", MDNode); + BBTerminatorInst->setMetadata("hlsl.controlflow.hint", nullptr); + } + } +} + static void translateMetadata(Module &M, DXILBindingMap &DBM, DXILResourceTypeMap &DRTM, const Resources &MDResources, @@ -372,6 +406,7 @@ PreservedAnalyses DXILTranslateMetadata::run(Module &M, const dxil::ModuleMetadataInfo MMDI = MAM.getResult<DXILMetadataAnalysis>(M); translateMetadata(M, DBM, DRTM, MDResources, ShaderFlags, MMDI); + translateBranchMetadata(M); return PreservedAnalyses::all(); } @@ -409,6 +444,7 @@ class DXILTranslateMetadataLegacy : public ModulePass { getAnalysis<DXILMetadataAnalysisWrapperPass>().getModuleMetadata(); translateMetadata(M, DBM, DRTM, MDResources, ShaderFlags, MMDI); + translateBranchMetadata(M); return true; } }; diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index b7b32dd0d626c..1d6be7619ecf4 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -33,6 +33,7 @@ #include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/IR/IntrinsicsSPIRV.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" #define DEBUG_TYPE "spirv-isel" @@ -45,6 +46,17 @@ using ExtInstList = namespace { +llvm::SPIRV::SelectionControl::SelectionControl +getSelectionOperandForImm(int Imm) { + if (Imm == 2) + return SPIRV::SelectionControl::Flatten; + if (Imm == 1) + return SPIRV::SelectionControl::DontFlatten; + if (Imm == 0) + return SPIRV::SelectionControl::None; + llvm_unreachable("Invalid immediate"); +} + #define GET_GLOBALISEL_PREDICATE_BITSET #include "SPIRVGenGlobalISel.inc" #undef GET_GLOBALISEL_PREDICATE_BITSET @@ -2818,12 +2830,8 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg, } return MIB.constrainAllUses(TII, TRI, RBI); } - case Intrinsic::spv_loop_merge: - case Intrinsic::spv_selection_merge: { - const auto Opcode = IID == Intrinsic::spv_selection_merge - ? 
SPIRV::OpSelectionMerge - : SPIRV::OpLoopMerge; - auto MIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(Opcode)); + case Intrinsic::spv_loop_merge: { + auto MIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpLoopMerge)); for (unsigned i = 1; i < I.getNumExplicitOperands(); ++i) { assert(I.getOperand(i).isMBB()); MIB.addMBB(I.getOperand(i).getMBB()); @@ -2831,6 +2839,15 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg, MIB.addImm(SPIRV::SelectionControl::None); return MIB.constrainAllUses(TII, TRI, RBI); } + case Intrinsic::spv_selection_merge: { + auto MIB = + BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpSelectionMerge)); + assert(I.getOperand(1).isMBB() && + "operand 1 to spv_selection_merge must be a basic block"); + MIB.addMBB(I.getOperand(1).getMBB()); + MIB.addImm(getSelectionOperandForImm(I.getOperand(2).getImm())); + return MIB.constrainAllUses(TII, TRI, RBI); + } case Intrinsic::spv_cmpxchg: return selectAtomicCmpXchg(ResVReg, ResType, I); case Intrinsic::spv_unreachable: diff --git a/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp b/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp index 336cde4e78224..2e4343c7922f1 100644 --- a/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp @@ -18,14 +18,16 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/CodeGen/IntrinsicLowering.h" -#include "llvm/IR/Analysis.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicsSPIRV.h" +#include "llvm/IR/LegacyPassManager.h" #include "llvm/InitializePasses.h" +#include "llvm/PassRegistry.h" +#include "llvm/Transforms/Utils.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/LoopSimplify.h" #include "llvm/Transforms/Utils/LowerMemIntrinsics.h" @@ -646,8 +648,7 @@ class SPIRVStructurizer : public FunctionPass { Builder.SetInsertPoint(Header->getTerminator()); auto MergeAddress = BlockAddress::get(BB.getParent(), &BB); - SmallVector<Value *, 1> Args = {MergeAddress}; - Builder.CreateIntrinsic(Intrinsic::spv_selection_merge, {}, {Args}); + createOpSelectMerge(&Builder, MergeAddress); Modified = true; } @@ -769,10 +770,9 @@ class SPIRVStructurizer : public FunctionPass { BasicBlock *Merge = Candidates[0]; auto MergeAddress = BlockAddress::get(Merge->getParent(), Merge); - SmallVector<Value *, 1> Args = {MergeAddress}; IRBuilder<> Builder(&BB); Builder.SetInsertPoint(BB.getTerminator()); - Builder.CreateIntrinsic(Intrinsic::spv_selection_merge, {}, {Args}); + createOpSelectMerge(&Builder, MergeAddress); } return Modified; @@ -1105,8 +1105,7 @@ class SPIRVStructurizer : public FunctionPass { Builder.SetInsertPoint(Header->getTerminator()); auto MergeAddress = BlockAddress::get(Merge->getParent(), Merge); - SmallVector<Value *, 1> Args = {MergeAddress}; - Builder.CreateIntrinsic(Intrinsic::spv_selection_merge, {}, {Args}); + createOpSelectMerge(&Builder, MergeAddress); continue; } @@ -1120,8 +1119,7 @@ class SPIRVStructurizer : public FunctionPass { Builder.SetInsertPoint(Header->getTerminator()); auto MergeAddress = BlockAddress::get(NewMerge->getParent(), NewMerge); - SmallVector<Value *, 1> Args = {MergeAddress}; - Builder.CreateIntrinsic(Intrinsic::spv_selection_merge, {}, {Args}); + createOpSelectMerge(&Builder, MergeAddress); } return Modified; @@ -1208,6 +1206,27 @@ class SPIRVStructurizer : public FunctionPass { AU.addPreserved(); FunctionPass::getAnalysisUsage(AU); } + + void 
createOpSelectMerge(IRBuilder<> *Builder, BlockAddress *MergeAddress) { + Instruction *BBTerminatorInst = Builder->GetInsertBlock()->getTerminator(); + + MDNode *MDNode = BBTerminatorInst->getMetadata("hlsl.controlflow.hint"); + + ConstantInt *BranchHint = llvm::ConstantInt::get(Builder->getInt32Ty(), 0); + + if (MDNode) { + assert(MDNode->getNumOperands() == 2 && + "invalid metadata hlsl.controlflow.hint"); + BranchHint = mdconst::extract<ConstantInt>(MDNode->getOperand(1)); + + assert(BranchHint && "invalid metadata value for hlsl.controlflow.hint"); + } + + llvm::SmallVector<llvm::Value *, 2> Args = {MergeAddress, BranchHint}; + + Builder->CreateIntrinsic(Intrinsic::spv_selection_merge, + {MergeAddress->getType()}, {Args}); + } }; } // namespace llvm @@ -1229,8 +1248,11 @@ FunctionPass *llvm::createSPIRVStructurizerPass() { PreservedAnalyses SPIRVStructurizerWrapper::run(Function &F, FunctionAnalysisManager &AF) { - FunctionPass *StructurizerPass = createSPIRVStructurizerPass(); - if (!StructurizerPass->runOnFunction(F)) + + auto FPM = legacy::FunctionPassManager(F.getParent()); + FPM.add(createSPIRVStructurizerPass()); + + if (!FPM.run(F)) return PreservedAnalyses::all(); PreservedAnalyses PA; PA.preserveSet(); diff --git a/llvm/test/CodeGen/DirectX/HLSLControlFlowHint.ll b/llvm/test/CodeGen/DirectX/HLSLControlFlowHint.ll new file mode 100644 index 0000000000000..6a5274429930e --- /dev/null +++ b/llvm/test/CodeGen/DirectX/HLSLControlFlowHint.ll @@ -0,0 +1,98 @@ +; RUN: opt -S -dxil-op-lower -dxil-translate-metadata -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s + +; This test makes sure LLVM metadata is being translated into DXIL. + + +; CHECK: define i32 @test_branch(i32 %X) +; CHECK-NOT: hlsl.controlflow.hint +; CHECK: br i1 %cmp, label %if.then, label %if.else, !dx.controlflow.hints [[HINT_BRANCH:![0-9]+]] +define i32 @test_branch(i32 %X) { +entry: + %X.addr = alloca i32, align 4 + %resp = alloca i32, align 4 + store i32 %X, ptr %X.addr, align 4 + %0 = load i32, ptr %X.addr, align 4 + %cmp = icmp sgt i32 %0, 0 + br i1 %cmp, label %if.then, label %if.else, !hlsl.controlflow.hint !0 + +if.then: ; preds = %entry + %1 = load i32, ptr %X.addr, align 4 + %sub = sub nsw i32 0, %1 + store i32 %sub, ptr %resp, align 4 + br label %if.end + +if.else: ; preds = %entry + %2 = load i32, ptr %X.addr, align 4 + %mul = mul nsw i32 %2, 2 + store i32 %mul, ptr %resp, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then + %3 = load i32, ptr %resp, align 4 + ret i32 %3 +} + + +; CHECK: define i32 @test_flatten(i32 %X) +; CHECK-NOT: hlsl.controlflow.hint +; CHECK: br i1 %cmp, label %if.then, label %if.else, !dx.controlflow.hints [[HINT_FLATTEN:![0-9]+]] +define i32 @test_flatten(i32 %X) { +entry: + %X.addr = alloca i32, align 4 + %resp = alloca i32, align 4 + store i32 %X, ptr %X.addr, align 4 + %0 = load i32, ptr %X.addr, align 4 + %cmp = icmp sgt i32 %0, 0 + br i1 %cmp, label %if.then, label %if.else, !hlsl.controlflow.hint !1 + +if.then: ; preds = %entry + %1 = load i32, ptr %X.addr, align 4 + %sub = sub nsw i32 0, %1 + store i32 %sub, ptr %resp, align 4 + br label %if.end + +if.else: ; preds = %entry + %2 = load i32, ptr %X.addr, align 4 + %mul = mul nsw i32 %2, 2 + store i32 %mul, ptr %resp, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then + %3 = load i32, ptr %resp, align 4 + ret i32 %3 +} + + +; CHECK: define i32 @test_no_attr(i32 %X) +; CHECK-NOT: hlsl.controlflow.hint +; CHECK-NOT: !dx.controlflow.hints +define i32 @test_no_attr(i32 %X) { +entry: + %X.addr = alloca i32, align 4 + 
%resp = alloca i32, align 4 + store i32 %X, ptr %X.addr, align 4 + %0 = load i32, ptr %X.addr, align 4 + %cmp = icmp sgt i32 %0, 0 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %entry + %1 = load i32, ptr %X.addr, align 4 + %sub = sub nsw i32 0, %1 + store i32 %sub, ptr %resp, align 4 + br label %if.end + +if.else: ; preds = %entry + %2 = load i32, ptr %X.addr, align 4 + %mul = mul nsw i32 %2, 2 + store i32 %mul, ptr %resp, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then + %3 = load i32, ptr %resp, align 4 + ret i32 %3 +} +; CHECK-NOT: hlsl.controlflow.hint +; CHECK: [[HINT_BRANCH]] = !{!"dx.controlflow.hints", i32 1} +; CHECK: [[HINT_FLATTEN]] = !{!"dx.controlflow.hints", i32 2} +!0 = !{!"hlsl.controlflow.hint", i32 1} +!1 = !{!"hlsl.controlflow.hint", i32 2} diff --git a/llvm/test/CodeGen/SPIRV/structurizer/HLSLControlFlowHint-pass-check.ll b/llvm/test/CodeGen/SPIRV/structurizer/HLSLControlFlowHint-pass-check.ll new file mode 100644 index 0000000000000..9911b3119ce52 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/structurizer/HLSLControlFlowHint-pass-check.ll @@ -0,0 +1,90 @@ +; RUN: opt -passes='spirv-structurizer' -S -mtriple=spirv-unknown-unknown %s | FileCheck %s + +; CHECK-LABEL: define spir_func noundef i32 @test_branch +; CHECK: call void @llvm.spv.selection.merge.p0(ptr blockaddress(@test_branch, %if.end), i32 1) +; CHECK-NEXT: br i1 %cmp, label %if.then, label %if.else, !hlsl.controlflow.hint !{{[0-9]+}} +define spir_func noundef i32 @test_branch(i32 noundef %X) { +entry: + %X.addr = alloca i32, align 4 + %resp = alloca i32, align 4 + store i32 %X, ptr %X.addr, align 4 + %0 = load i32, ptr %X.addr, align 4 + %cmp = icmp sgt i32 %0, 0 + br i1 %cmp, label %if.then, label %if.else, !hlsl.controlflow.hint !0 + +if.then: ; preds = %entry + %1 = load i32, ptr %X.addr, align 4 + %sub = sub nsw i32 0, %1 + store i32 %sub, ptr %resp, align 4 + br label %if.end + +if.else: ; preds = %entry + %2 = load i32, ptr %X.addr, align 4 + %mul = mul nsw i32 %2, 2 + store i32 %mul, ptr %resp, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then + %3 = load i32, ptr %resp, align 4 + ret i32 %3 +} + +; CHECK-LABEL: define spir_func noundef i32 @test_flatten +; CHECK: call void @llvm.spv.selection.merge.p0(ptr blockaddress(@test_flatten, %if.end), i32 2) +; CHECK-NEXT: br i1 %cmp, label %if.then, label %if.else, !hlsl.controlflow.hint !{{[0-9]+}} +define spir_func noundef i32 @test_flatten(i32 noundef %X) { +entry: + %X.addr = alloca i32, align 4 + %resp = alloca i32, align 4 + store i32 %X, ptr %X.addr, align 4 + %0 = load i32, ptr %X.addr, align 4 + %cmp = icmp sgt i32 %0, 0 + br i1 %cmp, label %if.then, label %if.else, !hlsl.controlflow.hint !1 + +if.then: ; preds = %entry + %1 = load i32, ptr %X.addr, align 4 + %sub = sub nsw i32 0, %1 + store i32 %sub, ptr %resp, align 4 + br label %if.end + +if.else: ; preds = %entry + %2 = load i32, ptr %X.addr, align 4 + %mul = mul nsw i32 %2, 2 + store i32 %mul, ptr %resp, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then + %3 = load i32, ptr %resp, align 4 + ret i32 %3 +} +; CHECK-LABEL: define spir_func noundef i32 @test_no_attr +; CHECK: call void @llvm.spv.selection.merge.p0(ptr blockaddress(@test_no_attr, %if.end), i32 0) +; CHECK-NEXT: br i1 %cmp, label %if.then, label %if.else +define spir_func noundef i32 @test_no_attr(i32 noundef %X) { +entry: + %X.addr = alloca i32, align 4 + %resp = alloca i32, align 4 + store i32 %X, ptr %X.addr, align 4 + %0 = load i32, ptr %X.addr, align 4 + %cmp 
= icmp sgt i32 %0, 0 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %entry + %1 = load i32, ptr %X.addr, align 4 + %sub = sub nsw i32 0, %1 + store i32 %sub, ptr %resp, align 4 + br label %if.end + +if.else: ; preds = %entry + %2 = load i32, ptr %X.addr, align 4 + %mul = mul nsw i32 %2, 2 + store i32 %mul, ptr %resp, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then + %3 = load i32, ptr %resp, align 4 + ret i32 %3 +} + +!0 = !{!"hlsl.controlflow.hint", i32 1} +!1 = !{!"hlsl.controlflow.hint", i32 2} diff --git a/llvm/test/CodeGen/SPIRV/structurizer/HLSLControlFlowHint.ll b/llvm/test/CodeGen/SPIRV/structurizer/HLSLControlFlowHint.ll new file mode 100644 index 0000000000000..848eaf70f5a19 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/structurizer/HLSLControlFlowHint.ll @@ -0,0 +1,91 @@ +; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} + + +define spir_func noundef i32 @test_branch(i32 noundef %X) { +entry: +; CHECK-LABEL: ; -- Begin function test_branch +; OpSelectionMerge %[[#]] DontFlatten + %X.addr = alloca i32, align 4 + %resp = alloca i32, align 4 + store i32 %X, ptr %X.addr, align 4 + %0 = load i32, ptr %X.addr, align 4 + %cmp = icmp sgt i32 %0, 0 + br i1 %cmp, label %if.then, label %if.else, !hlsl.controlflow.hint !0 + +if.then: ; preds = %entry + %1 = load i32, ptr %X.addr, align 4 + %sub = sub nsw i32 0, %1 + store i32 %sub, ptr %resp, align 4 + br label %if.end + +if.else: ; preds = %entry + %2 = load i32, ptr %X.addr, align 4 + %mul = mul nsw i32 %2, 2 + store i32 %mul, ptr %resp, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then + %3 = load i32, ptr %resp, align 4 + ret i32 %3 +} + + +define spir_func noundef i32 @test_flatten(i32 noundef %X) { +entry: +; CHECK-LABEL: ; -- Begin function test_flatten +; OpSelectionMerge %[[#]] Flatten + %X.addr = alloca i32, align 4 + %resp = alloca i32, align 4 + store i32 %X, ptr %X.addr, align 4 + %0 = load i32, ptr %X.addr, align 4 + %cmp = icmp sgt i32 %0, 0 + br i1 %cmp, label %if.then, label %if.else, !hlsl.controlflow.hint !1 + +if.then: ; preds = %entry + %1 = load i32, ptr %X.addr, align 4 + %sub = sub nsw i32 0, %1 + store i32 %sub, ptr %resp, align 4 + br label %if.end + +if.else: ; preds = %entry + %2 = load i32, ptr %X.addr, align 4 + %mul = mul nsw i32 %2, 2 + store i32 %mul, ptr %resp, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then + %3 = load i32, ptr %resp, align 4 + ret i32 %3 +} + +define spir_func noundef i32 @test_no_attr(i32 noundef %X) { +entry: +; CHECK-LABEL: ; -- Begin function test_no_attr +; OpSelectionMerge %[[#]] None + %X.addr = alloca i32, align 4 + %resp = alloca i32, align 4 + store i32 %X, ptr %X.addr, align 4 + %0 = load i32, ptr %X.addr, align 4 + %cmp = icmp sgt i32 %0, 0 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %entry + %1 = load i32, ptr %X.addr, align 4 + %sub = sub nsw i32 0, %1 + store i32 %sub, ptr %resp, align 4 + br label %if.end + +if.else: ; preds = %entry + %2 = load i32, ptr %X.addr, align 4 + %mul = mul nsw i32 %2, 2 + store i32 %mul, ptr %resp, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then + %3 = load i32, ptr %resp, align 4 + ret i32 %3 +} + +!0 = !{!"hlsl.controlflow.hint", i32 1} +!1 = !{!"hlsl.controlflow.hint", i32 2} From 448569b8f9d98507ec7c83c519ebe62829d9a187 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Mon, 13 Jan 2025 10:26:15 -0800 
Subject: [PATCH 091/102] [SLP][NFC]Add a test with incorrect extractelement parameter after extending with poison --- .../X86/extractelemets-extended-by-poison.ll | 86 +++++++++++++++++++ 1 file changed, 86 insertions(+) create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/extractelemets-extended-by-poison.ll diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extractelemets-extended-by-poison.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractelemets-extended-by-poison.ll new file mode 100644 index 0000000000000..6af59aee54e55 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/extractelemets-extended-by-poison.ll @@ -0,0 +1,86 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s + +define i32 @test() { +; CHECK-LABEL: define i32 @test() { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i64>, ptr null, align 16 +; CHECK-NEXT: [[TMP1:%.*]] = or i64 poison, 0 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i64> [[TMP3]], <8 x i64> , <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.vector.insert.v8i64.v4i64(<8 x i64> [[TMP4]], <4 x i64> [[TMP0]], i64 0) +; CHECK-NEXT: [[TMP6:%.*]] = trunc <8 x i64> [[TMP5]] to <8 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = add <16 x i32> [[TMP7]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0 +; CHECK-NEXT: [[INC_3_3_I_1:%.*]] = or i64 [[TMP9]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> [[TMP8]]) +; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> poison) +; CHECK-NEXT: [[OP_RDX:%.*]] = or i32 [[TMP10]], [[TMP11]] +; CHECK-NEXT: ret i32 [[OP_RDX]] +; +entry: + %.pre.i = load i64, ptr getelementptr inbounds nuw (i8, ptr null, i64 24), align 8 + %.pre50.i = load i64, ptr getelementptr inbounds nuw (i8, ptr null, i64 16), align 16 + %.pre51.i = load i64, ptr getelementptr inbounds nuw (i8, ptr null, i64 8), align 8 + %.pre52.i = load i64, ptr null, align 16 + %0 = or i64 %.pre51.i, 0 + %1 = trunc i64 %.pre.i to i32 + %2 = add i32 %1, 0 + %3 = trunc i64 %.pre50.i to i32 + %4 = add i32 %3, 0 + %5 = trunc i64 %.pre51.i to i32 + %6 = add i32 %5, 0 + %7 = trunc i64 0 to i32 + %8 = add i32 %5, 0 + %9 = add i32 %7, 0 + %10 = add i32 %1, 0 + %11 = add i32 %3, 0 + %12 = add i32 %5, 0 + %13 = add i32 %7, 0 + %14 = trunc i64 %.pre.i to i32 + %15 = add i32 %14, 0 + %16 = trunc i64 %.pre50.i to i32 + %17 = add i32 %16, 0 + %18 = trunc i64 %.pre51.i to i32 + %19 = add i32 %18, 0 + %20 = trunc i64 %.pre52.i to i32 + %conv14.1.i = or i32 %9, %13 + %21 = or i32 %conv14.1.i, %6 + %22 = or i32 %21, %8 + %23 = or i32 %22, %12 + %24 = or i32 %23, %4 + %25 = or i32 %24, %11 + %26 = or i32 %25, %2 + %27 = or i32 %26, %10 + %28 = or i32 %27, %15 + %29 = or i32 %28, %17 + %30 = or i32 %29, %19 + %31 = add i32 %14, 0 + %32 = add i32 %16, 0 + %33 = add i32 %18, 0 + %34 = add i32 %20, 0 + %35 = add i32 %14, 0 + %36 = add i32 %16, 0 + %37 = add i32 %18, 0 + %38 = add i32 %20, 0 + %39 = add i32 %14, 0 + %40 = add i32 %16, 0 + %41 = add i32 %18, 0 + %42 = add i32 %20, 0 + %inc.3.3.i.1 = or i64 %.pre52.i, 0 + %conv14.i.1 = or i32 %38, %34 
+ %conv14.1.i.1 = or i32 %conv14.i.1, %42 + %conv14.3.i.1 = or i32 %conv14.1.i.1, %33 + %conv14.145.i.1 = or i32 %conv14.3.i.1, %37 + %conv14.1.1.i.1 = or i32 %conv14.145.i.1, %41 + %conv14.3.1.i.1 = or i32 %conv14.1.1.i.1, %32 + %conv14.247.i.1 = or i32 %conv14.3.1.i.1, %36 + %conv14.1.2.i.1 = or i32 %conv14.247.i.1, %40 + %conv14.3.2.i.1 = or i32 %conv14.1.2.i.1, %31 + %conv14.349.i.1 = or i32 %conv14.3.2.i.1, %35 + %conv14.1.3.i.1 = or i32 %conv14.349.i.1, %39 + %conv14.3.3.i.1 = or i32 %conv14.1.3.i.1, %30 + ret i32 %conv14.3.3.i.1 +} From 5cdbaa49d4f989885183a299ec7cb02635994ee0 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Mon, 13 Jan 2025 10:36:40 -0800 Subject: [PATCH 092/102] [X86] Use loaded/stored element size when parsing/printing gather/scatter pointer size in Intel syntax. (#122530) This matches binutils. --- llvm/lib/Target/X86/AsmParser/X86Operand.h | 46 +- llvm/lib/Target/X86/X86InstrAVX512.td | 80 +- llvm/lib/Target/X86/X86InstrOperands.td | 39 +- llvm/lib/Target/X86/X86InstrSSE.td | 16 +- .../MC/Disassembler/X86/apx/evex-format.txt | 2 +- .../test/MC/Disassembler/X86/intel-syntax.txt | 4 +- llvm/test/MC/X86/avx-64-intel.s | 64 +- llvm/test/MC/X86/avx512-intel.s | 448 ++++----- llvm/test/MC/X86/avx512f_vl-intel.s | 896 +++++++++--------- llvm/test/MC/X86/intel-syntax.s | 2 +- llvm/utils/TableGen/X86RecognizableInstr.cpp | 38 +- 11 files changed, 808 insertions(+), 827 deletions(-) diff --git a/llvm/lib/Target/X86/AsmParser/X86Operand.h b/llvm/lib/Target/X86/AsmParser/X86Operand.h index 07a00af881afe..d715fd1903802 100644 --- a/llvm/lib/Target/X86/AsmParser/X86Operand.h +++ b/llvm/lib/Target/X86/AsmParser/X86Operand.h @@ -340,46 +340,38 @@ struct X86Operand final : public MCParsedAsmOperand { return Mem.IndexReg >= LowR && Mem.IndexReg <= HighR; } + bool isMem32_RC128() const { + return isMem32() && isMemIndexReg(X86::XMM0, X86::XMM15); + } bool isMem64_RC128() const { return isMem64() && isMemIndexReg(X86::XMM0, X86::XMM15); } - bool isMem128_RC128() const { - return isMem128() && isMemIndexReg(X86::XMM0, X86::XMM15); - } - bool isMem128_RC256() const { - return isMem128() && isMemIndexReg(X86::YMM0, X86::YMM15); + bool isMem32_RC256() const { + return isMem32() && isMemIndexReg(X86::YMM0, X86::YMM15); } - bool isMem256_RC128() const { - return isMem256() && isMemIndexReg(X86::XMM0, X86::XMM15); - } - bool isMem256_RC256() const { - return isMem256() && isMemIndexReg(X86::YMM0, X86::YMM15); + bool isMem64_RC256() const { + return isMem64() && isMemIndexReg(X86::YMM0, X86::YMM15); } + bool isMem32_RC128X() const { + return isMem32() && X86II::isXMMReg(Mem.IndexReg); + } bool isMem64_RC128X() const { return isMem64() && X86II::isXMMReg(Mem.IndexReg); } - bool isMem128_RC128X() const { - return isMem128() && X86II::isXMMReg(Mem.IndexReg); + bool isMem32_RC256X() const { + return isMem32() && X86II::isYMMReg(Mem.IndexReg); } - bool isMem128_RC256X() const { - return isMem128() && X86II::isYMMReg(Mem.IndexReg); + bool isMem64_RC256X() const { + return isMem64() && X86II::isYMMReg(Mem.IndexReg); } - bool isMem256_RC128X() const { - return isMem256() && X86II::isXMMReg(Mem.IndexReg); + bool isMem32_RC512() const { + return isMem32() && X86II::isZMMReg(Mem.IndexReg); } - bool isMem256_RC256X() const { - return isMem256() && X86II::isYMMReg(Mem.IndexReg); - } - bool isMem256_RC512() const { - return isMem256() && X86II::isZMMReg(Mem.IndexReg); - } - bool isMem512_RC256X() const { - return isMem512() && X86II::isYMMReg(Mem.IndexReg); - } - bool isMem512_RC512() const { - return 
isMem512() && X86II::isZMMReg(Mem.IndexReg); + bool isMem64_RC512() const { + return isMem64() && X86II::isZMMReg(Mem.IndexReg); } + bool isMem512_GR16() const { if (!isMem512()) return false; diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index abf016000fc8e..9d8c123185a7c 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -10279,36 +10279,36 @@ multiclass avx512_gather opc, string OpcodeStr, X86VectorVTInfo _, multiclass avx512_gather_q_pd dopc, bits<8> qopc, AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> { defm NAME#D#SUFF#Z: avx512_gather, EVEX_V512, REX_W; + vy64xmem>, EVEX_V512, REX_W; defm NAME#Q#SUFF#Z: avx512_gather, EVEX_V512, REX_W; + vz64mem>, EVEX_V512, REX_W; let Predicates = [HasVLX] in { defm NAME#D#SUFF#Z256: avx512_gather, EVEX_V256, REX_W; + vx64xmem>, EVEX_V256, REX_W; defm NAME#Q#SUFF#Z256: avx512_gather, EVEX_V256, REX_W; + vy64xmem>, EVEX_V256, REX_W; defm NAME#D#SUFF#Z128: avx512_gather, EVEX_V128, REX_W; + vx64xmem>, EVEX_V128, REX_W; defm NAME#Q#SUFF#Z128: avx512_gather, EVEX_V128, REX_W; + vx64xmem>, EVEX_V128, REX_W; } } multiclass avx512_gather_d_ps dopc, bits<8> qopc, AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> { - defm NAME#D#SUFF#Z: avx512_gather, + defm NAME#D#SUFF#Z: avx512_gather, EVEX_V512; - defm NAME#Q#SUFF#Z: avx512_gather, + defm NAME#Q#SUFF#Z: avx512_gather, EVEX_V512; let Predicates = [HasVLX] in { defm NAME#D#SUFF#Z256: avx512_gather, EVEX_V256; + vy32xmem>, EVEX_V256; defm NAME#Q#SUFF#Z256: avx512_gather, EVEX_V256; + vy32xmem>, EVEX_V256; defm NAME#D#SUFF#Z128: avx512_gather, EVEX_V128; + vx32xmem>, EVEX_V128; defm NAME#Q#SUFF#Z128: avx512_gather, EVEX_V128; + vx32xmem, VK2WM>, EVEX_V128; } } @@ -10336,36 +10336,36 @@ let mayStore = 1, Constraints = "$mask = $mask_wb", ExeDomain = _.ExeDomain, multiclass avx512_scatter_q_pd dopc, bits<8> qopc, AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> { defm NAME#D#SUFF#Z: avx512_scatter, EVEX_V512, REX_W; + vy64xmem>, EVEX_V512, REX_W; defm NAME#Q#SUFF#Z: avx512_scatter, EVEX_V512, REX_W; + vz64mem>, EVEX_V512, REX_W; let Predicates = [HasVLX] in { defm NAME#D#SUFF#Z256: avx512_scatter, EVEX_V256, REX_W; + vx64xmem>, EVEX_V256, REX_W; defm NAME#Q#SUFF#Z256: avx512_scatter, EVEX_V256, REX_W; + vy64xmem>, EVEX_V256, REX_W; defm NAME#D#SUFF#Z128: avx512_scatter, EVEX_V128, REX_W; + vx64xmem>, EVEX_V128, REX_W; defm NAME#Q#SUFF#Z128: avx512_scatter, EVEX_V128, REX_W; + vx64xmem>, EVEX_V128, REX_W; } } multiclass avx512_scatter_d_ps dopc, bits<8> qopc, AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> { - defm NAME#D#SUFF#Z: avx512_scatter, + defm NAME#D#SUFF#Z: avx512_scatter, EVEX_V512; - defm NAME#Q#SUFF#Z: avx512_scatter, + defm NAME#Q#SUFF#Z: avx512_scatter, EVEX_V512; let Predicates = [HasVLX] in { defm NAME#D#SUFF#Z256: avx512_scatter, EVEX_V256; + vy32xmem>, EVEX_V256; defm NAME#Q#SUFF#Z256: avx512_scatter, EVEX_V256; + vy32xmem>, EVEX_V256; defm NAME#D#SUFF#Z128: avx512_scatter, EVEX_V128; + vx32xmem>, EVEX_V128; defm NAME#Q#SUFF#Z128: avx512_scatter, EVEX_V128; + vx32xmem, VK2WM>, EVEX_V128; } } @@ -10385,52 +10385,52 @@ multiclass avx512_gather_scatter_prefetch opc, Format F, string OpcodeSt } defm VGATHERPF0DPS: avx512_gather_scatter_prefetch<0xC6, MRM1m, "vgatherpf0dps", - VK16WM, vz512mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; + VK16WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; defm VGATHERPF0QPS: avx512_gather_scatter_prefetch<0xC7, MRM1m, "vgatherpf0qps", - VK8WM, vz256mem>, 
EVEX_V512, EVEX_CD8<32, CD8VT1>; + VK8WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; defm VGATHERPF0DPD: avx512_gather_scatter_prefetch<0xC6, MRM1m, "vgatherpf0dpd", - VK8WM, vy512xmem>, EVEX_V512, REX_W, EVEX_CD8<64, CD8VT1>; + VK8WM, vy64xmem>, EVEX_V512, REX_W, EVEX_CD8<64, CD8VT1>; defm VGATHERPF0QPD: avx512_gather_scatter_prefetch<0xC7, MRM1m, "vgatherpf0qpd", - VK8WM, vz512mem>, EVEX_V512, REX_W, EVEX_CD8<64, CD8VT1>; + VK8WM, vz64mem>, EVEX_V512, REX_W, EVEX_CD8<64, CD8VT1>; defm VGATHERPF1DPS: avx512_gather_scatter_prefetch<0xC6, MRM2m, "vgatherpf1dps", - VK16WM, vz512mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; + VK16WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; defm VGATHERPF1QPS: avx512_gather_scatter_prefetch<0xC7, MRM2m, "vgatherpf1qps", - VK8WM, vz256mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; + VK8WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; defm VGATHERPF1DPD: avx512_gather_scatter_prefetch<0xC6, MRM2m, "vgatherpf1dpd", - VK8WM, vy512xmem>, EVEX_V512, REX_W, EVEX_CD8<64, CD8VT1>; + VK8WM, vy64xmem>, EVEX_V512, REX_W, EVEX_CD8<64, CD8VT1>; defm VGATHERPF1QPD: avx512_gather_scatter_prefetch<0xC7, MRM2m, "vgatherpf1qpd", - VK8WM, vz512mem>, EVEX_V512, REX_W, EVEX_CD8<64, CD8VT1>; + VK8WM, vz64mem>, EVEX_V512, REX_W, EVEX_CD8<64, CD8VT1>; defm VSCATTERPF0DPS: avx512_gather_scatter_prefetch<0xC6, MRM5m, "vscatterpf0dps", - VK16WM, vz512mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; + VK16WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; defm VSCATTERPF0QPS: avx512_gather_scatter_prefetch<0xC7, MRM5m, "vscatterpf0qps", - VK8WM, vz256mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; + VK8WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; defm VSCATTERPF0DPD: avx512_gather_scatter_prefetch<0xC6, MRM5m, "vscatterpf0dpd", - VK8WM, vy512xmem>, EVEX_V512, REX_W, EVEX_CD8<64, CD8VT1>; + VK8WM, vy64xmem>, EVEX_V512, REX_W, EVEX_CD8<64, CD8VT1>; defm VSCATTERPF0QPD: avx512_gather_scatter_prefetch<0xC7, MRM5m, "vscatterpf0qpd", - VK8WM, vz512mem>, EVEX_V512, REX_W, EVEX_CD8<64, CD8VT1>; + VK8WM, vz64mem>, EVEX_V512, REX_W, EVEX_CD8<64, CD8VT1>; defm VSCATTERPF1DPS: avx512_gather_scatter_prefetch<0xC6, MRM6m, "vscatterpf1dps", - VK16WM, vz512mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; + VK16WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; defm VSCATTERPF1QPS: avx512_gather_scatter_prefetch<0xC7, MRM6m, "vscatterpf1qps", - VK8WM, vz256mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; + VK8WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; defm VSCATTERPF1DPD: avx512_gather_scatter_prefetch<0xC6, MRM6m, "vscatterpf1dpd", - VK8WM, vy512xmem>, EVEX_V512, REX_W, EVEX_CD8<64, CD8VT1>; + VK8WM, vy64xmem>, EVEX_V512, REX_W, EVEX_CD8<64, CD8VT1>; defm VSCATTERPF1QPD: avx512_gather_scatter_prefetch<0xC7, MRM6m, "vscatterpf1qpd", - VK8WM, vz512mem>, EVEX_V512, REX_W, EVEX_CD8<64, CD8VT1>; + VK8WM, vz64mem>, EVEX_V512, REX_W, EVEX_CD8<64, CD8VT1>; multiclass cvt_by_vec_width opc, X86VectorVTInfo Vec, string OpcodeStr, SchedWrite Sched> { def rk : AVX512XS8I; def i512mem_GR64 : X86MemOperand<"printzmmwordmem", X86Mem512_GR64Operand, 512>; // Gather mem operands +def vx32mem : X86VMemOperand; def vx64mem : X86VMemOperand; -def vx128mem : X86VMemOperand; -def vx256mem : X86VMemOperand; -def vy128mem : X86VMemOperand; -def vy256mem : X86VMemOperand; +def vy32mem : X86VMemOperand; +def vy64mem : X86VMemOperand; +def vx32xmem : X86VMemOperand; def vx64xmem : X86VMemOperand; -def vx128xmem : X86VMemOperand; -def vx256xmem : X86VMemOperand; -def vy128xmem : X86VMemOperand; -def vy256xmem : X86VMemOperand; -def vy512xmem : X86VMemOperand; -def vz256mem : 
X86VMemOperand; -def vz512mem : X86VMemOperand; +def vy32xmem : X86VMemOperand; +def vy64xmem : X86VMemOperand; +def vz32mem : X86VMemOperand; +def vz64mem : X86VMemOperand; def shmem : X86MemOperand<"printwordmem", X86Mem16AsmOperand>; def ssmem : X86MemOperand<"printdwordmem", X86Mem32AsmOperand>; diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td index 036d7d92f3f89..6aadb788c851e 100644 --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -8078,26 +8078,26 @@ let Predicates = [HasAVX2] in { = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb" in { defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", - VR256, vx128mem, vx256mem>, REX_W; + VR256, vx64mem, vx64mem>, REX_W; defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", - VR256, vx128mem, vy256mem>, REX_W; + VR256, vx64mem, vy64mem>, REX_W; defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", - VR256, vx128mem, vy256mem>; + VR256, vx32mem, vy32mem>; defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", - VR128, vx64mem, vy128mem>; + VR128, vx32mem, vy32mem>; let ExeDomain = SSEPackedDouble in { defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", - VR256, vx128mem, vx256mem>, REX_W; + VR256, vx64mem, vx64mem>, REX_W; defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", - VR256, vx128mem, vy256mem>, REX_W; + VR256, vx64mem, vy64mem>, REX_W; } let ExeDomain = SSEPackedSingle in { defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", - VR256, vx128mem, vy256mem>; + VR256, vx32mem, vy32mem>; defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", - VR128, vx64mem, vy128mem>; + VR128, vx32mem, vy32mem>; } } } diff --git a/llvm/test/MC/Disassembler/X86/apx/evex-format.txt b/llvm/test/MC/Disassembler/X86/apx/evex-format.txt index e9a9f1327a17e..53ae3b8b73ab4 100644 --- a/llvm/test/MC/Disassembler/X86/apx/evex-format.txt +++ b/llvm/test/MC/Disassembler/X86/apx/evex-format.txt @@ -90,7 +90,7 @@ ## MRM5m # ATT: vscatterpf0dps (%r16,%zmm0) {%k1} -# INTEL: vscatterpf0dps {k1}, zmmword ptr [r16 + zmm0] +# INTEL: vscatterpf0dps {k1}, dword ptr [r16 + zmm0] 0x62,0xfa,0x7d,0x49,0xc6,0x2c,0x00 # ATT: subq $127, 123(%r16), %r17 diff --git a/llvm/test/MC/Disassembler/X86/intel-syntax.txt b/llvm/test/MC/Disassembler/X86/intel-syntax.txt index c7c0fce268cd2..f9284ab388441 100644 --- a/llvm/test/MC/Disassembler/X86/intel-syntax.txt +++ b/llvm/test/MC/Disassembler/X86/intel-syntax.txt @@ -108,10 +108,10 @@ # CHECK: vshufpd xmm0, xmm1, xmm2, 1 0xc5 0xf1 0xc6 0xc2 0x01 -# CHECK: vpgatherqq ymm2, ymmword ptr [rdi + 2*ymm1], ymm0 +# CHECK: vpgatherqq ymm2, qword ptr [rdi + 2*ymm1], ymm0 0xc4 0xe2 0xfd 0x91 0x14 0x4f -# CHECK: vpgatherdd xmm10, xmmword ptr [r15 + 2*xmm9], xmm8 +# CHECK: vpgatherdd xmm10, dword ptr [r15 + 2*xmm9], xmm8 0xc4 0x02 0x39 0x90 0x14 0x4f # CHECK: xsave64 [rax] diff --git a/llvm/test/MC/X86/avx-64-intel.s b/llvm/test/MC/X86/avx-64-intel.s index c1f20d204a8c4..392f6e9928427 100644 --- a/llvm/test/MC/X86/avx-64-intel.s +++ b/llvm/test/MC/X86/avx-64-intel.s @@ -1,68 +1,68 @@ // RUN: llvm-mc -triple x86_64-unknown-unknown -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s -// CHECK: vgatherdpd xmm2, xmmword ptr [rdi + 2*xmm1], xmm0 +// CHECK: vgatherdpd xmm2, qword ptr [rdi + 2*xmm1], xmm0 // CHECK: encoding: [0xc4,0xe2,0xf9,0x92,0x14,0x4f] - vgatherdpd xmm2, xmmword ptr [rdi + 2*xmm1], xmm0 + vgatherdpd xmm2, qword ptr [rdi + 2*xmm1], xmm0 -// CHECK: vgatherqpd xmm2, xmmword ptr [rdi + 2*xmm1], xmm0 +// CHECK: vgatherqpd xmm2, qword ptr [rdi + 
2*xmm1], xmm0 // CHECK: encoding: [0xc4,0xe2,0xf9,0x93,0x14,0x4f] - vgatherqpd xmm2, xmmword ptr [rdi + 2*xmm1], xmm0 + vgatherqpd xmm2, qword ptr [rdi + 2*xmm1], xmm0 -// CHECK: vgatherdpd ymm2, ymmword ptr [rdi + 2*xmm1], ymm0 +// CHECK: vgatherdpd ymm2, qword ptr [rdi + 2*xmm1], ymm0 // CHECK: encoding: [0xc4,0xe2,0xfd,0x92,0x14,0x4f] - vgatherdpd ymm2, ymmword ptr [rdi + 2*xmm1], ymm0 + vgatherdpd ymm2, qword ptr [rdi + 2*xmm1], ymm0 -// CHECK: vgatherqpd ymm2, ymmword ptr [rdi + 2*ymm1], ymm0 +// CHECK: vgatherqpd ymm2, qword ptr [rdi + 2*ymm1], ymm0 // CHECK: encoding: [0xc4,0xe2,0xfd,0x93,0x14,0x4f] - vgatherqpd ymm2, ymmword ptr [rdi + 2*ymm1], ymm0 + vgatherqpd ymm2, qword ptr [rdi + 2*ymm1], ymm0 -// CHECK: vgatherdps xmm10, xmmword ptr [r15 + 2*xmm9], xmm8 +// CHECK: vgatherdps xmm10, dword ptr [r15 + 2*xmm9], xmm8 // CHECK: encoding: [0xc4,0x02,0x39,0x92,0x14,0x4f] - vgatherdps xmm10, xmmword ptr [r15 + 2*xmm9], xmm8 + vgatherdps xmm10, dword ptr [r15 + 2*xmm9], xmm8 -// CHECK: vgatherqps xmm10, qword ptr [r15 + 2*xmm9], xmm8 +// CHECK: vgatherqps xmm10, dword ptr [r15 + 2*xmm9], xmm8 // CHECK: encoding: [0xc4,0x02,0x39,0x93,0x14,0x4f] - vgatherqps xmm10, qword ptr [r15 + 2*xmm9], xmm8 + vgatherqps xmm10, dword ptr [r15 + 2*xmm9], xmm8 -// CHECK: vgatherdps ymm10, ymmword ptr [r15 + 2*ymm9], ymm8 +// CHECK: vgatherdps ymm10, dword ptr [r15 + 2*ymm9], ymm8 // CHECK: encoding: [0xc4,0x02,0x3d,0x92,0x14,0x4f] - vgatherdps ymm10, ymmword ptr [r15 + 2*ymm9], ymm8 + vgatherdps ymm10, dword ptr [r15 + 2*ymm9], ymm8 -// CHECK: vgatherqps xmm10, xmmword ptr [r15 + 2*ymm9], xmm8 +// CHECK: vgatherqps xmm10, dword ptr [r15 + 2*ymm9], xmm8 // CHECK: encoding: [0xc4,0x02,0x3d,0x93,0x14,0x4f] - vgatherqps xmm10, xmmword ptr [r15 + 2*ymm9], xmm8 + vgatherqps xmm10, dword ptr [r15 + 2*ymm9], xmm8 -// CHECK: vpgatherdq xmm2, xmmword ptr [rdi + 2*xmm1], xmm0 +// CHECK: vpgatherdq xmm2, qword ptr [rdi + 2*xmm1], xmm0 // CHECK: encoding: [0xc4,0xe2,0xf9,0x90,0x14,0x4f] - vpgatherdq xmm2, xmmword ptr [rdi + 2*xmm1], xmm0 + vpgatherdq xmm2, qword ptr [rdi + 2*xmm1], xmm0 -// CHECK: vpgatherqq xmm2, xmmword ptr [rdi + 2*xmm1], xmm0 +// CHECK: vpgatherqq xmm2, qword ptr [rdi + 2*xmm1], xmm0 // CHECK: encoding: [0xc4,0xe2,0xf9,0x91,0x14,0x4f] - vpgatherqq xmm2, xmmword ptr [rdi + 2*xmm1], xmm0 + vpgatherqq xmm2, qword ptr [rdi + 2*xmm1], xmm0 -// CHECK: vpgatherdq ymm2, ymmword ptr [rdi + 2*xmm1], ymm0 +// CHECK: vpgatherdq ymm2, qword ptr [rdi + 2*xmm1], ymm0 // CHECK: encoding: [0xc4,0xe2,0xfd,0x90,0x14,0x4f] - vpgatherdq ymm2, ymmword ptr [rdi + 2*xmm1], ymm0 + vpgatherdq ymm2, qword ptr [rdi + 2*xmm1], ymm0 -// CHECK: vpgatherqq ymm2, ymmword ptr [rdi + 2*ymm1], ymm0 +// CHECK: vpgatherqq ymm2, qword ptr [rdi + 2*ymm1], ymm0 // CHECK: encoding: [0xc4,0xe2,0xfd,0x91,0x14,0x4f] - vpgatherqq ymm2, ymmword ptr [rdi + 2*ymm1], ymm0 + vpgatherqq ymm2, qword ptr [rdi + 2*ymm1], ymm0 -// CHECK: vpgatherdd xmm10, xmmword ptr [r15 + 2*xmm9], xmm8 +// CHECK: vpgatherdd xmm10, dword ptr [r15 + 2*xmm9], xmm8 // CHECK: encoding: [0xc4,0x02,0x39,0x90,0x14,0x4f] - vpgatherdd xmm10, xmmword ptr [r15 + 2*xmm9], xmm8 + vpgatherdd xmm10, dword ptr [r15 + 2*xmm9], xmm8 -// CHECK: vpgatherqd xmm10, qword ptr [r15 + 2*xmm9], xmm8 +// CHECK: vpgatherqd xmm10, dword ptr [r15 + 2*xmm9], xmm8 // CHECK: encoding: [0xc4,0x02,0x39,0x91,0x14,0x4f] - vpgatherqd xmm10, qword ptr [r15 + 2*xmm9], xmm8 + vpgatherqd xmm10, dword ptr [r15 + 2*xmm9], xmm8 -// CHECK: vpgatherdd ymm10, ymmword ptr [r15 + 2*ymm9], ymm8 +// CHECK: 
vpgatherdd ymm10, dword ptr [r15 + 2*ymm9], ymm8 // CHECK: encoding: [0xc4,0x02,0x3d,0x90,0x14,0x4f] - vpgatherdd ymm10, ymmword ptr [r15 + 2*ymm9], ymm8 + vpgatherdd ymm10, dword ptr [r15 + 2*ymm9], ymm8 -// CHECK: vpgatherqd xmm10, xmmword ptr [r15 + 2*ymm9], xmm8 +// CHECK: vpgatherqd xmm10, dword ptr [r15 + 2*ymm9], xmm8 // CHECK: encoding: [0xc4,0x02,0x3d,0x91,0x14,0x4f] - vpgatherqd xmm10, xmmword ptr [r15 + 2*ymm9], xmm8 + vpgatherqd xmm10, dword ptr [r15 + 2*ymm9], xmm8 // CHECK: vcvtpd2ps xmm0, xmm15 // CHECK: encoding: [0xc4,0xc1,0x79,0x5a,0xc7] diff --git a/llvm/test/MC/X86/avx512-intel.s b/llvm/test/MC/X86/avx512-intel.s index d8ad3c4426176..1cbf21c7eb1b0 100644 --- a/llvm/test/MC/X86/avx512-intel.s +++ b/llvm/test/MC/X86/avx512-intel.s @@ -37900,450 +37900,450 @@ vaddpd zmm1, zmm1, zmm2, {rz-sae} // CHECK: encoding: [0x62,0xe2,0xa5,0x50,0x77,0xaa,0xf8,0xfb,0xff,0xff] vpermi2pd zmm21, zmm27, qword ptr [rdx - 1032]{1to8} -// CHECK: vgatherdpd zmm6 {k1}, zmmword ptr [r14 + 8*ymm16 + 123] +// CHECK: vgatherdpd zmm6 {k1}, qword ptr [r14 + 8*ymm16 + 123] // CHECK: encoding: [0x62,0xd2,0xfd,0x41,0x92,0xb4,0xc6,0x7b,0x00,0x00,0x00] - vgatherdpd zmm6 {k1},ZMMWORD PTR [r14+ymm16*8+0x7b] + vgatherdpd zmm6 {k1},QWORD PTR [r14+ymm16*8+0x7b] -// CHECK: vgatherdpd zmm6 {k1}, zmmword ptr [r9 + ymm16 + 256] +// CHECK: vgatherdpd zmm6 {k1}, qword ptr [r9 + ymm16 + 256] // CHECK: encoding: [0x62,0xd2,0xfd,0x41,0x92,0x74,0x01,0x20] - vgatherdpd zmm6{k1},ZMMWORD PTR [r9+ymm16*1+0x100] + vgatherdpd zmm6{k1},QWORD PTR [r9+ymm16*1+0x100] -// CHECK: vgatherdpd zmm6 {k1}, zmmword ptr [rcx + 4*ymm16 + 1024] +// CHECK: vgatherdpd zmm6 {k1}, qword ptr [rcx + 4*ymm16 + 1024] // CHECK: encoding: [0x62,0xf2,0xfd,0x41,0x92,0xb4,0x81,0x00,0x04,0x00,0x00] - vgatherdpd zmm6{k1},ZMMWORD PTR [rcx+ymm16*4+0x400] + vgatherdpd zmm6{k1},QWORD PTR [rcx+ymm16*4+0x400] -// CHECK: vgatherdps zmm9 {k1}, zmmword ptr [r14 + 8*zmm19 + 123] +// CHECK: vgatherdps zmm9 {k1}, dword ptr [r14 + 8*zmm19 + 123] // CHECK: encoding: [0x62,0x52,0x7d,0x41,0x92,0x8c,0xde,0x7b,0x00,0x00,0x00] - vgatherdps zmm9{k1},ZMMWORD PTR [r14+zmm19*8+0x7b] + vgatherdps zmm9{k1},DWORD PTR [r14+zmm19*8+0x7b] -// CHECK: vgatherdps zmm9 {k1}, zmmword ptr [r9 + zmm19 + 256] +// CHECK: vgatherdps zmm9 {k1}, dword ptr [r9 + zmm19 + 256] // CHECK: encoding: [0x62,0x52,0x7d,0x41,0x92,0x4c,0x19,0x40] - vgatherdps zmm9{k1},ZMMWORD PTR [r9+zmm19*1+0x100] + vgatherdps zmm9{k1},DWORD PTR [r9+zmm19*1+0x100] -// CHECK: vgatherdps zmm9 {k1}, zmmword ptr [rcx + 4*zmm19 + 1024] +// CHECK: vgatherdps zmm9 {k1}, dword ptr [rcx + 4*zmm19 + 1024] // CHECK: encoding: [0x62,0x72,0x7d,0x41,0x92,0x8c,0x99,0x00,0x04,0x00,0x00] - vgatherdps zmm9{k1},ZMMWORD PTR [rcx+zmm19*4+0x400] + vgatherdps zmm9{k1},DWORD PTR [rcx+zmm19*4+0x400] -// CHECK: vgatherqpd zmm29 {k1}, zmmword ptr [r14 + 8*zmm2 + 123] +// CHECK: vgatherqpd zmm29 {k1}, qword ptr [r14 + 8*zmm2 + 123] // CHECK: encoding: [0x62,0x42,0xfd,0x49,0x93,0xac,0xd6,0x7b,0x00,0x00,0x00] - vgatherqpd zmm29{k1},ZMMWORD PTR [r14+zmm2*8+0x7b] + vgatherqpd zmm29{k1},QWORD PTR [r14+zmm2*8+0x7b] -// CHECK: vgatherqpd zmm29 {k1}, zmmword ptr [r9 + zmm2 + 256] +// CHECK: vgatherqpd zmm29 {k1}, qword ptr [r9 + zmm2 + 256] // CHECK: encoding: [0x62,0x42,0xfd,0x49,0x93,0x6c,0x11,0x20] - vgatherqpd zmm29{k1},ZMMWORD PTR [r9+zmm2*1+0x100] + vgatherqpd zmm29{k1},QWORD PTR [r9+zmm2*1+0x100] -// CHECK: vgatherqpd zmm29 {k1}, zmmword ptr [rcx + 4*zmm2 + 1024] +// CHECK: vgatherqpd zmm29 {k1}, qword ptr [rcx + 4*zmm2 + 1024] // CHECK: encoding: 
[0x62,0x62,0xfd,0x49,0x93,0xac,0x91,0x00,0x04,0x00,0x00] - vgatherqpd zmm29{k1},ZMMWORD PTR [rcx+zmm2*4+0x400] + vgatherqpd zmm29{k1},QWORD PTR [rcx+zmm2*4+0x400] -// CHECK: vgatherqps ymm18 {k1}, ymmword ptr [r14 + 8*zmm4 + 123] +// CHECK: vgatherqps ymm18 {k1}, dword ptr [r14 + 8*zmm4 + 123] // CHECK: encoding: [0x62,0xc2,0x7d,0x49,0x93,0x94,0xe6,0x7b,0x00,0x00,0x00] - vgatherqps ymm18{k1},YMMWORD PTR [r14+zmm4*8+0x7b] + vgatherqps ymm18{k1},DWORD PTR [r14+zmm4*8+0x7b] -// CHECK: vgatherqps ymm18 {k1}, ymmword ptr [r9 + zmm4 + 256] +// CHECK: vgatherqps ymm18 {k1}, dword ptr [r9 + zmm4 + 256] // CHECK: encoding: [0x62,0xc2,0x7d,0x49,0x93,0x54,0x21,0x40] - vgatherqps ymm18{k1},YMMWORD PTR [r9+zmm4*1+0x100] + vgatherqps ymm18{k1},DWORD PTR [r9+zmm4*1+0x100] -// CHECK: vgatherqps ymm18 {k1}, ymmword ptr [rcx + 4*zmm4 + 1024] +// CHECK: vgatherqps ymm18 {k1}, dword ptr [rcx + 4*zmm4 + 1024] // CHECK: encoding: [0x62,0xe2,0x7d,0x49,0x93,0x94,0xa1,0x00,0x04,0x00,0x00] - vgatherqps ymm18{k1},YMMWORD PTR [rcx+zmm4*4+0x400] + vgatherqps ymm18{k1},DWORD PTR [rcx+zmm4*4+0x400] -// CHECK: vpgatherdd zmm17 {k1}, zmmword ptr [r14 + 8*zmm11 + 123] +// CHECK: vpgatherdd zmm17 {k1}, dword ptr [r14 + 8*zmm11 + 123] // CHECK: encoding: [0x62,0x82,0x7d,0x49,0x90,0x8c,0xde,0x7b,0x00,0x00,0x00] - vpgatherdd zmm17{k1},ZMMWORD PTR [r14+zmm11*8+0x7b] + vpgatherdd zmm17{k1},DWORD PTR [r14+zmm11*8+0x7b] -// CHECK: vpgatherdd zmm17 {k1}, zmmword ptr [r9 + zmm11 + 256] +// CHECK: vpgatherdd zmm17 {k1}, dword ptr [r9 + zmm11 + 256] // CHECK: encoding: [0x62,0x82,0x7d,0x49,0x90,0x4c,0x19,0x40] - vpgatherdd zmm17{k1},ZMMWORD PTR [r9+zmm11*1+0x100] + vpgatherdd zmm17{k1},DWORD PTR [r9+zmm11*1+0x100] -// CHECK: vpgatherdd zmm17 {k1}, zmmword ptr [rcx + 4*zmm11 + 1024] +// CHECK: vpgatherdd zmm17 {k1}, dword ptr [rcx + 4*zmm11 + 1024] // CHECK: encoding: [0x62,0xa2,0x7d,0x49,0x90,0x8c,0x99,0x00,0x04,0x00,0x00] - vpgatherdd zmm17{k1},ZMMWORD PTR [rcx+zmm11*4+0x400] + vpgatherdd zmm17{k1},DWORD PTR [rcx+zmm11*4+0x400] -// CHECK: vpgatherdq zmm8 {k1}, zmmword ptr [r14 + 8*ymm14 + 123] +// CHECK: vpgatherdq zmm8 {k1}, qword ptr [r14 + 8*ymm14 + 123] // CHECK: encoding: [0x62,0x12,0xfd,0x49,0x90,0x84,0xf6,0x7b,0x00,0x00,0x00] - vpgatherdq zmm8{k1},ZMMWORD PTR [r14+ymm14*8+0x7b] + vpgatherdq zmm8{k1},QWORD PTR [r14+ymm14*8+0x7b] -// CHECK: vpgatherdq zmm8 {k1}, zmmword ptr [r9 + ymm14 + 256] +// CHECK: vpgatherdq zmm8 {k1}, qword ptr [r9 + ymm14 + 256] // CHECK: encoding: [0x62,0x12,0xfd,0x49,0x90,0x44,0x31,0x20] - vpgatherdq zmm8{k1},ZMMWORD PTR [r9+ymm14*1+0x100] + vpgatherdq zmm8{k1},QWORD PTR [r9+ymm14*1+0x100] -// CHECK: vpgatherdq zmm8 {k1}, zmmword ptr [rcx + 4*ymm14 + 1024] +// CHECK: vpgatherdq zmm8 {k1}, qword ptr [rcx + 4*ymm14 + 1024] // CHECK: encoding: [0x62,0x32,0xfd,0x49,0x90,0x84,0xb1,0x00,0x04,0x00,0x00] - vpgatherdq zmm8{k1},ZMMWORD PTR [rcx+ymm14*4+0x400] + vpgatherdq zmm8{k1},QWORD PTR [rcx+ymm14*4+0x400] -// CHECK: vpgatherqd ymm3 {k1}, ymmword ptr [r14 + 8*zmm17 + 123] +// CHECK: vpgatherqd ymm3 {k1}, dword ptr [r14 + 8*zmm17 + 123] // CHECK: encoding: [0x62,0xd2,0x7d,0x41,0x91,0x9c,0xce,0x7b,0x00,0x00,0x00] - vpgatherqd ymm3{k1},YMMWORD PTR [r14+zmm17*8+0x7b] + vpgatherqd ymm3{k1},DWORD PTR [r14+zmm17*8+0x7b] -// CHECK: vpgatherqd ymm3 {k1}, ymmword ptr [r9 + zmm17 + 256] +// CHECK: vpgatherqd ymm3 {k1}, dword ptr [r9 + zmm17 + 256] // CHECK: encoding: [0x62,0xd2,0x7d,0x41,0x91,0x5c,0x09,0x40] - vpgatherqd ymm3{k1},YMMWORD PTR [r9+zmm17*1+0x100] + vpgatherqd ymm3{k1},DWORD PTR [r9+zmm17*1+0x100] -// 
CHECK: vpgatherqd ymm3 {k1}, ymmword ptr [rcx + 4*zmm17 + 1024] +// CHECK: vpgatherqd ymm3 {k1}, dword ptr [rcx + 4*zmm17 + 1024] // CHECK: encoding: [0x62,0xf2,0x7d,0x41,0x91,0x9c,0x89,0x00,0x04,0x00,0x00] - vpgatherqd ymm3{k1},YMMWORD PTR [rcx+zmm17*4+0x400] + vpgatherqd ymm3{k1},DWORD PTR [rcx+zmm17*4+0x400] -// CHECK: vpgatherqq zmm17 {k1}, zmmword ptr [r14 + 8*zmm21 + 123] +// CHECK: vpgatherqq zmm17 {k1}, qword ptr [r14 + 8*zmm21 + 123] // CHECK: encoding: [0x62,0xc2,0xfd,0x41,0x91,0x8c,0xee,0x7b,0x00,0x00,0x00] - vpgatherqq zmm17{k1},ZMMWORD PTR [r14+zmm21*8+0x7b] + vpgatherqq zmm17{k1},QWORD PTR [r14+zmm21*8+0x7b] -// CHECK: vpgatherqq zmm17 {k1}, zmmword ptr [r9 + zmm21 + 256] +// CHECK: vpgatherqq zmm17 {k1}, qword ptr [r9 + zmm21 + 256] // CHECK: encoding: [0x62,0xc2,0xfd,0x41,0x91,0x4c,0x29,0x20] - vpgatherqq zmm17{k1},ZMMWORD PTR [r9+zmm21*1+0x100] + vpgatherqq zmm17{k1},QWORD PTR [r9+zmm21*1+0x100] -// CHECK: vpgatherqq zmm17 {k1}, zmmword ptr [rcx + 4*zmm21 + 1024] +// CHECK: vpgatherqq zmm17 {k1}, qword ptr [rcx + 4*zmm21 + 1024] // CHECK: encoding: [0x62,0xe2,0xfd,0x41,0x91,0x8c,0xa9,0x00,0x04,0x00,0x00] - vpgatherqq zmm17{k1},ZMMWORD PTR [rcx+zmm21*4+0x400] + vpgatherqq zmm17{k1},QWORD PTR [rcx+zmm21*4+0x400] -// CHECK: vpscatterdd zmmword ptr [r14 + 8*zmm16 + 123] {k1}, zmm19 +// CHECK: vpscatterdd dword ptr [r14 + 8*zmm16 + 123] {k1}, zmm19 // CHECK: encoding: [0x62,0xc2,0x7d,0x41,0xa0,0x9c,0xc6,0x7b,0x00,0x00,0x00] - vpscatterdd ZMMWORD PTR [r14+zmm16*8+0x7b]{k1},zmm19 + vpscatterdd DWORD PTR [r14+zmm16*8+0x7b]{k1},zmm19 -// CHECK: vpscatterdd zmmword ptr [r14 + 8*zmm16 + 123] {k1}, zmm19 +// CHECK: vpscatterdd dword ptr [r14 + 8*zmm16 + 123] {k1}, zmm19 // CHECK: encoding: [0x62,0xc2,0x7d,0x41,0xa0,0x9c,0xc6,0x7b,0x00,0x00,0x00] - vpscatterdd ZMMWORD PTR [r14+zmm16*8+0x7b]{k1},zmm19 + vpscatterdd DWORD PTR [r14+zmm16*8+0x7b]{k1},zmm19 -// CHECK: vpscatterdd zmmword ptr [r9 + zmm16 + 256] {k1}, zmm19 +// CHECK: vpscatterdd dword ptr [r9 + zmm16 + 256] {k1}, zmm19 // CHECK: encoding: [0x62,0xc2,0x7d,0x41,0xa0,0x5c,0x01,0x40] - vpscatterdd ZMMWORD PTR [r9+zmm16*1+0x100]{k1},zmm19 + vpscatterdd DWORD PTR [r9+zmm16*1+0x100]{k1},zmm19 -// CHECK: vpscatterdd zmmword ptr [rcx + 4*zmm16 + 1024] {k1}, zmm19 +// CHECK: vpscatterdd dword ptr [rcx + 4*zmm16 + 1024] {k1}, zmm19 // CHECK: encoding: [0x62,0xe2,0x7d,0x41,0xa0,0x9c,0x81,0x00,0x04,0x00,0x00] - vpscatterdd ZMMWORD PTR [rcx+zmm16*4+0x400]{k1},zmm19 + vpscatterdd DWORD PTR [rcx+zmm16*4+0x400]{k1},zmm19 -// CHECK: vpscatterdq zmmword ptr [r14 + 8*ymm6 + 123] {k1}, zmm5 +// CHECK: vpscatterdq qword ptr [r14 + 8*ymm6 + 123] {k1}, zmm5 // CHECK: encoding: [0x62,0xd2,0xfd,0x49,0xa0,0xac,0xf6,0x7b,0x00,0x00,0x00] - vpscatterdq ZMMWORD PTR [r14+ymm6*8+0x7b]{k1},zmm5 + vpscatterdq QWORD PTR [r14+ymm6*8+0x7b]{k1},zmm5 -// CHECK: vpscatterdq zmmword ptr [r14 + 8*ymm6 + 123] {k1}, zmm5 +// CHECK: vpscatterdq qword ptr [r14 + 8*ymm6 + 123] {k1}, zmm5 // CHECK: encoding: [0x62,0xd2,0xfd,0x49,0xa0,0xac,0xf6,0x7b,0x00,0x00,0x00] - vpscatterdq ZMMWORD PTR [r14+ymm6*8+0x7b]{k1},zmm5 + vpscatterdq QWORD PTR [r14+ymm6*8+0x7b]{k1},zmm5 -// CHECK: vpscatterdq zmmword ptr [r9 + ymm6 + 256] {k1}, zmm5 +// CHECK: vpscatterdq qword ptr [r9 + ymm6 + 256] {k1}, zmm5 // CHECK: encoding: [0x62,0xd2,0xfd,0x49,0xa0,0x6c,0x31,0x20] - vpscatterdq ZMMWORD PTR [r9+ymm6*1+0x100]{k1},zmm5 + vpscatterdq QWORD PTR [r9+ymm6*1+0x100]{k1},zmm5 -// CHECK: vpscatterdq zmmword ptr [rcx + 4*ymm6 + 1024] {k1}, zmm5 +// CHECK: vpscatterdq qword ptr [rcx + 4*ymm6 + 1024] 
{k1}, zmm5
 // CHECK: encoding: [0x62,0xf2,0xfd,0x49,0xa0,0xac,0xb1,0x00,0x04,0x00,0x00]
-        vpscatterdq ZMMWORD PTR [rcx+ymm6*4+0x400]{k1},zmm5
+        vpscatterdq QWORD PTR [rcx+ymm6*4+0x400]{k1},zmm5

-// CHECK: vpscatterqd ymmword ptr [r14 + 8*zmm2 + 123] {k1}, ymm20
+// CHECK: vpscatterqd dword ptr [r14 + 8*zmm2 + 123] {k1}, ymm20
 // CHECK: encoding: [0x62,0xc2,0x7d,0x49,0xa1,0xa4,0xd6,0x7b,0x00,0x00,0x00]
-        vpscatterqd YMMWORD PTR [r14+zmm2*8+0x7b]{k1},ymm20
+        vpscatterqd DWORD PTR [r14+zmm2*8+0x7b]{k1},ymm20

-// CHECK: vpscatterqd ymmword ptr [r14 + 8*zmm2 + 123] {k1}, ymm20
+// CHECK: vpscatterqd dword ptr [r14 + 8*zmm2 + 123] {k1}, ymm20
 // CHECK: encoding: [0x62,0xc2,0x7d,0x49,0xa1,0xa4,0xd6,0x7b,0x00,0x00,0x00]
-        vpscatterqd YMMWORD PTR [r14+zmm2*8+0x7b]{k1},ymm20
+        vpscatterqd DWORD PTR [r14+zmm2*8+0x7b]{k1},ymm20

-// CHECK: vpscatterqd ymmword ptr [r9 + zmm2 + 256] {k1}, ymm20
+// CHECK: vpscatterqd dword ptr [r9 + zmm2 + 256] {k1}, ymm20
 // CHECK: encoding: [0x62,0xc2,0x7d,0x49,0xa1,0x64,0x11,0x40]
-        vpscatterqd YMMWORD PTR [r9+zmm2*1+0x100]{k1},ymm20
+        vpscatterqd DWORD PTR [r9+zmm2*1+0x100]{k1},ymm20

-// CHECK: vpscatterqd ymmword ptr [rcx + 4*zmm2 + 1024] {k1}, ymm20
+// CHECK: vpscatterqd dword ptr [rcx + 4*zmm2 + 1024] {k1}, ymm20
 // CHECK: encoding: [0x62,0xe2,0x7d,0x49,0xa1,0xa4,0x91,0x00,0x04,0x00,0x00]
-        vpscatterqd YMMWORD PTR [rcx+zmm2*4+0x400]{k1},ymm20
+        vpscatterqd DWORD PTR [rcx+zmm2*4+0x400]{k1},ymm20

-// CHECK: vpscatterqq zmmword ptr [r14 + 8*zmm20 + 123] {k1}, zmm14
+// CHECK: vpscatterqq qword ptr [r14 + 8*zmm20 + 123] {k1}, zmm14
 // CHECK: encoding: [0x62,0x52,0xfd,0x41,0xa1,0xb4,0xe6,0x7b,0x00,0x00,0x00]
-        vpscatterqq ZMMWORD PTR [r14+zmm20*8+0x7b]{k1},zmm14
+        vpscatterqq QWORD PTR [r14+zmm20*8+0x7b]{k1},zmm14

-// CHECK: vpscatterqq zmmword ptr [r14 + 8*zmm20 + 123] {k1}, zmm14
+// CHECK: vpscatterqq qword ptr [r14 + 8*zmm20 + 123] {k1}, zmm14
 // CHECK: encoding: [0x62,0x52,0xfd,0x41,0xa1,0xb4,0xe6,0x7b,0x00,0x00,0x00]
-        vpscatterqq ZMMWORD PTR [r14+zmm20*8+0x7b]{k1},zmm14
+        vpscatterqq QWORD PTR [r14+zmm20*8+0x7b]{k1},zmm14

-// CHECK: vpscatterqq zmmword ptr [r9 + zmm20 + 256] {k1}, zmm14
+// CHECK: vpscatterqq qword ptr [r9 + zmm20 + 256] {k1}, zmm14
 // CHECK: encoding: [0x62,0x52,0xfd,0x41,0xa1,0x74,0x21,0x20]
-        vpscatterqq ZMMWORD PTR [r9+zmm20*1+0x100]{k1},zmm14
+        vpscatterqq QWORD PTR [r9+zmm20*1+0x100]{k1},zmm14

-// CHECK: vpscatterqq zmmword ptr [rcx + 4*zmm20 + 1024] {k1}, zmm14
+// CHECK: vpscatterqq qword ptr [rcx + 4*zmm20 + 1024] {k1}, zmm14
 // CHECK: encoding: [0x62,0x72,0xfd,0x41,0xa1,0xb4,0xa1,0x00,0x04,0x00,0x00]
-        vpscatterqq ZMMWORD PTR [rcx+zmm20*4+0x400]{k1},zmm14
+        vpscatterqq QWORD PTR [rcx+zmm20*4+0x400]{k1},zmm14

-// CHECK: vscatterdpd zmmword ptr [r14 + 8*ymm24 + 123] {k1}, zmm18
+// CHECK: vscatterdpd qword ptr [r14 + 8*ymm24 + 123] {k1}, zmm18
 // CHECK: encoding: [0x62,0x82,0xfd,0x41,0xa2,0x94,0xc6,0x7b,0x00,0x00,0x00]
-        vscatterdpd ZMMWORD PTR [r14+ymm24*8+0x7b]{k1},zmm18
+        vscatterdpd QWORD PTR [r14+ymm24*8+0x7b]{k1},zmm18

-// CHECK: vscatterdpd zmmword ptr [r14 + 8*ymm24 + 123] {k1}, zmm18
+// CHECK: vscatterdpd qword ptr [r14 + 8*ymm24 + 123] {k1}, zmm18
 // CHECK: encoding: [0x62,0x82,0xfd,0x41,0xa2,0x94,0xc6,0x7b,0x00,0x00,0x00]
-        vscatterdpd ZMMWORD PTR [r14+ymm24*8+0x7b]{k1},zmm18
+        vscatterdpd QWORD PTR [r14+ymm24*8+0x7b]{k1},zmm18

-// CHECK: vscatterdpd zmmword ptr [r9 + ymm24 + 256] {k1}, zmm18
+// CHECK: vscatterdpd qword ptr [r9 + ymm24 + 256] {k1}, zmm18
 // CHECK: encoding: [0x62,0x82,0xfd,0x41,0xa2,0x54,0x01,0x20]
-        vscatterdpd ZMMWORD PTR [r9+ymm24*1+0x100]{k1},zmm18
+        vscatterdpd QWORD PTR [r9+ymm24*1+0x100]{k1},zmm18

-// CHECK: vscatterdpd zmmword ptr [rcx + 4*ymm24 + 1024] {k1}, zmm18
+// CHECK: vscatterdpd qword ptr [rcx + 4*ymm24 + 1024] {k1}, zmm18
 // CHECK: encoding: [0x62,0xa2,0xfd,0x41,0xa2,0x94,0x81,0x00,0x04,0x00,0x00]
-        vscatterdpd ZMMWORD PTR [rcx+ymm24*4+0x400]{k1},zmm18
+        vscatterdpd QWORD PTR [rcx+ymm24*4+0x400]{k1},zmm18

-// CHECK: vscatterdps zmmword ptr [r14 + 8*zmm19 + 123] {k1}, zmm17
+// CHECK: vscatterdps dword ptr [r14 + 8*zmm19 + 123] {k1}, zmm17
 // CHECK: encoding: [0x62,0xc2,0x7d,0x41,0xa2,0x8c,0xde,0x7b,0x00,0x00,0x00]
-        vscatterdps ZMMWORD PTR [r14+zmm19*8+0x7b]{k1},zmm17
+        vscatterdps DWORD PTR [r14+zmm19*8+0x7b]{k1},zmm17

-// CHECK: vscatterdps zmmword ptr [r14 + 8*zmm19 + 123] {k1}, zmm17
+// CHECK: vscatterdps dword ptr [r14 + 8*zmm19 + 123] {k1}, zmm17
 // CHECK: encoding: [0x62,0xc2,0x7d,0x41,0xa2,0x8c,0xde,0x7b,0x00,0x00,0x00]
-        vscatterdps ZMMWORD PTR [r14+zmm19*8+0x7b]{k1},zmm17
+        vscatterdps DWORD PTR [r14+zmm19*8+0x7b]{k1},zmm17

-// CHECK: vscatterdps zmmword ptr [r9 + zmm19 + 256] {k1}, zmm17
+// CHECK: vscatterdps dword ptr [r9 + zmm19 + 256] {k1}, zmm17
 // CHECK: encoding: [0x62,0xc2,0x7d,0x41,0xa2,0x4c,0x19,0x40]
-        vscatterdps ZMMWORD PTR [r9+zmm19*1+0x100]{k1},zmm17
+        vscatterdps DWORD PTR [r9+zmm19*1+0x100]{k1},zmm17

-// CHECK: vscatterdps zmmword ptr [rcx + 4*zmm19 + 1024] {k1}, zmm17
+// CHECK: vscatterdps dword ptr [rcx + 4*zmm19 + 1024] {k1}, zmm17
 // CHECK: encoding: [0x62,0xe2,0x7d,0x41,0xa2,0x8c,0x99,0x00,0x04,0x00,0x00]
-        vscatterdps ZMMWORD PTR [rcx+zmm19*4+0x400]{k1},zmm17
+        vscatterdps DWORD PTR [rcx+zmm19*4+0x400]{k1},zmm17

-// CHECK: vscatterqpd zmmword ptr [r14 + 8*zmm28 + 123] {k1}, zmm22
+// CHECK: vscatterqpd qword ptr [r14 + 8*zmm28 + 123] {k1}, zmm22
 // CHECK: encoding: [0x62,0x82,0xfd,0x41,0xa3,0xb4,0xe6,0x7b,0x00,0x00,0x00]
-        vscatterqpd ZMMWORD PTR [r14+zmm28*8+0x7b]{k1},zmm22
+        vscatterqpd QWORD PTR [r14+zmm28*8+0x7b]{k1},zmm22

-// CHECK: vscatterqpd zmmword ptr [r14 + 8*zmm28 + 123] {k1}, zmm22
+// CHECK: vscatterqpd qword ptr [r14 + 8*zmm28 + 123] {k1}, zmm22
 // CHECK: encoding: [0x62,0x82,0xfd,0x41,0xa3,0xb4,0xe6,0x7b,0x00,0x00,0x00]
-        vscatterqpd ZMMWORD PTR [r14+zmm28*8+0x7b]{k1},zmm22
+        vscatterqpd QWORD PTR [r14+zmm28*8+0x7b]{k1},zmm22

-// CHECK: vscatterqpd zmmword ptr [r9 + zmm28 + 256] {k1}, zmm22
+// CHECK: vscatterqpd qword ptr [r9 + zmm28 + 256] {k1}, zmm22
 // CHECK: encoding: [0x62,0x82,0xfd,0x41,0xa3,0x74,0x21,0x20]
-        vscatterqpd ZMMWORD PTR [r9+zmm28*1+0x100]{k1},zmm22
+        vscatterqpd QWORD PTR [r9+zmm28*1+0x100]{k1},zmm22

-// CHECK: vscatterqpd zmmword ptr [rcx + 4*zmm28 + 1024] {k1}, zmm22
+// CHECK: vscatterqpd qword ptr [rcx + 4*zmm28 + 1024] {k1}, zmm22
 // CHECK: encoding: [0x62,0xa2,0xfd,0x41,0xa3,0xb4,0xa1,0x00,0x04,0x00,0x00]
-        vscatterqpd ZMMWORD PTR [rcx+zmm28*4+0x400]{k1},zmm22
+        vscatterqpd QWORD PTR [rcx+zmm28*4+0x400]{k1},zmm22

-// CHECK: vscatterqps ymmword ptr [r14 + 8*zmm27 + 123] {k1}, ymm6
+// CHECK: vscatterqps dword ptr [r14 + 8*zmm27 + 123] {k1}, ymm6
 // CHECK: encoding: [0x62,0x92,0x7d,0x41,0xa3,0xb4,0xde,0x7b,0x00,0x00,0x00]
-        vscatterqps YMMWORD PTR [r14+zmm27*8+0x7b]{k1},ymm6
+        vscatterqps DWORD PTR [r14+zmm27*8+0x7b]{k1},ymm6

-// CHECK: vscatterqps ymmword ptr [r14 + 8*zmm27 + 123] {k1}, ymm6
+// CHECK: vscatterqps dword ptr [r14 + 8*zmm27 + 123] {k1}, ymm6
 // CHECK: encoding: [0x62,0x92,0x7d,0x41,0xa3,0xb4,0xde,0x7b,0x00,0x00,0x00]
-        vscatterqps YMMWORD PTR [r14+zmm27*8+0x7b]{k1},ymm6
+        vscatterqps DWORD PTR [r14+zmm27*8+0x7b]{k1},ymm6

-// CHECK: vscatterqps ymmword ptr [r9 + zmm27 + 256] {k1}, ymm6
+// CHECK: vscatterqps dword ptr [r9 + zmm27 + 256] {k1}, ymm6
 // CHECK: encoding: [0x62,0x92,0x7d,0x41,0xa3,0x74,0x19,0x40]
-        vscatterqps YMMWORD PTR [r9+zmm27*1+0x100]{k1},ymm6
+        vscatterqps DWORD PTR [r9+zmm27*1+0x100]{k1},ymm6

-// CHECK: vscatterqps ymmword ptr [rcx + 4*zmm27 + 1024] {k1}, ymm6
+// CHECK: vscatterqps dword ptr [rcx + 4*zmm27 + 1024] {k1}, ymm6
 // CHECK: encoding: [0x62,0xb2,0x7d,0x41,0xa3,0xb4,0x99,0x00,0x04,0x00,0x00]
-        vscatterqps YMMWORD PTR [rcx+zmm27*4+0x400]{k1},ymm6
+        vscatterqps DWORD PTR [rcx+zmm27*4+0x400]{k1},ymm6

-// CHECK: vscatterdpd zmmword ptr [r14 + 8*ymm27 - 123] {k1}, zmm18
+// CHECK: vscatterdpd qword ptr [r14 + 8*ymm27 - 123] {k1}, zmm18
 // CHECK: encoding: [0x62,0x82,0xfd,0x41,0xa2,0x94,0xde,0x85,0xff,0xff,0xff]
-        vscatterdpd ZMMWORD PTR [r14+ymm27*8-0x7b]{k1},zmm18
+        vscatterdpd QWORD PTR [r14+ymm27*8-0x7b]{k1},zmm18

-// CHECK: vscatterdpd zmmword ptr [r14 + 8*ymm27 - 123] {k1}, zmm18
+// CHECK: vscatterdpd qword ptr [r14 + 8*ymm27 - 123] {k1}, zmm18
 // CHECK: encoding: [0x62,0x82,0xfd,0x41,0xa2,0x94,0xde,0x85,0xff,0xff,0xff]
-        vscatterdpd ZMMWORD PTR [r14+ymm27*8-0x7b]{k1},zmm18
+        vscatterdpd QWORD PTR [r14+ymm27*8-0x7b]{k1},zmm18

-// CHECK: vscatterdpd zmmword ptr [r9 + ymm27 + 256] {k1}, zmm18
+// CHECK: vscatterdpd qword ptr [r9 + ymm27 + 256] {k1}, zmm18
 // CHECK: encoding: [0x62,0x82,0xfd,0x41,0xa2,0x54,0x19,0x20]
-        vscatterdpd ZMMWORD PTR [r9+ymm27*1+0x100]{k1},zmm18
+        vscatterdpd QWORD PTR [r9+ymm27*1+0x100]{k1},zmm18

-// CHECK: vscatterdpd zmmword ptr [rcx + 4*ymm27 + 1024] {k1}, zmm18
+// CHECK: vscatterdpd qword ptr [rcx + 4*ymm27 + 1024] {k1}, zmm18
 // CHECK: encoding: [0x62,0xa2,0xfd,0x41,0xa2,0x94,0x99,0x00,0x04,0x00,0x00]
-        vscatterdpd ZMMWORD PTR [rcx+ymm27*4+0x400]{k1},zmm18
+        vscatterdpd QWORD PTR [rcx+ymm27*4+0x400]{k1},zmm18

-// CHECK: vscatterdps zmmword ptr [r14 + 8*zmm17 - 123] {k1}, zmm1
+// CHECK: vscatterdps dword ptr [r14 + 8*zmm17 - 123] {k1}, zmm1
 // CHECK: encoding: [0x62,0xd2,0x7d,0x41,0xa2,0x8c,0xce,0x85,0xff,0xff,0xff]
-        vscatterdps ZMMWORD PTR [r14+zmm17*8-0x7b]{k1},zmm1
+        vscatterdps DWORD PTR [r14+zmm17*8-0x7b]{k1},zmm1

-// CHECK: vscatterdps zmmword ptr [r14 + 8*zmm17 - 123] {k1}, zmm1
+// CHECK: vscatterdps dword ptr [r14 + 8*zmm17 - 123] {k1}, zmm1
 // CHECK: encoding: [0x62,0xd2,0x7d,0x41,0xa2,0x8c,0xce,0x85,0xff,0xff,0xff]
-        vscatterdps ZMMWORD PTR [r14+zmm17*8-0x7b]{k1},zmm1
+        vscatterdps DWORD PTR [r14+zmm17*8-0x7b]{k1},zmm1

-// CHECK: vscatterdps zmmword ptr [r9 + zmm17 + 256] {k1}, zmm1
+// CHECK: vscatterdps dword ptr [r9 + zmm17 + 256] {k1}, zmm1
 // CHECK: encoding: [0x62,0xd2,0x7d,0x41,0xa2,0x4c,0x09,0x40]
-        vscatterdps ZMMWORD PTR [r9+zmm17*1+0x100]{k1},zmm1
+        vscatterdps DWORD PTR [r9+zmm17*1+0x100]{k1},zmm1

-// CHECK: vscatterdps zmmword ptr [rcx + 4*zmm17 + 1024] {k1}, zmm1
+// CHECK: vscatterdps dword ptr [rcx + 4*zmm17 + 1024] {k1}, zmm1
 // CHECK: encoding: [0x62,0xf2,0x7d,0x41,0xa2,0x8c,0x89,0x00,0x04,0x00,0x00]
-        vscatterdps ZMMWORD PTR [rcx+zmm17*4+0x400]{k1},zmm1
+        vscatterdps DWORD PTR [rcx+zmm17*4+0x400]{k1},zmm1

-// CHECK: vscatterqpd zmmword ptr [r14 + 8*zmm25 - 123] {k1}, zmm8
+// CHECK: vscatterqpd qword ptr [r14 + 8*zmm25 - 123] {k1}, zmm8
 // CHECK: encoding: [0x62,0x12,0xfd,0x41,0xa3,0x84,0xce,0x85,0xff,0xff,0xff]
-        vscatterqpd ZMMWORD PTR [r14+zmm25*8-0x7b]{k1},zmm8
+        vscatterqpd QWORD PTR [r14+zmm25*8-0x7b]{k1},zmm8

-// CHECK: vscatterqpd zmmword ptr [r14 + 8*zmm25 - 123] {k1}, zmm8
+// CHECK: vscatterqpd qword ptr [r14 + 8*zmm25 - 123] {k1}, zmm8
 // CHECK: encoding: [0x62,0x12,0xfd,0x41,0xa3,0x84,0xce,0x85,0xff,0xff,0xff]
-        vscatterqpd ZMMWORD PTR [r14+zmm25*8-0x7b]{k1},zmm8
+        vscatterqpd QWORD PTR [r14+zmm25*8-0x7b]{k1},zmm8

-// CHECK: vscatterqpd zmmword ptr [r9 + zmm25 + 256] {k1}, zmm8
+// CHECK: vscatterqpd qword ptr [r9 + zmm25 + 256] {k1}, zmm8
 // CHECK: encoding: [0x62,0x12,0xfd,0x41,0xa3,0x44,0x09,0x20]
-        vscatterqpd ZMMWORD PTR [r9+zmm25*1+0x100]{k1},zmm8
+        vscatterqpd QWORD PTR [r9+zmm25*1+0x100]{k1},zmm8

-// CHECK: vscatterqpd zmmword ptr [rcx + 4*zmm25 + 1024] {k1}, zmm8
+// CHECK: vscatterqpd qword ptr [rcx + 4*zmm25 + 1024] {k1}, zmm8
 // CHECK: encoding: [0x62,0x32,0xfd,0x41,0xa3,0x84,0x89,0x00,0x04,0x00,0x00]
-        vscatterqpd ZMMWORD PTR [rcx+zmm25*4+0x400]{k1},zmm8
+        vscatterqpd QWORD PTR [rcx+zmm25*4+0x400]{k1},zmm8

-// CHECK: vscatterqps ymmword ptr [r14 + 8*zmm10 - 123] {k1}, ymm13
+// CHECK: vscatterqps dword ptr [r14 + 8*zmm10 - 123] {k1}, ymm13
 // CHECK: encoding: [0x62,0x12,0x7d,0x49,0xa3,0xac,0xd6,0x85,0xff,0xff,0xff]
-        vscatterqps YMMWORD PTR [r14+zmm10*8-0x7b]{k1},ymm13
+        vscatterqps DWORD PTR [r14+zmm10*8-0x7b]{k1},ymm13

-// CHECK: vscatterqps ymmword ptr [r14 + 8*zmm10 - 123] {k1}, ymm13
+// CHECK: vscatterqps dword ptr [r14 + 8*zmm10 - 123] {k1}, ymm13
 // CHECK: encoding: [0x62,0x12,0x7d,0x49,0xa3,0xac,0xd6,0x85,0xff,0xff,0xff]
-        vscatterqps YMMWORD PTR [r14+zmm10*8-0x7b]{k1},ymm13
+        vscatterqps DWORD PTR [r14+zmm10*8-0x7b]{k1},ymm13

-// CHECK: vscatterqps ymmword ptr [r9 + zmm10 + 256] {k1}, ymm13
+// CHECK: vscatterqps dword ptr [r9 + zmm10 + 256] {k1}, ymm13
 // CHECK: encoding: [0x62,0x12,0x7d,0x49,0xa3,0x6c,0x11,0x40]
-        vscatterqps YMMWORD PTR [r9+zmm10*1+0x100]{k1},ymm13
+        vscatterqps DWORD PTR [r9+zmm10*1+0x100]{k1},ymm13

-// CHECK: vscatterqps ymmword ptr [rcx + 4*zmm10 + 1024] {k1}, ymm13
+// CHECK: vscatterqps dword ptr [rcx + 4*zmm10 + 1024] {k1}, ymm13
 // CHECK: encoding: [0x62,0x32,0x7d,0x49,0xa3,0xac,0x91,0x00,0x04,0x00,0x00]
-        vscatterqps YMMWORD PTR [rcx+zmm10*4+0x400]{k1},ymm13
+        vscatterqps DWORD PTR [rcx+zmm10*4+0x400]{k1},ymm13

-// CHECK: vgatherdpd zmm30 {k1}, zmmword ptr [r14 + 8*ymm5 - 123]
+// CHECK: vgatherdpd zmm30 {k1}, qword ptr [r14 + 8*ymm5 - 123]
 // CHECK: encoding: [0x62,0x42,0xfd,0x49,0x92,0xb4,0xee,0x85,0xff,0xff,0xff]
-        vgatherdpd zmm30{k1},ZMMWORD PTR [r14+ymm5*8-0x7b]
+        vgatherdpd zmm30{k1},QWORD PTR [r14+ymm5*8-0x7b]

-// CHECK: vgatherdpd zmm30 {k1}, zmmword ptr [r9 + ymm5 + 256]
+// CHECK: vgatherdpd zmm30 {k1}, qword ptr [r9 + ymm5 + 256]
 // CHECK: encoding: [0x62,0x42,0xfd,0x49,0x92,0x74,0x29,0x20]
-        vgatherdpd zmm30{k1},ZMMWORD PTR [r9+ymm5*1+0x100]
+        vgatherdpd zmm30{k1},QWORD PTR [r9+ymm5*1+0x100]

-// CHECK: vgatherdpd zmm30 {k1}, zmmword ptr [rcx + 4*ymm5 + 1024]
+// CHECK: vgatherdpd zmm30 {k1}, qword ptr [rcx + 4*ymm5 + 1024]
 // CHECK: encoding: [0x62,0x62,0xfd,0x49,0x92,0xb4,0xa9,0x00,0x04,0x00,0x00]
-        vgatherdpd zmm30{k1},ZMMWORD PTR [rcx+ymm5*4+0x400]
+        vgatherdpd zmm30{k1},QWORD PTR [rcx+ymm5*4+0x400]

-// CHECK: vgatherdps zmm8 {k1}, zmmword ptr [r14 + 8*zmm26 - 123]
+// CHECK: vgatherdps zmm8 {k1}, dword ptr [r14 + 8*zmm26 - 123]
 // CHECK: encoding: [0x62,0x12,0x7d,0x41,0x92,0x84,0xd6,0x85,0xff,0xff,0xff]
-        vgatherdps zmm8{k1},ZMMWORD PTR [r14+zmm26*8-0x7b]
+        vgatherdps zmm8{k1},DWORD PTR [r14+zmm26*8-0x7b]

-// CHECK: vgatherdps zmm8 {k1}, zmmword ptr [r9 + zmm26 + 256]
+// CHECK: vgatherdps zmm8 {k1}, dword ptr [r9 + zmm26 + 256]
 // CHECK: encoding: [0x62,0x12,0x7d,0x41,0x92,0x44,0x11,0x40]
-        vgatherdps zmm8{k1},ZMMWORD PTR [r9+zmm26*1+0x100]
+        vgatherdps zmm8{k1},DWORD PTR [r9+zmm26*1+0x100]

-// CHECK: vgatherdps zmm8 {k1}, zmmword ptr [rcx + 4*zmm26 + 1024]
+// CHECK: vgatherdps zmm8 {k1}, dword ptr [rcx + 4*zmm26 + 1024]
 // CHECK: encoding: [0x62,0x32,0x7d,0x41,0x92,0x84,0x91,0x00,0x04,0x00,0x00]
-        vgatherdps zmm8{k1},ZMMWORD PTR [rcx+zmm26*4+0x400]
+        vgatherdps zmm8{k1},DWORD PTR [rcx+zmm26*4+0x400]

-// CHECK: vgatherqpd zmm27 {k1}, zmmword ptr [r14 + 8*zmm13 - 123]
+// CHECK: vgatherqpd zmm27 {k1}, qword ptr [r14 + 8*zmm13 - 123]
 // CHECK: encoding: [0x62,0x02,0xfd,0x49,0x93,0x9c,0xee,0x85,0xff,0xff,0xff]
-        vgatherqpd zmm27{k1},ZMMWORD PTR [r14+zmm13*8-0x7b]
+        vgatherqpd zmm27{k1},QWORD PTR [r14+zmm13*8-0x7b]

-// CHECK: vgatherqpd zmm27 {k1}, zmmword ptr [r9 + zmm13 + 256]
+// CHECK: vgatherqpd zmm27 {k1}, qword ptr [r9 + zmm13 + 256]
 // CHECK: encoding: [0x62,0x02,0xfd,0x49,0x93,0x5c,0x29,0x20]
-        vgatherqpd zmm27{k1},ZMMWORD PTR [r9+zmm13*1+0x100]
+        vgatherqpd zmm27{k1},QWORD PTR [r9+zmm13*1+0x100]

-// CHECK: vgatherqpd zmm27 {k1}, zmmword ptr [rcx + 4*zmm13 + 1024]
+// CHECK: vgatherqpd zmm27 {k1}, qword ptr [rcx + 4*zmm13 + 1024]
 // CHECK: encoding: [0x62,0x22,0xfd,0x49,0x93,0x9c,0xa9,0x00,0x04,0x00,0x00]
-        vgatherqpd zmm27{k1},ZMMWORD PTR [rcx+zmm13*4+0x400]
+        vgatherqpd zmm27{k1},QWORD PTR [rcx+zmm13*4+0x400]

-// CHECK: vgatherqps ymm27 {k1}, ymmword ptr [r14 + 8*zmm14 - 123]
+// CHECK: vgatherqps ymm27 {k1}, dword ptr [r14 + 8*zmm14 - 123]
 // CHECK: encoding: [0x62,0x02,0x7d,0x49,0x93,0x9c,0xf6,0x85,0xff,0xff,0xff]
-        vgatherqps ymm27{k1},YMMWORD PTR [r14+zmm14*8-0x7b]
+        vgatherqps ymm27{k1},DWORD PTR [r14+zmm14*8-0x7b]

-// CHECK: vgatherqps ymm27 {k1}, ymmword ptr [r9 + zmm14 + 256]
+// CHECK: vgatherqps ymm27 {k1}, dword ptr [r9 + zmm14 + 256]
 // CHECK: encoding: [0x62,0x02,0x7d,0x49,0x93,0x5c,0x31,0x40]
-        vgatherqps ymm27{k1},YMMWORD PTR [r9+zmm14*1+0x100]
+        vgatherqps ymm27{k1},DWORD PTR [r9+zmm14*1+0x100]

-// CHECK: vgatherqps ymm27 {k1}, ymmword ptr [rcx + 4*zmm14 + 1024]
+// CHECK: vgatherqps ymm27 {k1}, dword ptr [rcx + 4*zmm14 + 1024]
 // CHECK: encoding: [0x62,0x22,0x7d,0x49,0x93,0x9c,0xb1,0x00,0x04,0x00,0x00]
-        vgatherqps ymm27{k1},YMMWORD PTR [rcx+zmm14*4+0x400]
+        vgatherqps ymm27{k1},DWORD PTR [rcx+zmm14*4+0x400]

-// CHECK: vpgatherdd zmm7 {k1}, zmmword ptr [r14 + 8*zmm16 - 123]
+// CHECK: vpgatherdd zmm7 {k1}, dword ptr [r14 + 8*zmm16 - 123]
 // CHECK: encoding: [0x62,0xd2,0x7d,0x41,0x90,0xbc,0xc6,0x85,0xff,0xff,0xff]
-        vpgatherdd zmm7{k1},ZMMWORD PTR [r14+zmm16*8-0x7b]
+        vpgatherdd zmm7{k1},DWORD PTR [r14+zmm16*8-0x7b]

-// CHECK: vpgatherdd zmm7 {k1}, zmmword ptr [r9 + zmm16 + 256]
+// CHECK: vpgatherdd zmm7 {k1}, dword ptr [r9 + zmm16 + 256]
 // CHECK: encoding: [0x62,0xd2,0x7d,0x41,0x90,0x7c,0x01,0x40]
-        vpgatherdd zmm7{k1},ZMMWORD PTR [r9+zmm16*1+0x100]
+        vpgatherdd zmm7{k1},DWORD PTR [r9+zmm16*1+0x100]

-// CHECK: vpgatherdd zmm7 {k1}, zmmword ptr [rcx + 4*zmm16 + 1024]
+// CHECK: vpgatherdd zmm7 {k1}, dword ptr [rcx + 4*zmm16 + 1024]
 // CHECK: encoding: [0x62,0xf2,0x7d,0x41,0x90,0xbc,0x81,0x00,0x04,0x00,0x00]
-        vpgatherdd zmm7{k1},ZMMWORD PTR [rcx+zmm16*4+0x400]
+        vpgatherdd zmm7{k1},DWORD PTR [rcx+zmm16*4+0x400]

-// CHECK: vpgatherdq zmm25 {k1}, zmmword ptr [r14 + 8*ymm7 - 123]
+// CHECK: vpgatherdq zmm25 {k1}, qword ptr [r14 + 8*ymm7 - 123]
 // CHECK: encoding: [0x62,0x42,0xfd,0x49,0x90,0x8c,0xfe,0x85,0xff,0xff,0xff]
-        vpgatherdq zmm25{k1},ZMMWORD PTR [r14+ymm7*8-0x7b]
+        vpgatherdq zmm25{k1},QWORD PTR [r14+ymm7*8-0x7b]

-// CHECK: vpgatherdq zmm25 {k1}, zmmword ptr [r9 + ymm7 + 256]
+// CHECK: vpgatherdq zmm25 {k1}, qword ptr [r9 + ymm7 + 256]
 // CHECK: encoding: [0x62,0x42,0xfd,0x49,0x90,0x4c,0x39,0x20]
-        vpgatherdq zmm25{k1},ZMMWORD PTR [r9+ymm7*1+0x100]
+        vpgatherdq zmm25{k1},QWORD PTR [r9+ymm7*1+0x100]

-// CHECK: vpgatherdq zmm25 {k1}, zmmword ptr [rcx + 4*ymm7 + 1024]
+// CHECK: vpgatherdq zmm25 {k1}, qword ptr [rcx + 4*ymm7 + 1024]
 // CHECK: encoding: [0x62,0x62,0xfd,0x49,0x90,0x8c,0xb9,0x00,0x04,0x00,0x00]
-        vpgatherdq zmm25{k1},ZMMWORD PTR [rcx+ymm7*4+0x400]
+        vpgatherdq zmm25{k1},QWORD PTR [rcx+ymm7*4+0x400]

-// CHECK: vpgatherqd ymm19 {k1}, ymmword ptr [r14 + 8*zmm17 - 123]
+// CHECK: vpgatherqd ymm19 {k1}, dword ptr [r14 + 8*zmm17 - 123]
 // CHECK: encoding: [0x62,0xc2,0x7d,0x41,0x91,0x9c,0xce,0x85,0xff,0xff,0xff]
-        vpgatherqd ymm19{k1},YMMWORD PTR [r14+zmm17*8-0x7b]
+        vpgatherqd ymm19{k1},DWORD PTR [r14+zmm17*8-0x7b]

-// CHECK: vpgatherqd ymm19 {k1}, ymmword ptr [r9 + zmm17 + 256]
+// CHECK: vpgatherqd ymm19 {k1}, dword ptr [r9 + zmm17 + 256]
 // CHECK: encoding: [0x62,0xc2,0x7d,0x41,0x91,0x5c,0x09,0x40]
-        vpgatherqd ymm19{k1},YMMWORD PTR [r9+zmm17*1+0x100]
+        vpgatherqd ymm19{k1},DWORD PTR [r9+zmm17*1+0x100]

-// CHECK: vpgatherqd ymm19 {k1}, ymmword ptr [rcx + 4*zmm17 + 1024]
+// CHECK: vpgatherqd ymm19 {k1}, dword ptr [rcx + 4*zmm17 + 1024]
 // CHECK: encoding: [0x62,0xe2,0x7d,0x41,0x91,0x9c,0x89,0x00,0x04,0x00,0x00]
-        vpgatherqd ymm19{k1},YMMWORD PTR [rcx+zmm17*4+0x400]
+        vpgatherqd ymm19{k1},DWORD PTR [rcx+zmm17*4+0x400]

-// CHECK: vpgatherqq zmm10 {k1}, zmmword ptr [r14 + 8*zmm13 - 123]
+// CHECK: vpgatherqq zmm10 {k1}, qword ptr [r14 + 8*zmm13 - 123]
 // CHECK: encoding: [0x62,0x12,0xfd,0x49,0x91,0x94,0xee,0x85,0xff,0xff,0xff]
-        vpgatherqq zmm10{k1},ZMMWORD PTR [r14+zmm13*8-0x7b]
+        vpgatherqq zmm10{k1},QWORD PTR [r14+zmm13*8-0x7b]

-// CHECK: vpgatherqq zmm10 {k1}, zmmword ptr [r9 + zmm13 + 256]
+// CHECK: vpgatherqq zmm10 {k1}, qword ptr [r9 + zmm13 + 256]
 // CHECK: encoding: [0x62,0x12,0xfd,0x49,0x91,0x54,0x29,0x20]
-        vpgatherqq zmm10{k1},ZMMWORD PTR [r9+zmm13*1+0x100]
+        vpgatherqq zmm10{k1},QWORD PTR [r9+zmm13*1+0x100]

-// CHECK: vpgatherqq zmm10 {k1}, zmmword ptr [rcx + 4*zmm13 + 1024]
+// CHECK: vpgatherqq zmm10 {k1}, qword ptr [rcx + 4*zmm13 + 1024]
 // CHECK: encoding: [0x62,0x32,0xfd,0x49,0x91,0x94,0xa9,0x00,0x04,0x00,0x00]
-        vpgatherqq zmm10{k1},ZMMWORD PTR [rcx+zmm13*4+0x400]
+        vpgatherqq zmm10{k1},QWORD PTR [rcx+zmm13*4+0x400]

-// CHECK: vpscatterdd zmmword ptr [r14 + 8*zmm4 - 123] {k1}, zmm23
+// CHECK: vpscatterdd dword ptr [r14 + 8*zmm4 - 123] {k1}, zmm23
 // CHECK: encoding: [0x62,0xc2,0x7d,0x49,0xa0,0xbc,0xe6,0x85,0xff,0xff,0xff]
-        vpscatterdd ZMMWORD PTR [r14+zmm4*8-0x7b]{k1},zmm23
+        vpscatterdd DWORD PTR [r14+zmm4*8-0x7b]{k1},zmm23

-// CHECK: vpscatterdd zmmword ptr [r14 + 8*zmm4 - 123] {k1}, zmm23
+// CHECK: vpscatterdd dword ptr [r14 + 8*zmm4 - 123] {k1}, zmm23
 // CHECK: encoding: [0x62,0xc2,0x7d,0x49,0xa0,0xbc,0xe6,0x85,0xff,0xff,0xff]
-        vpscatterdd ZMMWORD PTR [r14+zmm4*8-0x7b]{k1},zmm23
+        vpscatterdd DWORD PTR [r14+zmm4*8-0x7b]{k1},zmm23

-// CHECK: vpscatterdd zmmword ptr [r9 + zmm4 + 256] {k1}, zmm23
+// CHECK: vpscatterdd dword ptr [r9 + zmm4 + 256] {k1}, zmm23
 // CHECK: encoding: [0x62,0xc2,0x7d,0x49,0xa0,0x7c,0x21,0x40]
-        vpscatterdd ZMMWORD PTR [r9+zmm4*1+0x100]{k1},zmm23
+        vpscatterdd DWORD PTR [r9+zmm4*1+0x100]{k1},zmm23

-// CHECK: vpscatterdd zmmword ptr [rcx + 4*zmm4 + 1024] {k1}, zmm23
+// CHECK: vpscatterdd dword ptr [rcx + 4*zmm4 + 1024] {k1}, zmm23
 // CHECK: encoding: [0x62,0xe2,0x7d,0x49,0xa0,0xbc,0xa1,0x00,0x04,0x00,0x00]
-        vpscatterdd ZMMWORD PTR [rcx+zmm4*4+0x400]{k1},zmm23
+        vpscatterdd DWORD PTR [rcx+zmm4*4+0x400]{k1},zmm23

-// CHECK: vpscatterdq zmmword ptr [r14 + 8*ymm25 - 123] {k1}, zmm1
+// CHECK: vpscatterdq qword ptr [r14 + 8*ymm25 - 123] {k1}, zmm1
 // CHECK: encoding: [0x62,0x92,0xfd,0x41,0xa0,0x8c,0xce,0x85,0xff,0xff,0xff]
-        vpscatterdq ZMMWORD PTR [r14+ymm25*8-0x7b]{k1},zmm1
+        vpscatterdq QWORD PTR [r14+ymm25*8-0x7b]{k1},zmm1

-// CHECK: vpscatterdq zmmword ptr [r14 + 8*ymm25 - 123] {k1}, zmm1
+// CHECK: vpscatterdq qword ptr [r14 + 8*ymm25 - 123] {k1}, zmm1
 // CHECK: encoding: [0x62,0x92,0xfd,0x41,0xa0,0x8c,0xce,0x85,0xff,0xff,0xff]
-        vpscatterdq ZMMWORD PTR [r14+ymm25*8-0x7b]{k1},zmm1
+        vpscatterdq QWORD PTR [r14+ymm25*8-0x7b]{k1},zmm1

-// CHECK: vpscatterdq zmmword ptr [r9 + ymm25 + 256] {k1}, zmm1
+// CHECK: vpscatterdq qword ptr [r9 + ymm25 + 256] {k1}, zmm1
 // CHECK: encoding: [0x62,0x92,0xfd,0x41,0xa0,0x4c,0x09,0x20]
-        vpscatterdq ZMMWORD PTR [r9+ymm25*1+0x100]{k1},zmm1
+        vpscatterdq QWORD PTR [r9+ymm25*1+0x100]{k1},zmm1

-// CHECK: vpscatterdq zmmword ptr [rcx + 4*ymm25 + 1024] {k1}, zmm1
+// CHECK: vpscatterdq qword ptr [rcx + 4*ymm25 + 1024] {k1}, zmm1
 // CHECK: encoding: [0x62,0xb2,0xfd,0x41,0xa0,0x8c,0x89,0x00,0x04,0x00,0x00]
-        vpscatterdq ZMMWORD PTR [rcx+ymm25*4+0x400]{k1},zmm1
+        vpscatterdq QWORD PTR [rcx+ymm25*4+0x400]{k1},zmm1

-// CHECK: vpscatterqd ymmword ptr [r14 + 8*zmm22 - 123] {k1}, ymm23
+// CHECK: vpscatterqd dword ptr [r14 + 8*zmm22 - 123] {k1}, ymm23
 // CHECK: encoding: [0x62,0xc2,0x7d,0x41,0xa1,0xbc,0xf6,0x85,0xff,0xff,0xff]
-        vpscatterqd YMMWORD PTR [r14+zmm22*8-0x7b]{k1},ymm23
+        vpscatterqd DWORD PTR [r14+zmm22*8-0x7b]{k1},ymm23

-// CHECK: vpscatterqd ymmword ptr [r14 + 8*zmm22 - 123] {k1}, ymm23
+// CHECK: vpscatterqd dword ptr [r14 + 8*zmm22 - 123] {k1}, ymm23
 // CHECK: encoding: [0x62,0xc2,0x7d,0x41,0xa1,0xbc,0xf6,0x85,0xff,0xff,0xff]
-        vpscatterqd YMMWORD PTR [r14+zmm22*8-0x7b]{k1},ymm23
+        vpscatterqd DWORD PTR [r14+zmm22*8-0x7b]{k1},ymm23

-// CHECK: vpscatterqd ymmword ptr [r9 + zmm22 + 256] {k1}, ymm23
+// CHECK: vpscatterqd dword ptr [r9 + zmm22 + 256] {k1}, ymm23
 // CHECK: encoding: [0x62,0xc2,0x7d,0x41,0xa1,0x7c,0x31,0x40]
-        vpscatterqd YMMWORD PTR [r9+zmm22*1+0x100]{k1},ymm23
+        vpscatterqd DWORD PTR [r9+zmm22*1+0x100]{k1},ymm23

-// CHECK: vpscatterqd ymmword ptr [rcx + 4*zmm22 + 1024] {k1}, ymm23
+// CHECK: vpscatterqd dword ptr [rcx + 4*zmm22 + 1024] {k1}, ymm23
 // CHECK: encoding: [0x62,0xe2,0x7d,0x41,0xa1,0xbc,0xb1,0x00,0x04,0x00,0x00]
-        vpscatterqd YMMWORD PTR [rcx+zmm22*4+0x400]{k1},ymm23
+        vpscatterqd DWORD PTR [rcx+zmm22*4+0x400]{k1},ymm23

-// CHECK: vpscatterqq zmmword ptr [r14 + 8*zmm8 - 123] {k1}, zmm2
+// CHECK: vpscatterqq qword ptr [r14 + 8*zmm8 - 123] {k1}, zmm2
 // CHECK: encoding: [0x62,0x92,0xfd,0x49,0xa1,0x94,0xc6,0x85,0xff,0xff,0xff]
-        vpscatterqq ZMMWORD PTR [r14+zmm8*8-0x7b]{k1},zmm2
+        vpscatterqq QWORD PTR [r14+zmm8*8-0x7b]{k1},zmm2

-// CHECK: vpscatterqq zmmword ptr [r14 + 8*zmm8 - 123] {k1}, zmm2
+// CHECK: vpscatterqq qword ptr [r14 + 8*zmm8 - 123] {k1}, zmm2
 // CHECK: encoding: [0x62,0x92,0xfd,0x49,0xa1,0x94,0xc6,0x85,0xff,0xff,0xff]
-        vpscatterqq ZMMWORD PTR [r14+zmm8*8-0x7b]{k1},zmm2
+        vpscatterqq QWORD PTR [r14+zmm8*8-0x7b]{k1},zmm2

-// CHECK: vpscatterqq zmmword ptr [r9 + zmm8 + 256] {k1}, zmm2
+// CHECK: vpscatterqq qword ptr [r9 + zmm8 + 256] {k1}, zmm2
 // CHECK: encoding: [0x62,0x92,0xfd,0x49,0xa1,0x54,0x01,0x20]
-        vpscatterqq ZMMWORD PTR [r9+zmm8*1+0x100]{k1},zmm2
+        vpscatterqq QWORD PTR [r9+zmm8*1+0x100]{k1},zmm2

-// CHECK: vpscatterqq zmmword ptr [rcx + 4*zmm8 + 1024] {k1}, zmm2
+// CHECK: vpscatterqq qword ptr [rcx + 4*zmm8 + 1024] {k1}, zmm2
 // CHECK: encoding: [0x62,0xb2,0xfd,0x49,0xa1,0x94,0x81,0x00,0x04,0x00,0x00]
-        vpscatterqq ZMMWORD PTR [rcx+zmm8*4+0x400]{k1},zmm2
+        vpscatterqq QWORD PTR [rcx+zmm8*4+0x400]{k1},zmm2
diff --git a/llvm/test/MC/X86/avx512f_vl-intel.s b/llvm/test/MC/X86/avx512f_vl-intel.s
index 31c43afe50171..ed3292b83f4d7 100644
--- a/llvm/test/MC/X86/avx512f_vl-intel.s
+++ b/llvm/test/MC/X86/avx512f_vl-intel.s
@@ -224,901 +224,901 @@
 // CHECK: encoding: [0x62,0xf1,0x64,0x30,0xc2,0xa2,0xfc,0xfd,0xff,0xff,0x7b]
         vcmpps k4,ymm19,DWORD PTR [rdx-0x204]{1to8},0x7b

-// CHECK: vgatherdpd xmm17 {k1}, xmmword ptr [r14 + 8*xmm31 + 123]
+// CHECK: vgatherdpd xmm17 {k1}, qword ptr [r14 + 8*xmm31 + 123]
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0x92,0x8c,0xfe,0x7b,0x00,0x00,0x00]
-        vgatherdpd xmm17 {k1}, xmmword ptr [r14 + 8*xmm31 + 123]
+        vgatherdpd xmm17 {k1}, qword ptr [r14 + 8*xmm31 + 123]

-// CHECK: vgatherdpd xmm17 {k1}, xmmword ptr [r9 + xmm31 + 256]
+// CHECK: vgatherdpd xmm17 {k1}, qword ptr [r9 + xmm31 + 256]
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0x92,0x4c,0x39,0x20]
-        vgatherdpd xmm17 {k1}, xmmword ptr [r9 + xmm31 + 256]
+        vgatherdpd xmm17 {k1}, qword ptr [r9 + xmm31 + 256]

-// CHECK: vgatherdpd xmm17 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+// CHECK: vgatherdpd xmm17 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
 // CHECK: encoding: [0x62,0xa2,0xfd,0x01,0x92,0x8c,0xb9,0x00,0x04,0x00,0x00]
-        vgatherdpd xmm17 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+        vgatherdpd xmm17 {k1}, qword ptr [rcx + 4*xmm31 + 1024]

-// CHECK: vgatherdpd ymm23 {k1}, ymmword ptr [r14 + 8*xmm31 + 123]
+// CHECK: vgatherdpd ymm23 {k1}, qword ptr [r14 + 8*xmm31 + 123]
 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0x92,0xbc,0xfe,0x7b,0x00,0x00,0x00]
-        vgatherdpd ymm23 {k1}, ymmword ptr [r14 + 8*xmm31 + 123]
+        vgatherdpd ymm23 {k1}, qword ptr [r14 + 8*xmm31 + 123]

-// CHECK: vgatherdpd ymm23 {k1}, ymmword ptr [r9 + xmm31 + 256]
+// CHECK: vgatherdpd ymm23 {k1}, qword ptr [r9 + xmm31 + 256]
 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0x92,0x7c,0x39,0x20]
-        vgatherdpd ymm23 {k1}, ymmword ptr [r9 + xmm31 + 256]
+        vgatherdpd ymm23 {k1}, qword ptr [r9 + xmm31 + 256]

-// CHECK: vgatherdpd ymm23 {k1}, ymmword ptr [rcx + 4*xmm31 + 1024]
+// CHECK: vgatherdpd ymm23 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
 // CHECK: encoding: [0x62,0xa2,0xfd,0x21,0x92,0xbc,0xb9,0x00,0x04,0x00,0x00]
-        vgatherdpd ymm23 {k1}, ymmword ptr [rcx + 4*xmm31 + 1024]
+        vgatherdpd ymm23 {k1}, qword ptr [rcx + 4*xmm31 + 1024]

-// CHECK: vgatherdpd xmm23 {k1}, xmmword ptr [r14 + 8*xmm31 - 123]
+// CHECK: vgatherdpd xmm23 {k1}, qword ptr [r14 + 8*xmm31 - 123]
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0x92,0xbc,0xfe,0x85,0xff,0xff,0xff]
-        vgatherdpd xmm23 {k1}, xmmword ptr [r14 + 8*xmm31 - 123]
+        vgatherdpd xmm23 {k1}, qword ptr [r14 + 8*xmm31 - 123]

-// CHECK: vgatherdpd xmm23 {k1}, xmmword ptr [r9 + xmm31 + 256]
+// CHECK: vgatherdpd xmm23 {k1}, qword ptr [r9 + xmm31 + 256]
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0x92,0x7c,0x39,0x20]
-        vgatherdpd xmm23 {k1}, xmmword ptr [r9 + xmm31 + 256]
+        vgatherdpd xmm23 {k1}, qword ptr [r9 + xmm31 + 256]

-// CHECK: vgatherdpd xmm23 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+// CHECK: vgatherdpd xmm23 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
 // CHECK: encoding: [0x62,0xa2,0xfd,0x01,0x92,0xbc,0xb9,0x00,0x04,0x00,0x00]
-        vgatherdpd xmm23 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+        vgatherdpd xmm23 {k1}, qword ptr [rcx + 4*xmm31 + 1024]

-// CHECK: vgatherdpd ymm18 {k1}, ymmword ptr [r14 + 8*xmm31 - 123]
+// CHECK: vgatherdpd ymm18 {k1}, qword ptr [r14 + 8*xmm31 - 123]
 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0x92,0x94,0xfe,0x85,0xff,0xff,0xff]
-        vgatherdpd ymm18 {k1}, ymmword ptr [r14 + 8*xmm31 - 123]
+        vgatherdpd ymm18 {k1}, qword ptr [r14 + 8*xmm31 - 123]

-// CHECK: vgatherdpd ymm18 {k1}, ymmword ptr [r9 + xmm31 + 256]
+// CHECK: vgatherdpd ymm18 {k1}, qword ptr [r9 + xmm31 + 256]
 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0x92,0x54,0x39,0x20]
-        vgatherdpd ymm18 {k1}, ymmword ptr [r9 + xmm31 + 256]
+        vgatherdpd ymm18 {k1}, qword ptr [r9 + xmm31 + 256]

-// CHECK: vgatherdpd ymm18 {k1}, ymmword ptr [rcx + 4*xmm31 + 1024]
+// CHECK: vgatherdpd ymm18 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
 // CHECK: encoding: [0x62,0xa2,0xfd,0x21,0x92,0x94,0xb9,0x00,0x04,0x00,0x00]
-        vgatherdpd ymm18 {k1}, ymmword ptr [rcx + 4*xmm31 + 1024]
+        vgatherdpd ymm18 {k1}, qword ptr [rcx + 4*xmm31 + 1024]

-// CHECK: vgatherdps xmm18 {k1}, xmmword ptr [r14 + 8*xmm31 + 123]
+// CHECK: vgatherdps xmm18 {k1}, dword ptr [r14 + 8*xmm31 + 123]
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0x92,0x94,0xfe,0x7b,0x00,0x00,0x00]
-        vgatherdps xmm18 {k1}, xmmword ptr [r14 + 8*xmm31 + 123]
+        vgatherdps xmm18 {k1}, dword ptr [r14 + 8*xmm31 + 123]

-// CHECK: vgatherdps xmm18 {k1}, xmmword ptr [r9 + xmm31 + 256]
+// CHECK: vgatherdps xmm18 {k1}, dword ptr [r9 + xmm31 + 256]
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0x92,0x54,0x39,0x40]
-        vgatherdps xmm18 {k1}, xmmword ptr [r9 + xmm31 + 256]
+        vgatherdps xmm18 {k1}, dword ptr [r9 + xmm31 + 256]

-// CHECK: vgatherdps xmm18 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+// CHECK: vgatherdps xmm18 {k1}, dword ptr [rcx + 4*xmm31 + 1024]
 // CHECK: encoding: [0x62,0xa2,0x7d,0x01,0x92,0x94,0xb9,0x00,0x04,0x00,0x00]
-        vgatherdps xmm18 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+        vgatherdps xmm18 {k1}, dword ptr [rcx + 4*xmm31 + 1024]

-// CHECK: vgatherdps ymm27 {k1}, ymmword ptr [r14 + 8*ymm31 + 123]
+// CHECK: vgatherdps ymm27 {k1}, dword ptr [r14 + 8*ymm31 + 123]
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0x92,0x9c,0xfe,0x7b,0x00,0x00,0x00]
-        vgatherdps ymm27 {k1}, ymmword ptr [r14 + 8*ymm31 + 123]
+        vgatherdps ymm27 {k1}, dword ptr [r14 + 8*ymm31 + 123]

-// CHECK: vgatherdps ymm27 {k1}, ymmword ptr [r9 + ymm31 + 256]
+// CHECK: vgatherdps ymm27 {k1}, dword ptr [r9 + ymm31 + 256]
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0x92,0x5c,0x39,0x40]
-        vgatherdps ymm27 {k1}, ymmword ptr [r9 + ymm31 + 256]
+        vgatherdps ymm27 {k1}, dword ptr [r9 + ymm31 + 256]

-// CHECK: vgatherdps ymm27 {k1}, ymmword ptr [rcx + 4*ymm31 + 1024]
+// CHECK: vgatherdps ymm27 {k1}, dword ptr [rcx + 4*ymm31 + 1024]
 // CHECK: encoding: [0x62,0x22,0x7d,0x21,0x92,0x9c,0xb9,0x00,0x04,0x00,0x00]
-        vgatherdps ymm27 {k1}, ymmword ptr [rcx + 4*ymm31 + 1024]
+        vgatherdps ymm27 {k1}, dword ptr [rcx + 4*ymm31 + 1024]

-// CHECK: vgatherdps xmm29 {k1}, xmmword ptr [r14 + 8*xmm31 - 123]
+// CHECK: vgatherdps xmm29 {k1}, dword ptr [r14 + 8*xmm31 - 123]
 // CHECK: encoding: [0x62,0x02,0x7d,0x01,0x92,0xac,0xfe,0x85,0xff,0xff,0xff]
-        vgatherdps xmm29 {k1}, xmmword ptr [r14 + 8*xmm31 - 123]
+        vgatherdps xmm29 {k1}, dword ptr [r14 + 8*xmm31 - 123]

-// CHECK: vgatherdps xmm29 {k1}, xmmword ptr [r9 + xmm31 + 256]
+// CHECK: vgatherdps xmm29 {k1}, dword ptr [r9 + xmm31 + 256]
 // CHECK: encoding: [0x62,0x02,0x7d,0x01,0x92,0x6c,0x39,0x40]
-        vgatherdps xmm29 {k1}, xmmword ptr [r9 + xmm31 + 256]
+        vgatherdps xmm29 {k1}, dword ptr [r9 + xmm31 + 256]

-// CHECK: vgatherdps xmm29 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+// CHECK: vgatherdps xmm29 {k1}, dword ptr [rcx + 4*xmm31 + 1024]
 // CHECK: encoding: [0x62,0x22,0x7d,0x01,0x92,0xac,0xb9,0x00,0x04,0x00,0x00]
-        vgatherdps xmm29 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+        vgatherdps xmm29 {k1}, dword ptr [rcx + 4*xmm31 + 1024]

-// CHECK: vgatherdps ymm21 {k1}, ymmword ptr [r14 + 8*ymm31 - 123]
+// CHECK: vgatherdps ymm21 {k1}, dword ptr [r14 + 8*ymm31 - 123]
 // CHECK: encoding: [0x62,0x82,0x7d,0x21,0x92,0xac,0xfe,0x85,0xff,0xff,0xff]
-        vgatherdps ymm21 {k1}, ymmword ptr [r14 + 8*ymm31 - 123]
+        vgatherdps ymm21 {k1}, dword ptr [r14 + 8*ymm31 - 123]

-// CHECK: vgatherdps ymm21 {k1}, ymmword ptr [r9 + ymm31 + 256]
+// CHECK: vgatherdps ymm21 {k1}, dword ptr [r9 + ymm31 + 256]
 // CHECK: encoding: [0x62,0x82,0x7d,0x21,0x92,0x6c,0x39,0x40]
-        vgatherdps ymm21 {k1}, ymmword ptr [r9 + ymm31 + 256]
+        vgatherdps ymm21 {k1}, dword ptr [r9 + ymm31 + 256]

-// CHECK: vgatherdps ymm21 {k1}, ymmword ptr [rcx + 4*ymm31 + 1024]
+// CHECK: vgatherdps ymm21 {k1}, dword ptr [rcx + 4*ymm31 + 1024]
 // CHECK: encoding: [0x62,0xa2,0x7d,0x21,0x92,0xac,0xb9,0x00,0x04,0x00,0x00]
-        vgatherdps ymm21 {k1}, ymmword ptr [rcx + 4*ymm31 + 1024]
+        vgatherdps ymm21 {k1}, dword ptr [rcx + 4*ymm31 + 1024]

-// CHECK: vgatherqpd xmm17 {k1}, xmmword ptr [r14 + 8*xmm31 + 123]
+// CHECK: vgatherqpd xmm17 {k1}, qword ptr [r14 + 8*xmm31 + 123]
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0x93,0x8c,0xfe,0x7b,0x00,0x00,0x00]
-        vgatherqpd xmm17 {k1}, xmmword ptr [r14 + 8*xmm31 + 123]
+        vgatherqpd xmm17 {k1}, qword ptr [r14 + 8*xmm31 + 123]

-// CHECK: vgatherqpd xmm17 {k1}, xmmword ptr [r9 + xmm31 + 256]
+// CHECK: vgatherqpd xmm17 {k1}, qword ptr [r9 + xmm31 + 256]
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0x93,0x4c,0x39,0x20]
-        vgatherqpd xmm17 {k1}, xmmword ptr [r9 + xmm31 + 256]
+        vgatherqpd xmm17 {k1}, qword ptr [r9 + xmm31 + 256]

-// CHECK: vgatherqpd xmm17 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+// CHECK: vgatherqpd xmm17 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
 // CHECK: encoding: [0x62,0xa2,0xfd,0x01,0x93,0x8c,0xb9,0x00,0x04,0x00,0x00]
-        vgatherqpd xmm17 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+        vgatherqpd xmm17 {k1}, qword ptr [rcx + 4*xmm31 + 1024]

-// CHECK: vgatherqpd ymm29 {k1}, ymmword ptr [r14 + 8*ymm31 + 123]
+// CHECK: vgatherqpd ymm29 {k1}, qword ptr [r14 + 8*ymm31 + 123]
 // CHECK: encoding: [0x62,0x02,0xfd,0x21,0x93,0xac,0xfe,0x7b,0x00,0x00,0x00]
-        vgatherqpd ymm29 {k1}, ymmword ptr [r14 + 8*ymm31 + 123]
+        vgatherqpd ymm29 {k1}, qword ptr [r14 + 8*ymm31 + 123]

-// CHECK: vgatherqpd ymm29 {k1}, ymmword ptr [r9 + ymm31 + 256]
+// CHECK: vgatherqpd ymm29 {k1}, qword ptr [r9 + ymm31 + 256]
 // CHECK: encoding: [0x62,0x02,0xfd,0x21,0x93,0x6c,0x39,0x20]
-        vgatherqpd ymm29 {k1}, ymmword ptr [r9 + ymm31 + 256]
+        vgatherqpd ymm29 {k1}, qword ptr [r9 + ymm31 + 256]

-// CHECK: vgatherqpd ymm29 {k1}, ymmword ptr [rcx + 4*ymm31 + 1024]
+// CHECK: vgatherqpd ymm29 {k1}, qword ptr [rcx + 4*ymm31 + 1024]
 // CHECK: encoding: [0x62,0x22,0xfd,0x21,0x93,0xac,0xb9,0x00,0x04,0x00,0x00]
-        vgatherqpd ymm29 {k1}, ymmword ptr [rcx + 4*ymm31 + 1024]
+        vgatherqpd ymm29 {k1}, qword ptr [rcx + 4*ymm31 + 1024]

-// CHECK: vgatherqpd xmm18 {k1}, xmmword ptr [r14 + 8*xmm31 - 123]
+// CHECK: vgatherqpd xmm18 {k1}, qword ptr [r14 + 8*xmm31 - 123]
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0x93,0x94,0xfe,0x85,0xff,0xff,0xff]
-        vgatherqpd xmm18 {k1}, xmmword ptr [r14 + 8*xmm31 - 123]
+        vgatherqpd xmm18 {k1}, qword ptr [r14 + 8*xmm31 - 123]

-// CHECK: vgatherqpd xmm18 {k1}, xmmword ptr [r9 + xmm31 + 256]
+// CHECK: vgatherqpd xmm18 {k1}, qword ptr [r9 + xmm31 + 256]
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0x93,0x54,0x39,0x20]
-        vgatherqpd xmm18 {k1}, xmmword ptr [r9 + xmm31 + 256]
+        vgatherqpd xmm18 {k1}, qword ptr [r9 + xmm31 + 256]

-// CHECK: vgatherqpd xmm18 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+// CHECK: vgatherqpd xmm18 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
 // CHECK: encoding: [0x62,0xa2,0xfd,0x01,0x93,0x94,0xb9,0x00,0x04,0x00,0x00]
-        vgatherqpd xmm18 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+        vgatherqpd xmm18 {k1}, qword ptr [rcx + 4*xmm31 + 1024]

-// CHECK: vgatherqpd ymm21 {k1}, ymmword ptr [r14 + 8*ymm31 - 123]
+// CHECK: vgatherqpd ymm21 {k1}, qword ptr [r14 + 8*ymm31 - 123]
 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0x93,0xac,0xfe,0x85,0xff,0xff,0xff]
-        vgatherqpd ymm21 {k1}, ymmword ptr [r14 + 8*ymm31 - 123]
+        vgatherqpd ymm21 {k1}, qword ptr [r14 + 8*ymm31 - 123]

-// CHECK: vgatherqpd ymm21 {k1}, ymmword ptr [r9 + ymm31 + 256]
+// CHECK: vgatherqpd ymm21 {k1}, qword ptr [r9 + ymm31 + 256]
 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0x93,0x6c,0x39,0x20]
-        vgatherqpd ymm21 {k1}, ymmword ptr [r9 + ymm31 + 256]
+        vgatherqpd ymm21 {k1}, qword ptr [r9 + ymm31 + 256]

-// CHECK: vgatherqpd ymm21 {k1}, ymmword ptr [rcx + 4*ymm31 + 1024]
+// CHECK: vgatherqpd ymm21 {k1}, qword ptr [rcx + 4*ymm31 + 1024]
 // CHECK: encoding: [0x62,0xa2,0xfd,0x21,0x93,0xac,0xb9,0x00,0x04,0x00,0x00]
-        vgatherqpd ymm21 {k1}, ymmword ptr [rcx + 4*ymm31 + 1024]
+        vgatherqpd ymm21 {k1}, qword ptr [rcx + 4*ymm31 + 1024]

-// CHECK: vgatherqps xmm21 {k1}, qword ptr [r14 + 8*xmm31 + 123]
+// CHECK: vgatherqps xmm21 {k1}, dword ptr [r14 + 8*xmm31 + 123]
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0x93,0xac,0xfe,0x7b,0x00,0x00,0x00]
-        vgatherqps xmm21 {k1}, qword ptr [r14 + 8*xmm31 + 123]
+        vgatherqps xmm21 {k1}, dword ptr [r14 + 8*xmm31 + 123]

-// CHECK: vgatherqps xmm21 {k1}, qword ptr [r9 + xmm31 + 256]
+// CHECK: vgatherqps xmm21 {k1}, dword ptr [r9 + xmm31 + 256]
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0x93,0x6c,0x39,0x40]
-        vgatherqps xmm21 {k1}, qword ptr [r9 + xmm31 + 256]
+        vgatherqps xmm21 {k1}, dword ptr [r9 + xmm31 + 256]

-// CHECK: vgatherqps xmm21 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
+// CHECK: vgatherqps xmm21 {k1}, dword ptr [rcx + 4*xmm31 + 1024]
 // CHECK: encoding: [0x62,0xa2,0x7d,0x01,0x93,0xac,0xb9,0x00,0x04,0x00,0x00]
-        vgatherqps xmm21 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
+        vgatherqps xmm21 {k1}, dword ptr [rcx + 4*xmm31 + 1024]

-// CHECK: vgatherqps xmm19 {k1}, xmmword ptr [r14 + 8*ymm31 + 123]
+// CHECK: vgatherqps xmm19 {k1}, dword ptr [r14 + 8*ymm31 + 123]
 // CHECK: encoding: [0x62,0x82,0x7d,0x21,0x93,0x9c,0xfe,0x7b,0x00,0x00,0x00]
-        vgatherqps xmm19 {k1}, xmmword ptr [r14 + 8*ymm31 + 123]
+        vgatherqps xmm19 {k1}, dword ptr [r14 + 8*ymm31 + 123]

-// CHECK: vgatherqps xmm19 {k1}, xmmword ptr [r9 + ymm31 + 256]
+// CHECK: vgatherqps xmm19 {k1}, dword ptr [r9 + ymm31 + 256]
 // CHECK: encoding: [0x62,0x82,0x7d,0x21,0x93,0x5c,0x39,0x40]
-        vgatherqps xmm19 {k1}, xmmword ptr [r9 + ymm31 + 256]
+        vgatherqps xmm19 {k1}, dword ptr [r9 + ymm31 + 256]

-// CHECK: vgatherqps xmm19 {k1}, xmmword ptr [rcx + 4*ymm31 + 1024]
+// CHECK: vgatherqps xmm19 {k1}, dword ptr [rcx + 4*ymm31 + 1024]
 // CHECK: encoding: [0x62,0xa2,0x7d,0x21,0x93,0x9c,0xb9,0x00,0x04,0x00,0x00]
-        vgatherqps xmm19 {k1}, xmmword ptr [rcx + 4*ymm31 + 1024]
+        vgatherqps xmm19 {k1}, dword ptr [rcx + 4*ymm31 + 1024]

-// CHECK: vgatherqps xmm22 {k1}, qword ptr [r14 + 8*xmm31 - 123]
+// CHECK: vgatherqps xmm22 {k1}, dword ptr [r14 + 8*xmm31 - 123]
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0x93,0xb4,0xfe,0x85,0xff,0xff,0xff]
-        vgatherqps xmm22 {k1}, qword ptr [r14 + 8*xmm31 - 123]
+        vgatherqps xmm22 {k1}, dword ptr [r14 + 8*xmm31 - 123]

-// CHECK: vgatherqps xmm22 {k1}, qword ptr [r9 + xmm31 + 256]
+// CHECK: vgatherqps xmm22 {k1}, dword ptr [r9 + xmm31 + 256]
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0x93,0x74,0x39,0x40]
-        vgatherqps xmm22 {k1}, qword ptr [r9 + xmm31 + 256]
+        vgatherqps xmm22 {k1}, dword ptr [r9 + xmm31 + 256]

-// CHECK: vgatherqps xmm22 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
+// CHECK: vgatherqps xmm22 {k1}, dword ptr [rcx + 4*xmm31 + 1024]
 // CHECK: encoding: [0x62,0xa2,0x7d,0x01,0x93,0xb4,0xb9,0x00,0x04,0x00,0x00]
-        vgatherqps xmm22 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
+        vgatherqps xmm22 {k1}, dword ptr [rcx + 4*xmm31 + 1024]

-// CHECK: vgatherqps xmm30 {k1}, xmmword ptr [r14 + 8*ymm31 - 123]
+// CHECK: vgatherqps xmm30 {k1}, dword ptr [r14 + 8*ymm31 - 123]
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0x93,0xb4,0xfe,0x85,0xff,0xff,0xff]
-        vgatherqps xmm30 {k1}, xmmword ptr [r14 + 8*ymm31 - 123]
+        vgatherqps xmm30 {k1}, dword ptr [r14 + 8*ymm31 - 123]

-// CHECK: vgatherqps xmm30 {k1}, xmmword ptr [r9 + ymm31 + 256]
+// CHECK: vgatherqps xmm30 {k1}, dword ptr [r9 + ymm31 + 256]
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0x93,0x74,0x39,0x40]
-        vgatherqps xmm30 {k1}, xmmword ptr [r9 + ymm31 + 256]
+        vgatherqps xmm30 {k1}, dword ptr [r9 + ymm31 + 256]

-// CHECK: vgatherqps xmm30 {k1}, xmmword ptr [rcx + 4*ymm31 + 1024]
+// CHECK: vgatherqps xmm30 {k1}, dword ptr [rcx + 4*ymm31 + 1024]
 // CHECK: encoding: [0x62,0x22,0x7d,0x21,0x93,0xb4,0xb9,0x00,0x04,0x00,0x00]
-        vgatherqps xmm30 {k1}, xmmword ptr [rcx + 4*ymm31 + 1024]
+        vgatherqps xmm30 {k1}, dword ptr [rcx + 4*ymm31 + 1024]

-// CHECK: vpgatherdd xmm17 {k1}, xmmword ptr [r14 + 8*xmm31 + 123]
+// CHECK: vpgatherdd xmm17 {k1}, dword ptr [r14 + 8*xmm31 + 123]
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0x90,0x8c,0xfe,0x7b,0x00,0x00,0x00]
-        vpgatherdd xmm17 {k1}, xmmword ptr [r14 + 8*xmm31 + 123]
+        vpgatherdd xmm17 {k1}, dword ptr [r14 + 8*xmm31 + 123]

-// CHECK: vpgatherdd xmm17 {k1}, xmmword ptr [r9 + xmm31 + 256]
+// CHECK: vpgatherdd xmm17 {k1}, dword ptr [r9 + xmm31 + 256]
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0x90,0x4c,0x39,0x40]
-        vpgatherdd xmm17 {k1}, xmmword ptr [r9 + xmm31 + 256]
+        vpgatherdd xmm17 {k1}, dword ptr [r9 + xmm31 + 256]

-// CHECK: vpgatherdd xmm17 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+// CHECK: vpgatherdd xmm17 {k1}, dword ptr [rcx + 4*xmm31 + 1024]
 // CHECK: encoding: [0x62,0xa2,0x7d,0x01,0x90,0x8c,0xb9,0x00,0x04,0x00,0x00]
-        vpgatherdd xmm17 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+        vpgatherdd xmm17 {k1}, dword ptr [rcx + 4*xmm31 + 1024]

-// CHECK: vpgatherdd ymm19 {k1}, ymmword ptr [r14 + 8*ymm31 + 123]
+// CHECK: vpgatherdd ymm19 {k1}, dword ptr [r14 + 8*ymm31 + 123]
 // CHECK: encoding: [0x62,0x82,0x7d,0x21,0x90,0x9c,0xfe,0x7b,0x00,0x00,0x00]
-        vpgatherdd ymm19 {k1}, ymmword ptr [r14 + 8*ymm31 + 123]
+        vpgatherdd ymm19 {k1}, dword ptr [r14 + 8*ymm31 + 123]

-// CHECK: vpgatherdd ymm19 {k1}, ymmword ptr [r9 + ymm31 + 256]
+// CHECK: vpgatherdd ymm19 {k1}, dword ptr [r9 + ymm31 + 256]
 // CHECK: encoding: [0x62,0x82,0x7d,0x21,0x90,0x5c,0x39,0x40]
-        vpgatherdd ymm19 {k1}, ymmword ptr [r9 + ymm31 + 256]
+        vpgatherdd ymm19 {k1}, dword ptr [r9 + ymm31 + 256]

-// CHECK: vpgatherdd ymm19 {k1}, ymmword ptr [rcx + 4*ymm31 + 1024]
+// CHECK: vpgatherdd ymm19 {k1}, dword ptr [rcx + 4*ymm31 + 1024]
 // CHECK: encoding: [0x62,0xa2,0x7d,0x21,0x90,0x9c,0xb9,0x00,0x04,0x00,0x00]
-        vpgatherdd ymm19 {k1}, ymmword ptr [rcx + 4*ymm31 + 1024]
+        vpgatherdd ymm19 {k1}, dword ptr [rcx + 4*ymm31 + 1024]

-// CHECK: vpgatherdd xmm22 {k1}, xmmword ptr [r14 + 8*xmm31 - 123]
+// CHECK: vpgatherdd xmm22 {k1}, dword ptr [r14 + 8*xmm31 - 123]
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0x90,0xb4,0xfe,0x85,0xff,0xff,0xff]
-        vpgatherdd xmm22 {k1}, xmmword ptr [r14 + 8*xmm31 - 123]
+        vpgatherdd xmm22 {k1}, dword ptr [r14 + 8*xmm31 - 123]

-// CHECK: vpgatherdd xmm22 {k1}, xmmword ptr [r9 + xmm31 + 256]
+// CHECK: vpgatherdd xmm22 {k1}, dword ptr [r9 + xmm31 + 256]
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0x90,0x74,0x39,0x40]
-        vpgatherdd xmm22 {k1}, xmmword ptr [r9 + xmm31 + 256]
+        vpgatherdd xmm22 {k1}, dword ptr [r9 + xmm31 + 256]

-// CHECK: vpgatherdd xmm22 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+// CHECK: vpgatherdd xmm22 {k1}, dword ptr [rcx + 4*xmm31 + 1024]
 // CHECK: encoding: [0x62,0xa2,0x7d,0x01,0x90,0xb4,0xb9,0x00,0x04,0x00,0x00]
-        vpgatherdd xmm22 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+        vpgatherdd xmm22 {k1}, dword ptr [rcx + 4*xmm31 + 1024]

-// CHECK: vpgatherdd ymm29 {k1}, ymmword ptr [r14 + 8*ymm31 - 123]
+// CHECK: vpgatherdd ymm29 {k1}, dword ptr [r14 + 8*ymm31 - 123]
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0x90,0xac,0xfe,0x85,0xff,0xff,0xff]
-        vpgatherdd ymm29 {k1}, ymmword ptr [r14 + 8*ymm31 - 123]
+        vpgatherdd ymm29 {k1}, dword ptr [r14 + 8*ymm31 - 123]

-// CHECK: vpgatherdd ymm29 {k1}, ymmword ptr [r9 + ymm31 + 256]
+// CHECK: vpgatherdd ymm29 {k1}, dword ptr [r9 + ymm31 + 256]
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0x90,0x6c,0x39,0x40]
-        vpgatherdd ymm29 {k1}, ymmword ptr [r9 + ymm31 + 256]
+        vpgatherdd ymm29 {k1}, dword ptr [r9 + ymm31 + 256]

-// CHECK: vpgatherdd ymm29 {k1}, ymmword ptr [rcx + 4*ymm31 + 1024]
+// CHECK: vpgatherdd ymm29 {k1}, dword ptr [rcx + 4*ymm31 + 1024]
 // CHECK: encoding: [0x62,0x22,0x7d,0x21,0x90,0xac,0xb9,0x00,0x04,0x00,0x00]
-        vpgatherdd ymm29 {k1}, ymmword ptr [rcx + 4*ymm31 + 1024]
+        vpgatherdd ymm29 {k1}, dword ptr [rcx + 4*ymm31 + 1024]

-// CHECK: vpgatherdq xmm17 {k1}, xmmword ptr [r14 + 8*xmm31 + 123]
+// CHECK: vpgatherdq xmm17 {k1}, qword ptr [r14 + 8*xmm31 + 123]
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0x90,0x8c,0xfe,0x7b,0x00,0x00,0x00]
-        vpgatherdq xmm17 {k1}, xmmword ptr [r14 + 8*xmm31 + 123]
+        vpgatherdq xmm17 {k1}, qword ptr [r14 + 8*xmm31 + 123]

-// CHECK: vpgatherdq xmm17 {k1}, xmmword ptr [r9 + xmm31 + 256]
+// CHECK: vpgatherdq xmm17 {k1}, qword ptr [r9 + xmm31 + 256]
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0x90,0x4c,0x39,0x20]
-        vpgatherdq xmm17 {k1}, xmmword ptr [r9 + xmm31 + 256]
+        vpgatherdq xmm17 {k1}, qword ptr [r9 + xmm31 + 256]

-// CHECK: vpgatherdq xmm17 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+// CHECK: vpgatherdq xmm17 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
 // CHECK: encoding: [0x62,0xa2,0xfd,0x01,0x90,0x8c,0xb9,0x00,0x04,0x00,0x00]
-        vpgatherdq xmm17 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+        vpgatherdq xmm17 {k1}, qword ptr [rcx + 4*xmm31 + 1024]

-// CHECK: vpgatherdq ymm26 {k1}, ymmword ptr [r14 + 8*xmm31 + 123]
+// CHECK: vpgatherdq ymm26 {k1}, qword ptr [r14 + 8*xmm31 + 123]
 // CHECK: encoding: [0x62,0x02,0xfd,0x21,0x90,0x94,0xfe,0x7b,0x00,0x00,0x00]
-        vpgatherdq ymm26 {k1}, ymmword ptr [r14 + 8*xmm31 + 123]
+        vpgatherdq ymm26 {k1}, qword ptr [r14 + 8*xmm31 + 123]

-// CHECK: vpgatherdq ymm26 {k1}, ymmword ptr [r9 + xmm31 + 256]
+// CHECK: vpgatherdq ymm26 {k1}, qword ptr [r9 + xmm31 + 256]
 // CHECK: encoding: [0x62,0x02,0xfd,0x21,0x90,0x54,0x39,0x20]
-        vpgatherdq ymm26 {k1}, ymmword ptr [r9 + xmm31 + 256]
+        vpgatherdq ymm26 {k1}, qword ptr [r9 + xmm31 + 256]

-// CHECK: vpgatherdq ymm26 {k1}, ymmword ptr [rcx + 4*xmm31 + 1024]
+// CHECK: vpgatherdq ymm26 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
 // CHECK: encoding: [0x62,0x22,0xfd,0x21,0x90,0x94,0xb9,0x00,0x04,0x00,0x00]
-        vpgatherdq ymm26 {k1}, ymmword ptr [rcx + 4*xmm31 + 1024]
+        vpgatherdq ymm26 {k1}, qword ptr [rcx + 4*xmm31 + 1024]

-// CHECK: vpgatherdq xmm25 {k1}, xmmword ptr [r14 + 8*xmm31 - 123]
+// CHECK: vpgatherdq xmm25 {k1}, qword ptr [r14 + 8*xmm31 - 123]
 // CHECK: encoding: [0x62,0x02,0xfd,0x01,0x90,0x8c,0xfe,0x85,0xff,0xff,0xff]
-        vpgatherdq xmm25 {k1}, xmmword ptr [r14 + 8*xmm31 - 123]
+        vpgatherdq xmm25 {k1}, qword ptr [r14 + 8*xmm31 - 123]

-// CHECK: vpgatherdq xmm25 {k1}, xmmword ptr [r9 + xmm31 + 256]
+// CHECK: vpgatherdq xmm25 {k1}, qword ptr [r9 + xmm31 + 256]
 // CHECK: encoding: [0x62,0x02,0xfd,0x01,0x90,0x4c,0x39,0x20]
-        vpgatherdq xmm25 {k1}, xmmword ptr [r9 + xmm31 + 256]
+        vpgatherdq xmm25 {k1}, qword ptr [r9 + xmm31 + 256]

-// CHECK: vpgatherdq xmm25 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+// CHECK: vpgatherdq xmm25 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
 // CHECK: encoding: [0x62,0x22,0xfd,0x01,0x90,0x8c,0xb9,0x00,0x04,0x00,0x00]
-        vpgatherdq xmm25 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+        vpgatherdq xmm25 {k1}, qword ptr [rcx + 4*xmm31 + 1024]

-// CHECK: vpgatherdq ymm22 {k1}, ymmword ptr [r14 + 8*xmm31 - 123]
+// CHECK: vpgatherdq ymm22 {k1}, qword ptr [r14 + 8*xmm31 - 123]
 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0x90,0xb4,0xfe,0x85,0xff,0xff,0xff]
-        vpgatherdq ymm22 {k1}, ymmword ptr [r14 + 8*xmm31 - 123]
+        vpgatherdq ymm22 {k1}, qword ptr [r14 + 8*xmm31 - 123]

-// CHECK: vpgatherdq ymm22 {k1}, ymmword ptr [r9 + xmm31 + 256]
+// CHECK: vpgatherdq ymm22 {k1}, qword ptr [r9 + xmm31 + 256]
 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0x90,0x74,0x39,0x20]
-        vpgatherdq ymm22 {k1}, ymmword ptr [r9 + xmm31 + 256]
+        vpgatherdq ymm22 {k1}, qword ptr [r9 + xmm31 + 256]

-// CHECK: vpgatherdq ymm22 {k1}, ymmword ptr [rcx + 4*xmm31 + 1024]
+// CHECK: vpgatherdq ymm22 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
 // CHECK: encoding: [0x62,0xa2,0xfd,0x21,0x90,0xb4,0xb9,0x00,0x04,0x00,0x00]
-        vpgatherdq ymm22 {k1}, ymmword ptr [rcx + 4*xmm31 + 1024]
+        vpgatherdq ymm22 {k1}, qword ptr [rcx + 4*xmm31 + 1024]

-// CHECK: vpgatherqd xmm21 {k1}, qword ptr [r14 + 8*xmm31 + 123]
+// CHECK: vpgatherqd xmm21 {k1}, dword ptr [r14 + 8*xmm31 + 123]
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0x91,0xac,0xfe,0x7b,0x00,0x00,0x00]
-        vpgatherqd xmm21 {k1}, qword ptr [r14 + 8*xmm31 + 123]
+        vpgatherqd xmm21 {k1}, dword ptr [r14 + 8*xmm31 + 123]

-// CHECK: vpgatherqd xmm21 {k1}, qword ptr [r9 + xmm31 + 256]
+// CHECK: vpgatherqd xmm21 {k1}, dword ptr [r9 + xmm31 + 256]
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0x91,0x6c,0x39,0x40]
-        vpgatherqd xmm21 {k1}, qword ptr [r9 + xmm31 + 256]
+        vpgatherqd xmm21 {k1}, dword ptr [r9 + xmm31 + 256]

-// CHECK: vpgatherqd xmm21 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
+// CHECK: vpgatherqd xmm21 {k1}, dword ptr [rcx + 4*xmm31 + 1024]
 // CHECK: encoding: [0x62,0xa2,0x7d,0x01,0x91,0xac,0xb9,0x00,0x04,0x00,0x00]
-        vpgatherqd xmm21 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
+        vpgatherqd xmm21 {k1}, dword ptr [rcx + 4*xmm31 + 1024]

-// CHECK: vpgatherqd xmm25 {k1}, xmmword ptr [r14 + 8*ymm31 + 123]
+// CHECK: vpgatherqd xmm25 {k1}, dword ptr [r14 + 8*ymm31 + 123]
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0x91,0x8c,0xfe,0x7b,0x00,0x00,0x00]
-        vpgatherqd xmm25 {k1}, xmmword ptr [r14 + 8*ymm31 + 123]
+        vpgatherqd xmm25 {k1}, dword ptr [r14 + 8*ymm31 + 123]

-// CHECK: vpgatherqd xmm25 {k1}, xmmword ptr [r9 + ymm31 + 256]
+// CHECK: vpgatherqd xmm25 {k1}, dword ptr [r9 + ymm31 + 256]
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0x91,0x4c,0x39,0x40]
-        vpgatherqd xmm25 {k1}, xmmword ptr [r9 + ymm31 + 256]
+        vpgatherqd xmm25 {k1}, dword ptr [r9 + ymm31 + 256]

-// CHECK: vpgatherqd xmm25 {k1}, xmmword ptr [rcx + 4*ymm31 + 1024]
+// CHECK: vpgatherqd xmm25 {k1}, dword ptr [rcx + 4*ymm31 + 1024]
 // CHECK: encoding: [0x62,0x22,0x7d,0x21,0x91,0x8c,0xb9,0x00,0x04,0x00,0x00]
-        vpgatherqd xmm25 {k1}, xmmword ptr [rcx + 4*ymm31 + 1024]
+        vpgatherqd xmm25 {k1}, dword ptr [rcx + 4*ymm31 + 1024]

-// CHECK: vpgatherqd xmm30 {k1}, qword ptr [r14 + 8*xmm31 - 123]
+// CHECK: vpgatherqd xmm30 {k1}, dword ptr [r14 + 8*xmm31 - 123]
 // CHECK: encoding: [0x62,0x02,0x7d,0x01,0x91,0xb4,0xfe,0x85,0xff,0xff,0xff]
-        vpgatherqd xmm30 {k1}, qword ptr [r14 + 8*xmm31 - 123]
+        vpgatherqd xmm30 {k1}, dword ptr [r14 + 8*xmm31 - 123]

-// CHECK: vpgatherqd xmm30 {k1}, qword ptr [r9 + xmm31 + 256]
+// CHECK: vpgatherqd xmm30 {k1}, dword ptr [r9 + xmm31 + 256]
 // CHECK: encoding: [0x62,0x02,0x7d,0x01,0x91,0x74,0x39,0x40]
-        vpgatherqd xmm30 {k1}, qword ptr [r9 + xmm31 + 256]
+        vpgatherqd xmm30 {k1}, dword ptr [r9 + xmm31 + 256]

-// CHECK: vpgatherqd xmm30 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
+// CHECK: vpgatherqd xmm30 {k1}, dword ptr [rcx + 4*xmm31 + 1024]
 // CHECK: encoding: [0x62,0x22,0x7d,0x01,0x91,0xb4,0xb9,0x00,0x04,0x00,0x00]
-        vpgatherqd xmm30 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
+        vpgatherqd xmm30 {k1}, dword ptr [rcx + 4*xmm31 + 1024]

-// CHECK: vpgatherqd xmm28 {k1}, xmmword ptr [r14 + 8*ymm31 - 123]
+// CHECK: vpgatherqd xmm28 {k1}, dword ptr [r14 + 8*ymm31 - 123]
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0x91,0xa4,0xfe,0x85,0xff,0xff,0xff]
-        vpgatherqd xmm28 {k1}, xmmword ptr [r14 + 8*ymm31 - 123]
+        vpgatherqd xmm28 {k1}, dword ptr [r14 + 8*ymm31 - 123]

-// CHECK: vpgatherqd xmm28 {k1}, xmmword ptr [r9 + ymm31 + 256]
+// CHECK: vpgatherqd xmm28 {k1}, dword ptr [r9 + ymm31 + 256]
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0x91,0x64,0x39,0x40]
-        vpgatherqd xmm28 {k1}, xmmword ptr [r9 + ymm31 + 256]
+        vpgatherqd xmm28 {k1}, dword ptr [r9 + ymm31 + 256]

-// CHECK: vpgatherqd xmm28 {k1}, xmmword ptr [rcx + 4*ymm31 + 1024]
+// CHECK: vpgatherqd xmm28 {k1}, dword ptr [rcx + 4*ymm31 + 1024]
 // CHECK: encoding: [0x62,0x22,0x7d,0x21,0x91,0xa4,0xb9,0x00,0x04,0x00,0x00]
-        vpgatherqd xmm28 {k1}, xmmword ptr [rcx + 4*ymm31 + 1024]
+        vpgatherqd xmm28 {k1}, dword ptr [rcx + 4*ymm31 + 1024]

-// CHECK: vpgatherqq xmm18 {k1}, xmmword ptr [r14 + 8*xmm31 + 123]
+// CHECK: vpgatherqq xmm18 {k1}, qword ptr [r14 + 8*xmm31 + 123]
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0x91,0x94,0xfe,0x7b,0x00,0x00,0x00]
-        vpgatherqq xmm18 {k1}, xmmword ptr [r14 + 8*xmm31 + 123]
+        vpgatherqq xmm18 {k1}, qword ptr [r14 + 8*xmm31 + 123]

-// CHECK: vpgatherqq xmm18 {k1}, xmmword ptr [r9 + xmm31 + 256]
+// CHECK: vpgatherqq xmm18 {k1}, qword ptr [r9 + xmm31 + 256]
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0x91,0x54,0x39,0x20]
-        vpgatherqq xmm18 {k1}, xmmword ptr [r9 + xmm31 + 256]
+        vpgatherqq xmm18 {k1}, qword ptr [r9 + xmm31 + 256]

-// CHECK: vpgatherqq xmm18 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+// CHECK: vpgatherqq xmm18 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
 // CHECK: encoding: [0x62,0xa2,0xfd,0x01,0x91,0x94,0xb9,0x00,0x04,0x00,0x00]
-        vpgatherqq xmm18 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+        vpgatherqq xmm18 {k1}, qword ptr [rcx + 4*xmm31 + 1024]

-// CHECK: vpgatherqq ymm19 {k1}, ymmword ptr [r14 + 8*ymm31 + 123]
+// CHECK: vpgatherqq ymm19 {k1}, qword ptr [r14 + 8*ymm31 + 123]
 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0x91,0x9c,0xfe,0x7b,0x00,0x00,0x00]
-        vpgatherqq ymm19 {k1}, ymmword ptr [r14 + 8*ymm31 + 123]
+        vpgatherqq ymm19 {k1}, qword ptr [r14 + 8*ymm31 + 123]

-// CHECK: vpgatherqq ymm19 {k1}, ymmword ptr [r9 + ymm31 + 256]
+// CHECK: vpgatherqq ymm19 {k1}, qword ptr [r9 + ymm31 + 256]
 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0x91,0x5c,0x39,0x20]
-        vpgatherqq ymm19 {k1}, ymmword ptr [r9 + ymm31 + 256]
+        vpgatherqq ymm19 {k1}, qword ptr [r9 + ymm31 + 256]

-// CHECK: vpgatherqq ymm19 {k1}, ymmword ptr [rcx + 4*ymm31 + 1024]
+// CHECK: vpgatherqq ymm19 {k1}, qword ptr [rcx + 4*ymm31 + 1024]
 // CHECK: encoding: [0x62,0xa2,0xfd,0x21,0x91,0x9c,0xb9,0x00,0x04,0x00,0x00]
-        vpgatherqq ymm19 {k1}, ymmword ptr [rcx + 4*ymm31 + 1024]
+        vpgatherqq ymm19 {k1}, qword ptr [rcx + 4*ymm31 + 1024]

-// CHECK: vpgatherqq xmm23 {k1}, xmmword ptr [r14 + 8*xmm31 - 123]
+// CHECK: vpgatherqq xmm23 {k1}, qword ptr [r14 + 8*xmm31 - 123]
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0x91,0xbc,0xfe,0x85,0xff,0xff,0xff]
-        vpgatherqq xmm23 {k1}, xmmword ptr [r14 + 8*xmm31 - 123]
+        vpgatherqq xmm23 {k1}, qword ptr [r14 + 8*xmm31 - 123]

-// CHECK: vpgatherqq xmm23 {k1}, xmmword ptr [r9 + xmm31 + 256]
+// CHECK: vpgatherqq xmm23 {k1}, qword ptr [r9 + xmm31 + 256]
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0x91,0x7c,0x39,0x20]
-        vpgatherqq xmm23 {k1}, xmmword ptr [r9 + xmm31 + 256]
+        vpgatherqq xmm23 {k1}, qword ptr [r9 + xmm31 + 256]

-// CHECK: vpgatherqq xmm23 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+// CHECK: vpgatherqq xmm23 {k1}, qword ptr [rcx + 4*xmm31 + 1024]
 // CHECK: encoding: [0x62,0xa2,0xfd,0x01,0x91,0xbc,0xb9,0x00,0x04,0x00,0x00]
-        vpgatherqq xmm23 {k1}, xmmword ptr [rcx + 4*xmm31 + 1024]
+        vpgatherqq xmm23 {k1}, qword ptr [rcx + 4*xmm31 + 1024]

-// CHECK: vpgatherqq ymm26 {k1}, ymmword ptr [r14 + 8*ymm31 - 123]
+// CHECK: vpgatherqq ymm26 {k1}, qword ptr [r14 + 8*ymm31 - 123]
 // CHECK: encoding: [0x62,0x02,0xfd,0x21,0x91,0x94,0xfe,0x85,0xff,0xff,0xff]
-        vpgatherqq ymm26 {k1}, ymmword ptr [r14 + 8*ymm31 - 123]
+        vpgatherqq ymm26 {k1}, qword ptr [r14 + 8*ymm31 - 123]

-// CHECK: vpgatherqq ymm26 {k1}, ymmword ptr [r9 + ymm31 + 256]
+// CHECK: vpgatherqq ymm26 {k1}, qword ptr [r9 + ymm31 + 256]
 // CHECK: encoding: [0x62,0x02,0xfd,0x21,0x91,0x54,0x39,0x20]
-        vpgatherqq ymm26 {k1}, ymmword ptr [r9 + ymm31 + 256]
+        vpgatherqq ymm26 {k1}, qword ptr [r9 + ymm31 + 256]

-// CHECK: vpgatherqq ymm26 {k1}, ymmword ptr [rcx + 4*ymm31 + 1024]
+// CHECK: vpgatherqq ymm26 {k1}, qword ptr [rcx + 4*ymm31 + 1024]
 // CHECK: encoding: [0x62,0x22,0xfd,0x21,0x91,0x94,0xb9,0x00,0x04,0x00,0x00]
-        vpgatherqq ymm26 {k1}, ymmword ptr [rcx + 4*ymm31 + 1024]
+        vpgatherqq ymm26 {k1}, qword ptr [rcx + 4*ymm31 + 1024]

-// CHECK: vpscatterdd xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm20
+// CHECK: vpscatterdd dword ptr [r14 + 8*xmm31 + 123] {k1}, xmm20
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0xa0,0xa4,0xfe,0x7b,0x00,0x00,0x00]
-        vpscatterdd xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm20
+        vpscatterdd dword ptr [r14 + 8*xmm31 + 123] {k1}, xmm20

-// CHECK: vpscatterdd xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm20
+// CHECK: vpscatterdd dword ptr [r14 + 8*xmm31 + 123] {k1}, xmm20
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0xa0,0xa4,0xfe,0x7b,0x00,0x00,0x00]
-        vpscatterdd xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm20
+        vpscatterdd dword ptr [r14 + 8*xmm31 + 123] {k1}, xmm20

-// CHECK: vpscatterdd xmmword ptr [r9 + xmm31 + 256] {k1}, xmm20
+// CHECK: vpscatterdd dword ptr [r9 + xmm31 + 256] {k1}, xmm20
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0xa0,0x64,0x39,0x40]
-        vpscatterdd xmmword ptr [r9 + xmm31 + 256] {k1}, xmm20
+        vpscatterdd dword ptr [r9 + xmm31 + 256] {k1}, xmm20

-// CHECK: vpscatterdd xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm20
+// CHECK: vpscatterdd dword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm20
 // CHECK: encoding: [0x62,0xa2,0x7d,0x01,0xa0,0xa4,0xb9,0x00,0x04,0x00,0x00]
-        vpscatterdd xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm20
+        vpscatterdd dword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm20

-// CHECK: vpscatterdd ymmword ptr [r14 + 8*ymm31 + 123] {k1}, ymm28
+// CHECK: vpscatterdd dword ptr [r14 + 8*ymm31 + 123] {k1}, ymm28
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0xa0,0xa4,0xfe,0x7b,0x00,0x00,0x00]
-        vpscatterdd ymmword ptr [r14 + 8*ymm31 + 123] {k1}, ymm28
+        vpscatterdd dword ptr [r14 + 8*ymm31 + 123] {k1}, ymm28

-// CHECK: vpscatterdd ymmword ptr [r14 + 8*ymm31 + 123] {k1}, ymm28
+// CHECK: vpscatterdd dword ptr [r14 + 8*ymm31 + 123] {k1}, ymm28
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0xa0,0xa4,0xfe,0x7b,0x00,0x00,0x00]
-        vpscatterdd ymmword ptr [r14 + 8*ymm31 + 123] {k1}, ymm28
+        vpscatterdd dword ptr [r14 + 8*ymm31 + 123] {k1}, ymm28

-// CHECK: vpscatterdd ymmword ptr [r9 + ymm31 + 256] {k1}, ymm28
+// CHECK: vpscatterdd dword ptr [r9 + ymm31 + 256] {k1}, ymm28
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0xa0,0x64,0x39,0x40]
-        vpscatterdd ymmword ptr [r9 + ymm31 + 256] {k1}, ymm28
+        vpscatterdd dword ptr [r9 + ymm31 + 256] {k1}, ymm28

-// CHECK: vpscatterdd ymmword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm28
+// CHECK: vpscatterdd dword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm28
 // CHECK: encoding: [0x62,0x22,0x7d,0x21,0xa0,0xa4,0xb9,0x00,0x04,0x00,0x00]
-        vpscatterdd ymmword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm28
+        vpscatterdd dword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm28

-// CHECK: vpscatterdd xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm17
+// CHECK: vpscatterdd dword ptr [r14 + 8*xmm31 - 123] {k1}, xmm17
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0xa0,0x8c,0xfe,0x85,0xff,0xff,0xff]
-        vpscatterdd xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm17
+        vpscatterdd dword ptr [r14 + 8*xmm31 - 123] {k1}, xmm17

-// CHECK: vpscatterdd xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm17
+// CHECK: vpscatterdd dword ptr [r14 + 8*xmm31 - 123] {k1}, xmm17
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0xa0,0x8c,0xfe,0x85,0xff,0xff,0xff]
-        vpscatterdd xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm17
+        vpscatterdd dword ptr [r14 + 8*xmm31 - 123] {k1}, xmm17

-// CHECK: vpscatterdd xmmword ptr [r9 + xmm31 + 256] {k1}, xmm17
+// CHECK: vpscatterdd dword ptr [r9 + xmm31 + 256] {k1}, xmm17
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0xa0,0x4c,0x39,0x40]
-        vpscatterdd xmmword ptr [r9 + xmm31 + 256] {k1}, xmm17
+        vpscatterdd dword ptr [r9 + xmm31 + 256] {k1}, xmm17

-// CHECK: vpscatterdd xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm17
+// CHECK: vpscatterdd dword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm17
 // CHECK: encoding: [0x62,0xa2,0x7d,0x01,0xa0,0x8c,0xb9,0x00,0x04,0x00,0x00]
-        vpscatterdd xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm17
+        vpscatterdd dword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm17

-// CHECK: vpscatterdd ymmword ptr [r14 + 8*ymm31 - 123] {k1}, ymm24
+// CHECK: vpscatterdd dword ptr [r14 + 8*ymm31 - 123] {k1}, ymm24
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0xa0,0x84,0xfe,0x85,0xff,0xff,0xff]
-        vpscatterdd ymmword ptr [r14 + 8*ymm31 - 123] {k1}, ymm24
+        vpscatterdd dword ptr [r14 + 8*ymm31 - 123] {k1}, ymm24

-// CHECK: vpscatterdd ymmword ptr [r14 + 8*ymm31 - 123] {k1}, ymm24
+// CHECK: vpscatterdd dword ptr [r14 + 8*ymm31 - 123] {k1}, ymm24
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0xa0,0x84,0xfe,0x85,0xff,0xff,0xff]
-        vpscatterdd ymmword ptr [r14 + 8*ymm31 - 123] {k1}, ymm24
+        vpscatterdd dword ptr [r14 + 8*ymm31 - 123] {k1}, ymm24

-// CHECK: vpscatterdd ymmword ptr [r9 + ymm31 + 256] {k1}, ymm24
+// CHECK: vpscatterdd dword ptr [r9 + ymm31 + 256] {k1}, ymm24
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0xa0,0x44,0x39,0x40]
-        vpscatterdd ymmword ptr [r9 + ymm31 + 256] {k1}, ymm24
+        vpscatterdd dword ptr [r9 + ymm31 + 256] {k1}, ymm24

-// CHECK: vpscatterdd ymmword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm24
+// CHECK: vpscatterdd dword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm24
 // CHECK: encoding: [0x62,0x22,0x7d,0x21,0xa0,0x84,0xb9,0x00,0x04,0x00,0x00]
-        vpscatterdd ymmword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm24
+        vpscatterdd dword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm24

-// CHECK: vpscatterdq xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm21
+// CHECK: vpscatterdq qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm21
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0xa0,0xac,0xfe,0x7b,0x00,0x00,0x00]
-        vpscatterdq xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm21
+        vpscatterdq qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm21

-// CHECK: vpscatterdq xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm21
+// CHECK: vpscatterdq qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm21
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0xa0,0xac,0xfe,0x7b,0x00,0x00,0x00]
-        vpscatterdq xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm21
+        vpscatterdq qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm21

-// CHECK: vpscatterdq xmmword ptr [r9 + xmm31 + 256] {k1}, xmm21
+// CHECK: vpscatterdq qword ptr [r9 + xmm31 + 256] {k1}, xmm21
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0xa0,0x6c,0x39,0x20]
-        vpscatterdq xmmword ptr [r9 + xmm31 + 256] {k1}, xmm21
+        vpscatterdq qword ptr [r9 + xmm31 + 256] {k1}, xmm21

-// CHECK: vpscatterdq xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm21
+// CHECK: vpscatterdq qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm21
 // CHECK: encoding: [0x62,0xa2,0xfd,0x01,0xa0,0xac,0xb9,0x00,0x04,0x00,0x00]
-        vpscatterdq xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm21
+        vpscatterdq qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm21

-// CHECK: vpscatterdq ymmword ptr [r14 + 8*xmm31 + 123] {k1}, ymm28
+// CHECK: vpscatterdq qword ptr [r14 + 8*xmm31 + 123] {k1}, ymm28
 // CHECK: encoding: [0x62,0x02,0xfd,0x21,0xa0,0xa4,0xfe,0x7b,0x00,0x00,0x00]
-        vpscatterdq ymmword ptr [r14 + 8*xmm31 + 123] {k1}, ymm28
+        vpscatterdq qword ptr [r14 + 8*xmm31 + 123] {k1}, ymm28

-// CHECK: vpscatterdq ymmword ptr [r14 + 8*xmm31 + 123] {k1}, ymm28
+// CHECK: vpscatterdq qword ptr [r14 + 8*xmm31 + 123] {k1}, ymm28
 // CHECK: encoding: [0x62,0x02,0xfd,0x21,0xa0,0xa4,0xfe,0x7b,0x00,0x00,0x00]
-        vpscatterdq ymmword ptr [r14 + 8*xmm31 + 123] {k1}, ymm28
+        vpscatterdq qword ptr [r14 + 8*xmm31 + 123] {k1}, ymm28

-// CHECK: vpscatterdq ymmword ptr [r9 + xmm31 + 256] {k1}, ymm28
+// CHECK: vpscatterdq qword ptr [r9 + xmm31 + 256] {k1}, ymm28
 // CHECK: encoding: [0x62,0x02,0xfd,0x21,0xa0,0x64,0x39,0x20]
-        vpscatterdq ymmword ptr [r9 + xmm31 + 256] {k1}, ymm28
+        vpscatterdq qword ptr [r9 + xmm31 + 256] {k1}, ymm28

-// CHECK: vpscatterdq ymmword ptr [rcx + 4*xmm31 + 1024] {k1}, ymm28
+// CHECK: vpscatterdq qword ptr [rcx + 4*xmm31 + 1024] {k1}, ymm28
 // CHECK: encoding: [0x62,0x22,0xfd,0x21,0xa0,0xa4,0xb9,0x00,0x04,0x00,0x00]
-        vpscatterdq ymmword ptr [rcx + 4*xmm31 + 1024] {k1}, ymm28
+        vpscatterdq qword ptr [rcx + 4*xmm31 + 1024] {k1}, ymm28

-// CHECK: vpscatterdq xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm28
+// CHECK: vpscatterdq qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm28
 // CHECK: encoding: [0x62,0x02,0xfd,0x01,0xa0,0xa4,0xfe,0x85,0xff,0xff,0xff]
-        vpscatterdq xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm28
+        vpscatterdq qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm28

-// CHECK: vpscatterdq xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm28
+// CHECK: vpscatterdq qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm28
 // CHECK: encoding: [0x62,0x02,0xfd,0x01,0xa0,0xa4,0xfe,0x85,0xff,0xff,0xff]
-        vpscatterdq xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm28
+        vpscatterdq qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm28

-// CHECK: vpscatterdq xmmword ptr [r9 + xmm31 + 256] {k1}, xmm28
+// CHECK: vpscatterdq qword ptr [r9 + xmm31 + 256] {k1}, xmm28
 // CHECK: encoding: [0x62,0x02,0xfd,0x01,0xa0,0x64,0x39,0x20]
-        vpscatterdq xmmword ptr [r9 + xmm31 + 256] {k1}, xmm28
+        vpscatterdq qword ptr [r9 + xmm31 + 256] {k1}, xmm28

-// CHECK: vpscatterdq xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm28
+// CHECK: vpscatterdq qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm28
 // CHECK: encoding: [0x62,0x22,0xfd,0x01,0xa0,0xa4,0xb9,0x00,0x04,0x00,0x00]
-        vpscatterdq xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm28
+        vpscatterdq qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm28

-// CHECK: vpscatterdq ymmword ptr [r14 + 8*xmm31 - 123] {k1}, ymm20
+// CHECK: vpscatterdq qword ptr [r14 + 8*xmm31 - 123] {k1}, ymm20
 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0xa0,0xa4,0xfe,0x85,0xff,0xff,0xff]
-        vpscatterdq ymmword ptr [r14 + 8*xmm31 - 123] {k1}, ymm20
+        vpscatterdq qword ptr [r14 + 8*xmm31 - 123] {k1}, ymm20

-// CHECK: vpscatterdq ymmword ptr [r14 + 8*xmm31 - 123] {k1}, ymm20
+// CHECK: vpscatterdq qword ptr [r14 + 8*xmm31 - 123] {k1}, ymm20
 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0xa0,0xa4,0xfe,0x85,0xff,0xff,0xff]
-        vpscatterdq ymmword ptr [r14 + 8*xmm31 - 123] {k1}, ymm20
+        vpscatterdq qword ptr [r14 + 8*xmm31 - 123] {k1}, ymm20

-// CHECK: vpscatterdq ymmword ptr [r9 + xmm31 + 256] {k1}, ymm20
+// CHECK: vpscatterdq qword ptr [r9 + xmm31 + 256] {k1}, ymm20
 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0xa0,0x64,0x39,0x20]
-        vpscatterdq ymmword ptr [r9 + xmm31 + 256] {k1}, ymm20
+        vpscatterdq qword ptr [r9 + xmm31 + 256] {k1}, ymm20

-// CHECK: vpscatterdq ymmword ptr [rcx + 4*xmm31 + 1024] {k1}, ymm20
+// CHECK: vpscatterdq qword ptr [rcx + 4*xmm31 + 1024] {k1}, ymm20
 // CHECK: encoding: [0x62,0xa2,0xfd,0x21,0xa0,0xa4,0xb9,0x00,0x04,0x00,0x00]
-        vpscatterdq ymmword ptr [rcx + 4*xmm31 + 1024] {k1}, ymm20
+        vpscatterdq qword ptr [rcx + 4*xmm31 + 1024] {k1}, ymm20

-// CHECK: vpscatterqd qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm22
+// CHECK: vpscatterqd dword ptr [r14 + 8*xmm31 + 123] {k1}, xmm22
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0xa1,0xb4,0xfe,0x7b,0x00,0x00,0x00]
-        vpscatterqd qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm22
+        vpscatterqd dword ptr [r14 + 8*xmm31 + 123] {k1}, xmm22

-// CHECK: vpscatterqd qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm22
+// CHECK: vpscatterqd dword ptr [r14 + 8*xmm31 + 123] {k1}, xmm22
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0xa1,0xb4,0xfe,0x7b,0x00,0x00,0x00]
-        vpscatterqd qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm22
+        vpscatterqd dword ptr [r14 + 8*xmm31 + 123] {k1}, xmm22

-// CHECK: vpscatterqd qword ptr [r9 + xmm31 + 256] {k1}, xmm22
+// CHECK: vpscatterqd dword ptr [r9 + xmm31 + 256] {k1}, xmm22
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0xa1,0x74,0x39,0x40]
-        vpscatterqd qword ptr [r9 + xmm31 + 256] {k1}, xmm22
+        vpscatterqd dword ptr [r9 + xmm31 + 256] {k1}, xmm22

-// CHECK: vpscatterqd qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm22
+// CHECK: vpscatterqd dword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm22
 // CHECK: encoding: [0x62,0xa2,0x7d,0x01,0xa1,0xb4,0xb9,0x00,0x04,0x00,0x00]
-        vpscatterqd qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm22
+        vpscatterqd dword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm22

-// CHECK: vpscatterqd xmmword ptr [r14 + 8*ymm31 + 123] {k1}, xmm24
+// CHECK: vpscatterqd dword ptr [r14 + 8*ymm31 + 123] {k1}, xmm24
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0xa1,0x84,0xfe,0x7b,0x00,0x00,0x00]
-        vpscatterqd xmmword ptr [r14 + 8*ymm31 + 123] {k1}, xmm24
+        vpscatterqd dword ptr [r14 + 8*ymm31 + 123] {k1}, xmm24

-// CHECK: vpscatterqd xmmword ptr [r14 + 8*ymm31 + 123] {k1}, xmm24
+// CHECK: vpscatterqd dword ptr [r14 + 8*ymm31 + 123] {k1}, xmm24
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0xa1,0x84,0xfe,0x7b,0x00,0x00,0x00]
-        vpscatterqd xmmword ptr [r14 + 8*ymm31 + 123] {k1}, xmm24
+        vpscatterqd dword ptr [r14 + 8*ymm31 + 123] {k1}, xmm24

-// CHECK: vpscatterqd xmmword ptr [r9 + ymm31 + 256] {k1}, xmm24
+// CHECK: vpscatterqd dword ptr [r9 + ymm31 + 256] {k1}, xmm24
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0xa1,0x44,0x39,0x40]
-        vpscatterqd xmmword ptr [r9 + ymm31 + 256] {k1}, xmm24
+        vpscatterqd dword ptr [r9 + ymm31 + 256] {k1}, xmm24

-// CHECK: vpscatterqd xmmword ptr [rcx + 4*ymm31 + 1024] {k1}, xmm24
+// CHECK: vpscatterqd dword ptr [rcx + 4*ymm31 + 1024] {k1}, xmm24
 // CHECK: encoding: [0x62,0x22,0x7d,0x21,0xa1,0x84,0xb9,0x00,0x04,0x00,0x00]
-        vpscatterqd xmmword ptr [rcx + 4*ymm31 + 1024] {k1}, xmm24
+        vpscatterqd dword ptr [rcx + 4*ymm31 + 1024] {k1}, xmm24

-// CHECK: vpscatterqd qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm22
+// CHECK: vpscatterqd dword ptr [r14 + 8*xmm31 - 123] {k1}, xmm22
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0xa1,0xb4,0xfe,0x85,0xff,0xff,0xff]
-        vpscatterqd qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm22
+        vpscatterqd dword ptr [r14 + 8*xmm31 - 123] {k1}, xmm22

-// CHECK: vpscatterqd qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm22
+// CHECK: vpscatterqd dword ptr [r14 + 8*xmm31 - 123] {k1}, xmm22
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0xa1,0xb4,0xfe,0x85,0xff,0xff,0xff]
-        vpscatterqd qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm22
+        vpscatterqd dword ptr [r14 + 8*xmm31 - 123] {k1}, xmm22

-// CHECK: vpscatterqd qword ptr [r9 + xmm31 + 256] {k1}, xmm22
+// CHECK: vpscatterqd dword ptr [r9 + xmm31 + 256] {k1}, xmm22
 // CHECK: encoding: [0x62,0x82,0x7d,0x01,0xa1,0x74,0x39,0x40]
-        vpscatterqd qword ptr [r9 + xmm31 + 256] {k1}, xmm22
+        vpscatterqd dword ptr [r9 + xmm31 + 256] {k1}, xmm22

-// CHECK: vpscatterqd qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm22
+// CHECK: vpscatterqd dword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm22
 // CHECK: encoding: [0x62,0xa2,0x7d,0x01,0xa1,0xb4,0xb9,0x00,0x04,0x00,0x00]
-        vpscatterqd qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm22
+        vpscatterqd dword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm22

-// CHECK: vpscatterqd xmmword ptr [r14 + 8*ymm31 - 123] {k1}, xmm29
+// CHECK: vpscatterqd dword ptr [r14 + 8*ymm31 - 123] {k1}, xmm29
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0xa1,0xac,0xfe,0x85,0xff,0xff,0xff]
-        vpscatterqd xmmword ptr [r14 + 8*ymm31 - 123] {k1}, xmm29
+        vpscatterqd dword ptr [r14 + 8*ymm31 - 123] {k1}, xmm29

-// CHECK: vpscatterqd xmmword ptr [r14 + 8*ymm31 - 123] {k1}, xmm29
+// CHECK: vpscatterqd dword ptr [r14 +
8*ymm31 - 123] {k1}, xmm29 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0xa1,0xac,0xfe,0x85,0xff,0xff,0xff] - vpscatterqd xmmword ptr [r14 + 8*ymm31 - 123] {k1}, xmm29 + vpscatterqd dword ptr [r14 + 8*ymm31 - 123] {k1}, xmm29 -// CHECK: vpscatterqd xmmword ptr [r9 + ymm31 + 256] {k1}, xmm29 +// CHECK: vpscatterqd dword ptr [r9 + ymm31 + 256] {k1}, xmm29 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0xa1,0x6c,0x39,0x40] - vpscatterqd xmmword ptr [r9 + ymm31 + 256] {k1}, xmm29 + vpscatterqd dword ptr [r9 + ymm31 + 256] {k1}, xmm29 -// CHECK: vpscatterqd xmmword ptr [rcx + 4*ymm31 + 1024] {k1}, xmm29 +// CHECK: vpscatterqd dword ptr [rcx + 4*ymm31 + 1024] {k1}, xmm29 // CHECK: encoding: [0x62,0x22,0x7d,0x21,0xa1,0xac,0xb9,0x00,0x04,0x00,0x00] - vpscatterqd xmmword ptr [rcx + 4*ymm31 + 1024] {k1}, xmm29 + vpscatterqd dword ptr [rcx + 4*ymm31 + 1024] {k1}, xmm29 -// CHECK: vpscatterqq xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm28 +// CHECK: vpscatterqq qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm28 // CHECK: encoding: [0x62,0x02,0xfd,0x01,0xa1,0xa4,0xfe,0x7b,0x00,0x00,0x00] - vpscatterqq xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm28 + vpscatterqq qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm28 -// CHECK: vpscatterqq xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm28 +// CHECK: vpscatterqq qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm28 // CHECK: encoding: [0x62,0x02,0xfd,0x01,0xa1,0xa4,0xfe,0x7b,0x00,0x00,0x00] - vpscatterqq xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm28 + vpscatterqq qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm28 -// CHECK: vpscatterqq xmmword ptr [r9 + xmm31 + 256] {k1}, xmm28 +// CHECK: vpscatterqq qword ptr [r9 + xmm31 + 256] {k1}, xmm28 // CHECK: encoding: [0x62,0x02,0xfd,0x01,0xa1,0x64,0x39,0x20] - vpscatterqq xmmword ptr [r9 + xmm31 + 256] {k1}, xmm28 + vpscatterqq qword ptr [r9 + xmm31 + 256] {k1}, xmm28 -// CHECK: vpscatterqq xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm28 +// CHECK: vpscatterqq qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm28 // CHECK: encoding: [0x62,0x22,0xfd,0x01,0xa1,0xa4,0xb9,0x00,0x04,0x00,0x00] - vpscatterqq xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm28 + vpscatterqq qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm28 -// CHECK: vpscatterqq ymmword ptr [r14 + 8*ymm31 + 123] {k1}, ymm19 +// CHECK: vpscatterqq qword ptr [r14 + 8*ymm31 + 123] {k1}, ymm19 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0xa1,0x9c,0xfe,0x7b,0x00,0x00,0x00] - vpscatterqq ymmword ptr [r14 + 8*ymm31 + 123] {k1}, ymm19 + vpscatterqq qword ptr [r14 + 8*ymm31 + 123] {k1}, ymm19 -// CHECK: vpscatterqq ymmword ptr [r14 + 8*ymm31 + 123] {k1}, ymm19 +// CHECK: vpscatterqq qword ptr [r14 + 8*ymm31 + 123] {k1}, ymm19 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0xa1,0x9c,0xfe,0x7b,0x00,0x00,0x00] - vpscatterqq ymmword ptr [r14 + 8*ymm31 + 123] {k1}, ymm19 + vpscatterqq qword ptr [r14 + 8*ymm31 + 123] {k1}, ymm19 -// CHECK: vpscatterqq ymmword ptr [r9 + ymm31 + 256] {k1}, ymm19 +// CHECK: vpscatterqq qword ptr [r9 + ymm31 + 256] {k1}, ymm19 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0xa1,0x5c,0x39,0x20] - vpscatterqq ymmword ptr [r9 + ymm31 + 256] {k1}, ymm19 + vpscatterqq qword ptr [r9 + ymm31 + 256] {k1}, ymm19 -// CHECK: vpscatterqq ymmword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm19 +// CHECK: vpscatterqq qword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm19 // CHECK: encoding: [0x62,0xa2,0xfd,0x21,0xa1,0x9c,0xb9,0x00,0x04,0x00,0x00] - vpscatterqq ymmword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm19 + vpscatterqq qword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm19 -// CHECK: vpscatterqq xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm24 +// CHECK: vpscatterqq 
qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm24 // CHECK: encoding: [0x62,0x02,0xfd,0x01,0xa1,0x84,0xfe,0x85,0xff,0xff,0xff] - vpscatterqq xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm24 + vpscatterqq qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm24 -// CHECK: vpscatterqq xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm24 +// CHECK: vpscatterqq qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm24 // CHECK: encoding: [0x62,0x02,0xfd,0x01,0xa1,0x84,0xfe,0x85,0xff,0xff,0xff] - vpscatterqq xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm24 + vpscatterqq qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm24 -// CHECK: vpscatterqq xmmword ptr [r9 + xmm31 + 256] {k1}, xmm24 +// CHECK: vpscatterqq qword ptr [r9 + xmm31 + 256] {k1}, xmm24 // CHECK: encoding: [0x62,0x02,0xfd,0x01,0xa1,0x44,0x39,0x20] - vpscatterqq xmmword ptr [r9 + xmm31 + 256] {k1}, xmm24 + vpscatterqq qword ptr [r9 + xmm31 + 256] {k1}, xmm24 -// CHECK: vpscatterqq xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm24 +// CHECK: vpscatterqq qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm24 // CHECK: encoding: [0x62,0x22,0xfd,0x01,0xa1,0x84,0xb9,0x00,0x04,0x00,0x00] - vpscatterqq xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm24 + vpscatterqq qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm24 -// CHECK: vpscatterqq ymmword ptr [r14 + 8*ymm31 - 123] {k1}, ymm17 +// CHECK: vpscatterqq qword ptr [r14 + 8*ymm31 - 123] {k1}, ymm17 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0xa1,0x8c,0xfe,0x85,0xff,0xff,0xff] - vpscatterqq ymmword ptr [r14 + 8*ymm31 - 123] {k1}, ymm17 + vpscatterqq qword ptr [r14 + 8*ymm31 - 123] {k1}, ymm17 -// CHECK: vpscatterqq ymmword ptr [r14 + 8*ymm31 - 123] {k1}, ymm17 +// CHECK: vpscatterqq qword ptr [r14 + 8*ymm31 - 123] {k1}, ymm17 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0xa1,0x8c,0xfe,0x85,0xff,0xff,0xff] - vpscatterqq ymmword ptr [r14 + 8*ymm31 - 123] {k1}, ymm17 + vpscatterqq qword ptr [r14 + 8*ymm31 - 123] {k1}, ymm17 -// CHECK: vpscatterqq ymmword ptr [r9 + ymm31 + 256] {k1}, ymm17 +// CHECK: vpscatterqq qword ptr [r9 + ymm31 + 256] {k1}, ymm17 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0xa1,0x4c,0x39,0x20] - vpscatterqq ymmword ptr [r9 + ymm31 + 256] {k1}, ymm17 + vpscatterqq qword ptr [r9 + ymm31 + 256] {k1}, ymm17 -// CHECK: vpscatterqq ymmword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm17 +// CHECK: vpscatterqq qword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm17 // CHECK: encoding: [0x62,0xa2,0xfd,0x21,0xa1,0x8c,0xb9,0x00,0x04,0x00,0x00] - vpscatterqq ymmword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm17 + vpscatterqq qword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm17 -// CHECK: vscatterdpd xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm18 +// CHECK: vscatterdpd qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm18 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0xa2,0x94,0xfe,0x7b,0x00,0x00,0x00] - vscatterdpd xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm18 + vscatterdpd qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm18 -// CHECK: vscatterdpd xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm18 +// CHECK: vscatterdpd qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm18 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0xa2,0x94,0xfe,0x7b,0x00,0x00,0x00] - vscatterdpd xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm18 + vscatterdpd qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm18 -// CHECK: vscatterdpd xmmword ptr [r9 + xmm31 + 256] {k1}, xmm18 +// CHECK: vscatterdpd qword ptr [r9 + xmm31 + 256] {k1}, xmm18 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0xa2,0x54,0x39,0x20] - vscatterdpd xmmword ptr [r9 + xmm31 + 256] {k1}, xmm18 + vscatterdpd qword ptr [r9 + xmm31 + 256] {k1}, xmm18 -// CHECK: vscatterdpd xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm18 +// 
+// CHECK: vscatterdpd qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm18
 // CHECK: encoding: [0x62,0xa2,0xfd,0x01,0xa2,0x94,0xb9,0x00,0x04,0x00,0x00]
-          vscatterdpd xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm18
+          vscatterdpd qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm18
-// CHECK: vscatterdpd ymmword ptr [r14 + 8*xmm31 + 123] {k1}, ymm30
+// CHECK: vscatterdpd qword ptr [r14 + 8*xmm31 + 123] {k1}, ymm30
 // CHECK: encoding: [0x62,0x02,0xfd,0x21,0xa2,0xb4,0xfe,0x7b,0x00,0x00,0x00]
-          vscatterdpd ymmword ptr [r14 + 8*xmm31 + 123] {k1}, ymm30
+          vscatterdpd qword ptr [r14 + 8*xmm31 + 123] {k1}, ymm30
-// CHECK: vscatterdpd ymmword ptr [r14 + 8*xmm31 + 123] {k1}, ymm30
+// CHECK: vscatterdpd qword ptr [r14 + 8*xmm31 + 123] {k1}, ymm30
 // CHECK: encoding: [0x62,0x02,0xfd,0x21,0xa2,0xb4,0xfe,0x7b,0x00,0x00,0x00]
-          vscatterdpd ymmword ptr [r14 + 8*xmm31 + 123] {k1}, ymm30
+          vscatterdpd qword ptr [r14 + 8*xmm31 + 123] {k1}, ymm30
-// CHECK: vscatterdpd ymmword ptr [r9 + xmm31 + 256] {k1}, ymm30
+// CHECK: vscatterdpd qword ptr [r9 + xmm31 + 256] {k1}, ymm30
 // CHECK: encoding: [0x62,0x02,0xfd,0x21,0xa2,0x74,0x39,0x20]
-          vscatterdpd ymmword ptr [r9 + xmm31 + 256] {k1}, ymm30
+          vscatterdpd qword ptr [r9 + xmm31 + 256] {k1}, ymm30
-// CHECK: vscatterdpd ymmword ptr [rcx + 4*xmm31 + 1024] {k1}, ymm30
+// CHECK: vscatterdpd qword ptr [rcx + 4*xmm31 + 1024] {k1}, ymm30
 // CHECK: encoding: [0x62,0x22,0xfd,0x21,0xa2,0xb4,0xb9,0x00,0x04,0x00,0x00]
-          vscatterdpd ymmword ptr [rcx + 4*xmm31 + 1024] {k1}, ymm30
+          vscatterdpd qword ptr [rcx + 4*xmm31 + 1024] {k1}, ymm30
-// CHECK: vscatterdpd xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm19
+// CHECK: vscatterdpd qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm19
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0xa2,0x9c,0xfe,0x85,0xff,0xff,0xff]
-          vscatterdpd xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm19
+          vscatterdpd qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm19
-// CHECK: vscatterdpd xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm19
+// CHECK: vscatterdpd qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm19
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0xa2,0x9c,0xfe,0x85,0xff,0xff,0xff]
-          vscatterdpd xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm19
+          vscatterdpd qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm19
-// CHECK: vscatterdpd xmmword ptr [r9 + xmm31 + 256] {k1}, xmm19
+// CHECK: vscatterdpd qword ptr [r9 + xmm31 + 256] {k1}, xmm19
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0xa2,0x5c,0x39,0x20]
-          vscatterdpd xmmword ptr [r9 + xmm31 + 256] {k1}, xmm19
+          vscatterdpd qword ptr [r9 + xmm31 + 256] {k1}, xmm19
-// CHECK: vscatterdpd xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm19
+// CHECK: vscatterdpd qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm19
 // CHECK: encoding: [0x62,0xa2,0xfd,0x01,0xa2,0x9c,0xb9,0x00,0x04,0x00,0x00]
-          vscatterdpd xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm19
+          vscatterdpd qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm19
-// CHECK: vscatterdpd ymmword ptr [r14 + 8*xmm31 - 123] {k1}, ymm26
+// CHECK: vscatterdpd qword ptr [r14 + 8*xmm31 - 123] {k1}, ymm26
 // CHECK: encoding: [0x62,0x02,0xfd,0x21,0xa2,0x94,0xfe,0x85,0xff,0xff,0xff]
-          vscatterdpd ymmword ptr [r14 + 8*xmm31 - 123] {k1}, ymm26
+          vscatterdpd qword ptr [r14 + 8*xmm31 - 123] {k1}, ymm26
-// CHECK: vscatterdpd ymmword ptr [r14 + 8*xmm31 - 123] {k1}, ymm26
+// CHECK: vscatterdpd qword ptr [r14 + 8*xmm31 - 123] {k1}, ymm26
 // CHECK: encoding: [0x62,0x02,0xfd,0x21,0xa2,0x94,0xfe,0x85,0xff,0xff,0xff]
-          vscatterdpd ymmword ptr [r14 + 8*xmm31 - 123] {k1}, ymm26
+          vscatterdpd qword ptr [r14 + 8*xmm31 - 123] {k1}, ymm26
-// CHECK: vscatterdpd ymmword ptr [r9 + xmm31 + 256] {k1}, ymm26
+// CHECK: vscatterdpd qword ptr [r9 + xmm31 + 256] {k1}, ymm26
 // CHECK: encoding: [0x62,0x02,0xfd,0x21,0xa2,0x54,0x39,0x20]
-          vscatterdpd ymmword ptr [r9 + xmm31 + 256] {k1}, ymm26
+          vscatterdpd qword ptr [r9 + xmm31 + 256] {k1}, ymm26
-// CHECK: vscatterdpd ymmword ptr [rcx + 4*xmm31 + 1024] {k1}, ymm26
+// CHECK: vscatterdpd qword ptr [rcx + 4*xmm31 + 1024] {k1}, ymm26
 // CHECK: encoding: [0x62,0x22,0xfd,0x21,0xa2,0x94,0xb9,0x00,0x04,0x00,0x00]
-          vscatterdpd ymmword ptr [rcx + 4*xmm31 + 1024] {k1}, ymm26
+          vscatterdpd qword ptr [rcx + 4*xmm31 + 1024] {k1}, ymm26
-// CHECK: vscatterdps xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm24
+// CHECK: vscatterdps dword ptr [r14 + 8*xmm31 + 123] {k1}, xmm24
 // CHECK: encoding: [0x62,0x02,0x7d,0x01,0xa2,0x84,0xfe,0x7b,0x00,0x00,0x00]
-          vscatterdps xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm24
+          vscatterdps dword ptr [r14 + 8*xmm31 + 123] {k1}, xmm24
-// CHECK: vscatterdps xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm24
+// CHECK: vscatterdps dword ptr [r14 + 8*xmm31 + 123] {k1}, xmm24
 // CHECK: encoding: [0x62,0x02,0x7d,0x01,0xa2,0x84,0xfe,0x7b,0x00,0x00,0x00]
-          vscatterdps xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm24
+          vscatterdps dword ptr [r14 + 8*xmm31 + 123] {k1}, xmm24
-// CHECK: vscatterdps xmmword ptr [r9 + xmm31 + 256] {k1}, xmm24
+// CHECK: vscatterdps dword ptr [r9 + xmm31 + 256] {k1}, xmm24
 // CHECK: encoding: [0x62,0x02,0x7d,0x01,0xa2,0x44,0x39,0x40]
-          vscatterdps xmmword ptr [r9 + xmm31 + 256] {k1}, xmm24
+          vscatterdps dword ptr [r9 + xmm31 + 256] {k1}, xmm24
-// CHECK: vscatterdps xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm24
+// CHECK: vscatterdps dword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm24
 // CHECK: encoding: [0x62,0x22,0x7d,0x01,0xa2,0x84,0xb9,0x00,0x04,0x00,0x00]
-          vscatterdps xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm24
+          vscatterdps dword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm24
-// CHECK: vscatterdps ymmword ptr [r14 + 8*ymm31 + 123] {k1}, ymm23
+// CHECK: vscatterdps dword ptr [r14 + 8*ymm31 + 123] {k1}, ymm23
 // CHECK: encoding: [0x62,0x82,0x7d,0x21,0xa2,0xbc,0xfe,0x7b,0x00,0x00,0x00]
-          vscatterdps ymmword ptr [r14 + 8*ymm31 + 123] {k1}, ymm23
+          vscatterdps dword ptr [r14 + 8*ymm31 + 123] {k1}, ymm23
-// CHECK: vscatterdps ymmword ptr [r14 + 8*ymm31 + 123] {k1}, ymm23
+// CHECK: vscatterdps dword ptr [r14 + 8*ymm31 + 123] {k1}, ymm23
 // CHECK: encoding: [0x62,0x82,0x7d,0x21,0xa2,0xbc,0xfe,0x7b,0x00,0x00,0x00]
-          vscatterdps ymmword ptr [r14 + 8*ymm31 + 123] {k1}, ymm23
+          vscatterdps dword ptr [r14 + 8*ymm31 + 123] {k1}, ymm23
-// CHECK: vscatterdps ymmword ptr [r9 + ymm31 + 256] {k1}, ymm23
+// CHECK: vscatterdps dword ptr [r9 + ymm31 + 256] {k1}, ymm23
 // CHECK: encoding: [0x62,0x82,0x7d,0x21,0xa2,0x7c,0x39,0x40]
-          vscatterdps ymmword ptr [r9 + ymm31 + 256] {k1}, ymm23
+          vscatterdps dword ptr [r9 + ymm31 + 256] {k1}, ymm23
-// CHECK: vscatterdps ymmword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm23
+// CHECK: vscatterdps dword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm23
 // CHECK: encoding: [0x62,0xa2,0x7d,0x21,0xa2,0xbc,0xb9,0x00,0x04,0x00,0x00]
-          vscatterdps ymmword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm23
+          vscatterdps dword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm23
-// CHECK: vscatterdps xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm28
+// CHECK: vscatterdps dword ptr [r14 + 8*xmm31 - 123] {k1}, xmm28
 // CHECK: encoding: [0x62,0x02,0x7d,0x01,0xa2,0xa4,0xfe,0x85,0xff,0xff,0xff]
-          vscatterdps xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm28
+          vscatterdps dword ptr [r14 + 8*xmm31 - 123] {k1}, xmm28
-// CHECK: vscatterdps xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm28
+// CHECK: vscatterdps dword ptr [r14 + 8*xmm31 - 123] {k1}, xmm28
 // CHECK: encoding: [0x62,0x02,0x7d,0x01,0xa2,0xa4,0xfe,0x85,0xff,0xff,0xff]
-          vscatterdps xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm28
+          vscatterdps dword ptr [r14 + 8*xmm31 - 123] {k1}, xmm28
-// CHECK: vscatterdps xmmword ptr [r9 + xmm31 + 256] {k1}, xmm28
+// CHECK: vscatterdps dword ptr [r9 + xmm31 + 256] {k1}, xmm28
 // CHECK: encoding: [0x62,0x02,0x7d,0x01,0xa2,0x64,0x39,0x40]
-          vscatterdps xmmword ptr [r9 + xmm31 + 256] {k1}, xmm28
+          vscatterdps dword ptr [r9 + xmm31 + 256] {k1}, xmm28
-// CHECK: vscatterdps xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm28
+// CHECK: vscatterdps dword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm28
 // CHECK: encoding: [0x62,0x22,0x7d,0x01,0xa2,0xa4,0xb9,0x00,0x04,0x00,0x00]
-          vscatterdps xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm28
+          vscatterdps dword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm28
-// CHECK: vscatterdps ymmword ptr [r14 + 8*ymm31 - 123] {k1}, ymm25
+// CHECK: vscatterdps dword ptr [r14 + 8*ymm31 - 123] {k1}, ymm25
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0xa2,0x8c,0xfe,0x85,0xff,0xff,0xff]
-          vscatterdps ymmword ptr [r14 + 8*ymm31 - 123] {k1}, ymm25
+          vscatterdps dword ptr [r14 + 8*ymm31 - 123] {k1}, ymm25
-// CHECK: vscatterdps ymmword ptr [r14 + 8*ymm31 - 123] {k1}, ymm25
+// CHECK: vscatterdps dword ptr [r14 + 8*ymm31 - 123] {k1}, ymm25
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0xa2,0x8c,0xfe,0x85,0xff,0xff,0xff]
-          vscatterdps ymmword ptr [r14 + 8*ymm31 - 123] {k1}, ymm25
+          vscatterdps dword ptr [r14 + 8*ymm31 - 123] {k1}, ymm25
-// CHECK: vscatterdps ymmword ptr [r9 + ymm31 + 256] {k1}, ymm25
+// CHECK: vscatterdps dword ptr [r9 + ymm31 + 256] {k1}, ymm25
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0xa2,0x4c,0x39,0x40]
-          vscatterdps ymmword ptr [r9 + ymm31 + 256] {k1}, ymm25
+          vscatterdps dword ptr [r9 + ymm31 + 256] {k1}, ymm25
-// CHECK: vscatterdps ymmword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm25
+// CHECK: vscatterdps dword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm25
 // CHECK: encoding: [0x62,0x22,0x7d,0x21,0xa2,0x8c,0xb9,0x00,0x04,0x00,0x00]
-          vscatterdps ymmword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm25
+          vscatterdps dword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm25
-// CHECK: vscatterqpd xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm21
+// CHECK: vscatterqpd qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm21
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0xa3,0xac,0xfe,0x7b,0x00,0x00,0x00]
-          vscatterqpd xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm21
+          vscatterqpd qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm21
-// CHECK: vscatterqpd xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm21
+// CHECK: vscatterqpd qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm21
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0xa3,0xac,0xfe,0x7b,0x00,0x00,0x00]
-          vscatterqpd xmmword ptr [r14 + 8*xmm31 + 123] {k1}, xmm21
+          vscatterqpd qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm21
-// CHECK: vscatterqpd xmmword ptr [r9 + xmm31 + 256] {k1}, xmm21
+// CHECK: vscatterqpd qword ptr [r9 + xmm31 + 256] {k1}, xmm21
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0xa3,0x6c,0x39,0x20]
-          vscatterqpd xmmword ptr [r9 + xmm31 + 256] {k1}, xmm21
+          vscatterqpd qword ptr [r9 + xmm31 + 256] {k1}, xmm21
-// CHECK: vscatterqpd xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm21
+// CHECK: vscatterqpd qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm21
 // CHECK: encoding: [0x62,0xa2,0xfd,0x01,0xa3,0xac,0xb9,0x00,0x04,0x00,0x00]
-          vscatterqpd xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm21
+          vscatterqpd qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm21
-// CHECK: vscatterqpd ymmword ptr [r14 + 8*ymm31 + 123] {k1}, ymm23
+// CHECK: vscatterqpd qword ptr [r14 + 8*ymm31 + 123] {k1}, ymm23
 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0xa3,0xbc,0xfe,0x7b,0x00,0x00,0x00]
-          vscatterqpd ymmword ptr [r14 + 8*ymm31 + 123] {k1}, ymm23
+          vscatterqpd qword ptr [r14 + 8*ymm31 + 123] {k1}, ymm23
-// CHECK: vscatterqpd ymmword ptr [r14 + 8*ymm31 + 123] {k1}, ymm23
+// CHECK: vscatterqpd qword ptr [r14 + 8*ymm31 + 123] {k1}, ymm23
 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0xa3,0xbc,0xfe,0x7b,0x00,0x00,0x00]
-          vscatterqpd ymmword ptr [r14 + 8*ymm31 + 123] {k1}, ymm23
+          vscatterqpd qword ptr [r14 + 8*ymm31 + 123] {k1}, ymm23
-// CHECK: vscatterqpd ymmword ptr [r9 + ymm31 + 256] {k1}, ymm23
+// CHECK: vscatterqpd qword ptr [r9 + ymm31 + 256] {k1}, ymm23
 // CHECK: encoding: [0x62,0x82,0xfd,0x21,0xa3,0x7c,0x39,0x20]
-          vscatterqpd ymmword ptr [r9 + ymm31 + 256] {k1}, ymm23
+          vscatterqpd qword ptr [r9 + ymm31 + 256] {k1}, ymm23
-// CHECK: vscatterqpd ymmword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm23
+// CHECK: vscatterqpd qword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm23
 // CHECK: encoding: [0x62,0xa2,0xfd,0x21,0xa3,0xbc,0xb9,0x00,0x04,0x00,0x00]
-          vscatterqpd ymmword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm23
+          vscatterqpd qword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm23
-// CHECK: vscatterqpd xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm19
+// CHECK: vscatterqpd qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm19
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0xa3,0x9c,0xfe,0x85,0xff,0xff,0xff]
-          vscatterqpd xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm19
+          vscatterqpd qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm19
-// CHECK: vscatterqpd xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm19
+// CHECK: vscatterqpd qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm19
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0xa3,0x9c,0xfe,0x85,0xff,0xff,0xff]
-          vscatterqpd xmmword ptr [r14 + 8*xmm31 - 123] {k1}, xmm19
+          vscatterqpd qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm19
-// CHECK: vscatterqpd xmmword ptr [r9 + xmm31 + 256] {k1}, xmm19
+// CHECK: vscatterqpd qword ptr [r9 + xmm31 + 256] {k1}, xmm19
 // CHECK: encoding: [0x62,0x82,0xfd,0x01,0xa3,0x5c,0x39,0x20]
-          vscatterqpd xmmword ptr [r9 + xmm31 + 256] {k1}, xmm19
+          vscatterqpd qword ptr [r9 + xmm31 + 256] {k1}, xmm19
-// CHECK: vscatterqpd xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm19
+// CHECK: vscatterqpd qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm19
 // CHECK: encoding: [0x62,0xa2,0xfd,0x01,0xa3,0x9c,0xb9,0x00,0x04,0x00,0x00]
-          vscatterqpd xmmword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm19
+          vscatterqpd qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm19
-// CHECK: vscatterqpd ymmword ptr [r14 + 8*ymm31 - 123] {k1}, ymm29
+// CHECK: vscatterqpd qword ptr [r14 + 8*ymm31 - 123] {k1}, ymm29
 // CHECK: encoding: [0x62,0x02,0xfd,0x21,0xa3,0xac,0xfe,0x85,0xff,0xff,0xff]
-          vscatterqpd ymmword ptr [r14 + 8*ymm31 - 123] {k1}, ymm29
+          vscatterqpd qword ptr [r14 + 8*ymm31 - 123] {k1}, ymm29
-// CHECK: vscatterqpd ymmword ptr [r14 + 8*ymm31 - 123] {k1}, ymm29
+// CHECK: vscatterqpd qword ptr [r14 + 8*ymm31 - 123] {k1}, ymm29
 // CHECK: encoding: [0x62,0x02,0xfd,0x21,0xa3,0xac,0xfe,0x85,0xff,0xff,0xff]
-          vscatterqpd ymmword ptr [r14 + 8*ymm31 - 123] {k1}, ymm29
+          vscatterqpd qword ptr [r14 + 8*ymm31 - 123] {k1}, ymm29
-// CHECK: vscatterqpd ymmword ptr [r9 + ymm31 + 256] {k1}, ymm29
+// CHECK: vscatterqpd qword ptr [r9 + ymm31 + 256] {k1}, ymm29
 // CHECK: encoding: [0x62,0x02,0xfd,0x21,0xa3,0x6c,0x39,0x20]
-          vscatterqpd ymmword ptr [r9 + ymm31 + 256] {k1}, ymm29
+          vscatterqpd qword ptr [r9 + ymm31 + 256] {k1}, ymm29
-// CHECK: vscatterqpd ymmword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm29
+// CHECK: vscatterqpd qword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm29
 // CHECK: encoding: [0x62,0x22,0xfd,0x21,0xa3,0xac,0xb9,0x00,0x04,0x00,0x00]
-          vscatterqpd ymmword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm29
+          vscatterqpd qword ptr [rcx + 4*ymm31 + 1024] {k1}, ymm29
-// CHECK: vscatterqps qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm28
+// CHECK: vscatterqps dword ptr [r14 + 8*xmm31 + 123] {k1}, xmm28
 // CHECK: encoding: [0x62,0x02,0x7d,0x01,0xa3,0xa4,0xfe,0x7b,0x00,0x00,0x00]
-          vscatterqps qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm28
+          vscatterqps dword ptr [r14 + 8*xmm31 + 123] {k1}, xmm28
-// CHECK: vscatterqps qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm28
+// CHECK: vscatterqps dword ptr [r14 + 8*xmm31 + 123] {k1}, xmm28
 // CHECK: encoding: [0x62,0x02,0x7d,0x01,0xa3,0xa4,0xfe,0x7b,0x00,0x00,0x00]
-          vscatterqps qword ptr [r14 + 8*xmm31 + 123] {k1}, xmm28
+          vscatterqps dword ptr [r14 + 8*xmm31 + 123] {k1}, xmm28
-// CHECK: vscatterqps qword ptr [r9 + xmm31 + 256] {k1}, xmm28
+// CHECK: vscatterqps dword ptr [r9 + xmm31 + 256] {k1}, xmm28
 // CHECK: encoding: [0x62,0x02,0x7d,0x01,0xa3,0x64,0x39,0x40]
-          vscatterqps qword ptr [r9 + xmm31 + 256] {k1}, xmm28
+          vscatterqps dword ptr [r9 + xmm31 + 256] {k1}, xmm28
-// CHECK: vscatterqps qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm28
+// CHECK: vscatterqps dword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm28
 // CHECK: encoding: [0x62,0x22,0x7d,0x01,0xa3,0xa4,0xb9,0x00,0x04,0x00,0x00]
-          vscatterqps qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm28
+          vscatterqps dword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm28
-// CHECK: vscatterqps xmmword ptr [r14 + 8*ymm31 + 123] {k1}, xmm25
+// CHECK: vscatterqps dword ptr [r14 + 8*ymm31 + 123] {k1}, xmm25
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0xa3,0x8c,0xfe,0x7b,0x00,0x00,0x00]
-          vscatterqps xmmword ptr [r14 + 8*ymm31 + 123] {k1}, xmm25
+          vscatterqps dword ptr [r14 + 8*ymm31 + 123] {k1}, xmm25
-// CHECK: vscatterqps xmmword ptr [r14 + 8*ymm31 + 123] {k1}, xmm25
+// CHECK: vscatterqps dword ptr [r14 + 8*ymm31 + 123] {k1}, xmm25
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0xa3,0x8c,0xfe,0x7b,0x00,0x00,0x00]
-          vscatterqps xmmword ptr [r14 + 8*ymm31 + 123] {k1}, xmm25
+          vscatterqps dword ptr [r14 + 8*ymm31 + 123] {k1}, xmm25
-// CHECK: vscatterqps xmmword ptr [r9 + ymm31 + 256] {k1}, xmm25
+// CHECK: vscatterqps dword ptr [r9 + ymm31 + 256] {k1}, xmm25
 // CHECK: encoding: [0x62,0x02,0x7d,0x21,0xa3,0x4c,0x39,0x40]
-          vscatterqps xmmword ptr [r9 + ymm31 + 256] {k1}, xmm25
+          vscatterqps dword ptr [r9 + ymm31 + 256] {k1}, xmm25
-// CHECK: vscatterqps xmmword ptr [rcx + 4*ymm31 + 1024] {k1}, xmm25
+// CHECK: vscatterqps dword ptr [rcx + 4*ymm31 + 1024] {k1}, xmm25
 // CHECK: encoding: [0x62,0x22,0x7d,0x21,0xa3,0x8c,0xb9,0x00,0x04,0x00,0x00]
-          vscatterqps xmmword ptr [rcx + 4*ymm31 + 1024] {k1}, xmm25
+          vscatterqps dword ptr [rcx + 4*ymm31 + 1024] {k1}, xmm25
-// CHECK: vscatterqps qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm27
+// CHECK: vscatterqps dword ptr [r14 + 8*xmm31 - 123] {k1}, xmm27
 // CHECK: encoding: [0x62,0x02,0x7d,0x01,0xa3,0x9c,0xfe,0x85,0xff,0xff,0xff]
-          vscatterqps qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm27
+          vscatterqps dword ptr [r14 + 8*xmm31 - 123] {k1}, xmm27
-// CHECK: vscatterqps qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm27
+// CHECK: vscatterqps dword ptr [r14 + 8*xmm31 - 123] {k1}, xmm27
 // CHECK: encoding: [0x62,0x02,0x7d,0x01,0xa3,0x9c,0xfe,0x85,0xff,0xff,0xff]
-          vscatterqps qword ptr [r14 + 8*xmm31 - 123] {k1}, xmm27
+          vscatterqps dword ptr [r14 + 8*xmm31 - 123] {k1}, xmm27
-// CHECK: vscatterqps qword ptr [r9 + xmm31 + 256] {k1}, xmm27
+// CHECK: vscatterqps dword ptr [r9 + xmm31 + 256] {k1}, xmm27
 // CHECK: encoding: [0x62,0x02,0x7d,0x01,0xa3,0x5c,0x39,0x40]
-          vscatterqps qword ptr [r9 + xmm31 + 256] {k1}, xmm27
+          vscatterqps dword ptr [r9 + xmm31 + 256] {k1}, xmm27
-// CHECK: vscatterqps qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm27
+// CHECK: vscatterqps dword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm27
 // CHECK: encoding: [0x62,0x22,0x7d,0x01,0xa3,0x9c,0xb9,0x00,0x04,0x00,0x00]
-          vscatterqps qword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm27
+          vscatterqps dword ptr [rcx + 4*xmm31 + 1024] {k1}, xmm27
-// CHECK: vscatterqps xmmword ptr [r14 + 8*ymm31 - 123] {k1}, xmm23
+// CHECK: vscatterqps dword ptr [r14 + 8*ymm31 - 123] {k1}, xmm23
 // CHECK: encoding: [0x62,0x82,0x7d,0x21,0xa3,0xbc,0xfe,0x85,0xff,0xff,0xff]
-          vscatterqps xmmword ptr [r14 + 8*ymm31 - 123] {k1}, xmm23
+          vscatterqps dword ptr [r14 + 8*ymm31 - 123] {k1}, xmm23
-// CHECK: vscatterqps xmmword ptr [r14 + 8*ymm31 - 123] {k1}, xmm23
+// CHECK: vscatterqps dword ptr [r14 + 8*ymm31 - 123] {k1}, xmm23
 // CHECK: encoding: [0x62,0x82,0x7d,0x21,0xa3,0xbc,0xfe,0x85,0xff,0xff,0xff]
-          vscatterqps xmmword ptr [r14 + 8*ymm31 - 123] {k1}, xmm23
+          vscatterqps dword ptr [r14 + 8*ymm31 - 123] {k1}, xmm23
-// CHECK: vscatterqps xmmword ptr [r9 + ymm31 + 256] {k1}, xmm23
+// CHECK: vscatterqps dword ptr [r9 + ymm31 + 256] {k1}, xmm23
 // CHECK: encoding: [0x62,0x82,0x7d,0x21,0xa3,0x7c,0x39,0x40]
-          vscatterqps xmmword ptr [r9 + ymm31 + 256] {k1}, xmm23
+          vscatterqps dword ptr [r9 + ymm31 + 256] {k1}, xmm23
-// CHECK: vscatterqps xmmword ptr [rcx + 4*ymm31 + 1024] {k1}, xmm23
+// CHECK: vscatterqps dword ptr [rcx + 4*ymm31 + 1024] {k1}, xmm23
 // CHECK: encoding: [0x62,0xa2,0x7d,0x21,0xa3,0xbc,0xb9,0x00,0x04,0x00,0x00]
-          vscatterqps xmmword ptr [rcx + 4*ymm31 + 1024] {k1}, xmm23
+          vscatterqps dword ptr [rcx + 4*ymm31 + 1024] {k1}, xmm23
 // CHECK: vcvtpd2ps xmm0, xmm23
 // CHECK: encoding: [0x62,0xb1,0xfd,0x08,0x5a,0xc7]
diff --git a/llvm/test/MC/X86/intel-syntax.s b/llvm/test/MC/X86/intel-syntax.s
index 2b365699eec7b..c622832d24bea 100644
--- a/llvm/test/MC/X86/intel-syntax.s
+++ b/llvm/test/MC/X86/intel-syntax.s
@@ -144,7 +144,7 @@ main:
 // CHECK: vshufpd $1, %xmm2, %xmm1, %xmm0
 	vshufpd XMM0, XMM1, XMM2, 1
 // CHECK: vpgatherdd %xmm8, (%r15,%xmm9,2), %xmm10
-	vpgatherdd XMM10, XMMWORD PTR [R15 + 2*XMM9], XMM8
+	vpgatherdd XMM10, DWORD PTR [R15 + 2*XMM9], XMM8
 // CHECK: movsd -8, %xmm5
 	movsd XMM5, QWORD PTR [-8]
 // CHECK: movsl (%rsi), %es:(%rdi)
diff --git a/llvm/utils/TableGen/X86RecognizableInstr.cpp b/llvm/utils/TableGen/X86RecognizableInstr.cpp
index c6cd3da13646a..607a6bd27c21f 100644
--- a/llvm/utils/TableGen/X86RecognizableInstr.cpp
+++ b/llvm/utils/TableGen/X86RecognizableInstr.cpp
@@ -1147,19 +1147,16 @@ OperandType RecognizableInstr::typeFromString(const std::string &s,
   TYPE("VK4Pair", TYPE_VK_PAIR)
   TYPE("VK8Pair", TYPE_VK_PAIR)
   TYPE("VK16Pair", TYPE_VK_PAIR)
+  TYPE("vx32mem", TYPE_MVSIBX)
   TYPE("vx64mem", TYPE_MVSIBX)
-  TYPE("vx128mem", TYPE_MVSIBX)
-  TYPE("vx256mem", TYPE_MVSIBX)
-  TYPE("vy128mem", TYPE_MVSIBY)
-  TYPE("vy256mem", TYPE_MVSIBY)
+  TYPE("vy32mem", TYPE_MVSIBY)
+  TYPE("vy64mem", TYPE_MVSIBY)
+  TYPE("vx32xmem", TYPE_MVSIBX)
   TYPE("vx64xmem", TYPE_MVSIBX)
-  TYPE("vx128xmem", TYPE_MVSIBX)
-  TYPE("vx256xmem", TYPE_MVSIBX)
-  TYPE("vy128xmem", TYPE_MVSIBY)
-  TYPE("vy256xmem", TYPE_MVSIBY)
-  TYPE("vy512xmem", TYPE_MVSIBY)
-  TYPE("vz256mem", TYPE_MVSIBZ)
-  TYPE("vz512mem", TYPE_MVSIBZ)
+  TYPE("vy32xmem", TYPE_MVSIBY)
+  TYPE("vy64xmem", TYPE_MVSIBY)
+  TYPE("vz32mem", TYPE_MVSIBZ)
+  TYPE("vz64mem", TYPE_MVSIBZ)
   TYPE("BNDR", TYPE_BNDR)
   TYPE("TILE", TYPE_TMM)
   TYPE("TILEPair", TYPE_TMM_PAIR)
@@ -1372,19 +1369,16 @@ RecognizableInstr::memoryEncodingFromString(const std::string &s,
   ENCODING("anymem", ENCODING_RM)
   ENCODING("opaquemem", ENCODING_RM)
   ENCODING("sibmem", ENCODING_SIB)
+  ENCODING("vx32mem", ENCODING_VSIB)
   ENCODING("vx64mem", ENCODING_VSIB)
-  ENCODING("vx128mem", ENCODING_VSIB)
-  ENCODING("vx256mem", ENCODING_VSIB)
-  ENCODING("vy128mem", ENCODING_VSIB)
-  ENCODING("vy256mem", ENCODING_VSIB)
+  ENCODING("vy32mem", ENCODING_VSIB)
+  ENCODING("vy64mem", ENCODING_VSIB)
+  ENCODING("vx32xmem", ENCODING_VSIB)
   ENCODING("vx64xmem", ENCODING_VSIB)
-  ENCODING("vx128xmem", ENCODING_VSIB)
-  ENCODING("vx256xmem", ENCODING_VSIB)
-  ENCODING("vy128xmem", ENCODING_VSIB)
-  ENCODING("vy256xmem", ENCODING_VSIB)
-  ENCODING("vy512xmem", ENCODING_VSIB)
-  ENCODING("vz256mem", ENCODING_VSIB)
-  ENCODING("vz512mem", ENCODING_VSIB)
+  ENCODING("vy32xmem", ENCODING_VSIB)
+  ENCODING("vy64xmem", ENCODING_VSIB)
+  ENCODING("vz32mem", ENCODING_VSIB)
+  ENCODING("vz64mem", ENCODING_VSIB)
   errs() << "Unhandled memory encoding " << s << "\n";
   llvm_unreachable("Unhandled memory encoding");
 }

From a0db95c1545314c8c91ea15b4238bcdc159e0ab5 Mon Sep 17 00:00:00 2001
From: William Moses
Date: Mon, 13 Jan 2025 12:37:16 -0600
Subject: [PATCH 093/102] [MLIR][LLVM] Fix inlining of a single block ending
 with unreachable (#122646)

Alternate option to https://github.com/llvm/llvm-project/pull/122615.
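
As a brief illustration (the callee below is taken verbatim from the
test added by this patch): a callee whose single block ends in
`llvm.unreachable` has no terminator operands to forward to the call's
uses, so the inliner must not take the single-block fast path when
inlining it:

  llvm.func @unreachable_func(%a : i32) -> i32 {
    "llvm.intr.trap"() : () -> ()
    llvm.unreachable
  }

The new `allowSingleBlockOptimization` hook lets the LLVM dialect
reject the fast path for such blocks and fall back to the general
inlining path instead.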
---
 mlir/include/mlir/Transforms/InliningUtils.h     | 10 ++++++++++
 .../LLVMIR/Transforms/InlinerInterfaceImpl.cpp   |  8 ++++++++
 mlir/lib/Transforms/Utils/InliningUtils.cpp      | 16 +++++++++++++++-
 mlir/test/Dialect/LLVMIR/inlining.mlir           | 16 ++++++++++++++++
 4 files changed, 49 insertions(+), 1 deletion(-)

diff --git a/mlir/include/mlir/Transforms/InliningUtils.h b/mlir/include/mlir/Transforms/InliningUtils.h
index 88fc033a6ab7b..becfe9b047ef4 100644
--- a/mlir/include/mlir/Transforms/InliningUtils.h
+++ b/mlir/include/mlir/Transforms/InliningUtils.h
@@ -176,6 +176,13 @@ class DialectInlinerInterface
   /// is invoked before inlined terminator operations have been processed.
   virtual void processInlinedCallBlocks(
       Operation *call, iterator_range<Region::iterator> inlinedBlocks) const {}
+
+  /// Returns true if the inliner can assume a fast path of not creating a new
+  /// block, if there is only one block.
+  virtual bool allowSingleBlockOptimization(
+      iterator_range<Region::iterator> inlinedBlocks) const {
+    return true;
+  }
 };
 
 /// This interface provides the hooks into the inlining interface.
@@ -223,6 +230,9 @@ class InlinerInterface
   virtual void processInlinedCallBlocks(
       Operation *call, iterator_range<Region::iterator> inlinedBlocks) const;
+
+  virtual bool allowSingleBlockOptimization(
+      iterator_range<Region::iterator> inlinedBlocks) const;
 };
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.cpp b/mlir/lib/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.cpp
index 233cadebeec02..79dd3e3069648 100644
--- a/mlir/lib/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.cpp
@@ -744,6 +744,14 @@ struct LLVMInlinerInterface : public DialectInlinerInterface {
     op->erase();
   }
 
+  bool allowSingleBlockOptimization(
+      iterator_range<Region::iterator> inlinedBlocks) const final {
+    if (!inlinedBlocks.empty() &&
+        isa<LLVM::UnreachableOp>(inlinedBlocks.begin()->getTerminator()))
+      return false;
+    return true;
+  }
+
   /// Handle the given inlined return by replacing the uses of the call with the
   /// operands of the return. This overload is called when the inlined region
   /// only contains one block.
diff --git a/mlir/lib/Transforms/Utils/InliningUtils.cpp b/mlir/lib/Transforms/Utils/InliningUtils.cpp
index 0db097d14cd3c..0cae63c58ca7b 100644
--- a/mlir/lib/Transforms/Utils/InliningUtils.cpp
+++ b/mlir/lib/Transforms/Utils/InliningUtils.cpp
@@ -118,6 +118,18 @@ void InlinerInterface::handleTerminator(Operation *op,
   handler->handleTerminator(op, valuesToRepl);
 }
 
+/// Returns true if the inliner can assume a fast path of not creating a
+/// new block, if there is only one block.
+bool InlinerInterface::allowSingleBlockOptimization(
+    iterator_range<Region::iterator> inlinedBlocks) const {
+  if (inlinedBlocks.empty()) {
+    return true;
+  }
+  auto *handler = getInterfaceFor(inlinedBlocks.begin()->getParentOp());
+  assert(handler && "expected valid dialect handler");
+  return handler->allowSingleBlockOptimization(inlinedBlocks);
+}
+
 Value InlinerInterface::handleArgument(OpBuilder &builder, Operation *call,
                                        Operation *callable, Value argument,
                                        DictionaryAttr argumentAttrs) const {
@@ -294,8 +306,10 @@ inlineRegionImpl(InlinerInterface &interface, Region *src, Block *inlineBlock,
   interface.processInlinedCallBlocks(call, newBlocks);
   interface.processInlinedBlocks(newBlocks);
 
+  bool singleBlockFastPath = interface.allowSingleBlockOptimization(newBlocks);
+
   // Handle the case where only a single block was inlined.
-  if (std::next(newBlocks.begin()) == newBlocks.end()) {
+  if (singleBlockFastPath && std::next(newBlocks.begin()) == newBlocks.end()) {
     // Run the result attribute handler on the terminator operands.
     Operation *firstBlockTerminator = firstNewBlock->getTerminator();
     builder.setInsertionPoint(firstBlockTerminator);
diff --git a/mlir/test/Dialect/LLVMIR/inlining.mlir b/mlir/test/Dialect/LLVMIR/inlining.mlir
index edaac4da0b044..eb249a4771753 100644
--- a/mlir/test/Dialect/LLVMIR/inlining.mlir
+++ b/mlir/test/Dialect/LLVMIR/inlining.mlir
@@ -676,3 +676,19 @@ llvm.func @caller(%x : i32) -> i32 {
   %z = llvm.call @private_func(%x) : (i32) -> (i32)
   llvm.return %z : i32
 }
+
+// -----
+
+llvm.func @unreachable_func(%a : i32) -> i32 {
+  "llvm.intr.trap"() : () -> ()
+  llvm.unreachable
+}
+
+// CHECK-LABEL: func @caller
+llvm.func @caller(%x : i32) -> i32 {
+  // CHECK-NOT: llvm.call @unreachable_func
+  // CHECK: llvm.intr.trap
+  // CHECK: llvm.unreachable
+  %z = llvm.call @unreachable_func(%x) : (i32) -> (i32)
+  llvm.return %z : i32
+}

From 72489eec265e55dd7bea516ad187b52f4ae71daa Mon Sep 17 00:00:00 2001
From: Momchil Velikov
Date: Mon, 13 Jan 2025 18:53:07 +0000
Subject: [PATCH 094/102] [AArch64] Implement FP8 SVE/SME reinterpret
 intrinsics (#121063)
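
This adds the `svreinterpret` forms to and from `svmfloat8_t` (and the
tuple variants) for both SVE and streaming SVE. As a hedged usage
sketch (the wrapper functions are illustrative only; the intrinsics
named are the ones exercised by the new test):

  #include <arm_sve.h>

  /* View the raw bytes of an FP8 vector as unsigned bytes, without
     changing any bits, so existing integer intrinsics can operate
     on them. */
  svuint8_t fp8_to_bytes(svmfloat8_t v) {
    return svreinterpret_u8_mf8(v);
  }

  /* ...and reinterpret back, again bit-for-bit. */
  svmfloat8_t bytes_to_fp8(svuint8_t v) {
    return svreinterpret_mf8_u8(v);
  }

As the tests below check, these reinterprets are no-ops in the
generated IR for same-sized element types and plain bitcasts otherwise.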
---
 .../acle_sve2_fp8_reinterpret.c               | 3182 +++++++++++++++++
 clang/utils/TableGen/SveEmitter.cpp           |    5 +-
 2 files changed, 3185 insertions(+), 2 deletions(-)
 create mode 100644 clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_reinterpret.c

diff --git a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_reinterpret.c b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_reinterpret.c
new file mode 100644
index 0000000000000..7c70bcf6b4d66
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_reinterpret.c
@@ -0,0 +1,3182 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg | FileCheck %s
+// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg | FileCheck %s -check-prefix=CHECK-CXX
+
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg | FileCheck %s
+// RUN: %clang_cc1 -x c++ -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg | FileCheck %s -check-prefix=CHECK-CXX
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +fp8 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +fp8 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+// REQUIRES: aarch64-registered-target
+
+#ifdef __ARM_FEATURE_SME
+#include <arm_sme.h>
+#else
+#include <arm_sve.h>
+#endif
+
+#ifdef SVE_OVERLOADED_FORMS
+#define SVE_ACLE_FUNC(A1, A2_UNUSED) A1
+#else
+#define SVE_ACLE_FUNC(A1, A2) A1##A2
+#endif
+
+#ifdef __ARM_FEATURE_SME
+#define STREAMING __arm_streaming
+#else
+#define STREAMING
+#endif
+
+// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svreinterpret_s8_mf8(
+// CHECK-SAME: <vscale x 16 x i8> [[OP:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    ret <vscale x 16 x i8> [[OP]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 16 x i8> @_Z25test_svreinterpret_s8_mf8u13__SVMfloat8_t(
+// CHECK-CXX-SAME: <vscale x 16 x i8> [[OP:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[OP]]
+//
+svint8_t test_svreinterpret_s8_mf8(svmfloat8_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_s8, _mf8)(op);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svreinterpret_u8_mf8(
+// CHECK-SAME: <vscale x 16 x i8> [[OP:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    ret <vscale x 16 x i8> [[OP]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 16 x i8> @_Z25test_svreinterpret_u8_mf8u13__SVMfloat8_t(
+// CHECK-CXX-SAME: <vscale x 16 x i8> [[OP:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[OP]]
+//
+svuint8_t test_svreinterpret_u8_mf8(svmfloat8_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_u8, _mf8)(op);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svreinterpret_mf8_s8(
+// CHECK-SAME: <vscale x 16 x i8> [[OP:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    ret <vscale x 16 x i8> [[OP]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 16 x i8> @_Z25test_svreinterpret_mf8_s8u10__SVInt8_t(
+// CHECK-CXX-SAME: <vscale x 16 x i8> [[OP:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[OP]]
+//
+svmfloat8_t test_svreinterpret_mf8_s8(svint8_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_mf8, _s8)(op);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svreinterpret_mf8_u8(
+// CHECK-SAME: <vscale x 16 x i8> [[OP:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    ret <vscale x 16 x i8> [[OP]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 16 x i8> @_Z25test_svreinterpret_mf8_u8u11__SVUint8_t(
+// CHECK-CXX-SAME: <vscale x 16 x i8> [[OP:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[OP]]
+//
+svmfloat8_t test_svreinterpret_mf8_u8(svuint8_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_mf8, _u8)(op);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svreinterpret_mf8_mf8(
+// CHECK-SAME: <vscale x 16 x i8> [[OP:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    ret <vscale x 16 x i8> [[OP]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 16 x i8> @_Z26test_svreinterpret_mf8_mf8u13__SVMfloat8_t(
+// CHECK-CXX-SAME: <vscale x 16 x i8> [[OP:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[OP]]
+//
+svmfloat8_t test_svreinterpret_mf8_mf8(svmfloat8_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_mf8, _mf8)(op);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svreinterpret_mf8_s16(
+// CHECK-SAME: <vscale x 8 x i16> [[OP:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i16> [[OP]] to <vscale x 16 x i8>
+// CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 16 x i8> @_Z26test_svreinterpret_mf8_s16u11__SVInt16_t(
+// CHECK-CXX-SAME: <vscale x 8 x i16> [[OP:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i16> [[OP]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
+//
+svmfloat8_t test_svreinterpret_mf8_s16(svint16_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_mf8, _s16)(op);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svreinterpret_mf8_u16(
+// CHECK-SAME: <vscale x 8 x i16> [[OP:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i16> [[OP]] to <vscale x 16 x i8>
+// CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 16 x i8> @_Z26test_svreinterpret_mf8_u16u12__SVUint16_t(
+// CHECK-CXX-SAME: <vscale x 8 x i16> [[OP:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i16> [[OP]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
+//
+svmfloat8_t test_svreinterpret_mf8_u16(svuint16_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_mf8, _u16)(op);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svreinterpret_mf8_s32(
+// CHECK-SAME: <vscale x 4 x i32> [[OP:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i32> [[OP]] to <vscale x 16 x i8>
+// CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 16 x i8> @_Z26test_svreinterpret_mf8_s32u11__SVInt32_t(
+// CHECK-CXX-SAME: <vscale x 4 x i32> [[OP:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i32> [[OP]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
+//
+svmfloat8_t test_svreinterpret_mf8_s32(svint32_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_mf8, _s32)(op);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svreinterpret_mf8_u32(
+// CHECK-SAME: <vscale x 4 x i32> [[OP:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i32> [[OP]] to <vscale x 16 x i8>
+// CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 16 x i8> @_Z26test_svreinterpret_mf8_u32u12__SVUint32_t(
+// CHECK-CXX-SAME: <vscale x 4 x i32> [[OP:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i32> [[OP]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
+//
+svmfloat8_t test_svreinterpret_mf8_u32(svuint32_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_mf8, _u32)(op);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svreinterpret_mf8_s64(
+// CHECK-SAME: <vscale x 2 x i64> [[OP:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x i64> [[OP]] to <vscale x 16 x i8>
+// CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 16 x i8> @_Z26test_svreinterpret_mf8_s64u11__SVInt64_t(
+// CHECK-CXX-SAME: <vscale x 2 x i64> [[OP:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x i64> [[OP]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
+//
+svmfloat8_t test_svreinterpret_mf8_s64(svint64_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_mf8, _s64)(op);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svreinterpret_mf8_u64(
+// CHECK-SAME: <vscale x 2 x i64> [[OP:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x i64> [[OP]] to <vscale x 16 x i8>
+// CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 16 x i8> @_Z26test_svreinterpret_mf8_u64u12__SVUint64_t(
+// CHECK-CXX-SAME: <vscale x 2 x i64> [[OP:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x i64> [[OP]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
+//
+svmfloat8_t test_svreinterpret_mf8_u64(svuint64_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_mf8, _u64)(op);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svreinterpret_mf8_f16(
+// CHECK-SAME: <vscale x 8 x half> [[OP:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x half> [[OP]] to <vscale x 16 x i8>
+// CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 16 x i8> @_Z26test_svreinterpret_mf8_f16u13__SVFloat16_t(
+// CHECK-CXX-SAME: <vscale x 8 x half> [[OP:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x half> [[OP]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
+//
+svmfloat8_t test_svreinterpret_mf8_f16(svfloat16_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_mf8, _f16)(op);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svreinterpret_mf8_bf16(
+// CHECK-SAME: <vscale x 8 x bfloat> [[OP:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x bfloat> [[OP]] to <vscale x 16 x i8>
+// CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 16 x i8> @_Z27test_svreinterpret_mf8_bf16u14__SVBfloat16_t(
+// CHECK-CXX-SAME: <vscale x 8 x bfloat> [[OP:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x bfloat> [[OP]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
+//
+svmfloat8_t test_svreinterpret_mf8_bf16(svbfloat16_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_mf8, _bf16)(op);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svreinterpret_mf8_f32(
+// CHECK-SAME: <vscale x 4 x float> [[OP:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x float> [[OP]] to <vscale x 16 x i8>
+// CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 16 x i8> @_Z26test_svreinterpret_mf8_f32u13__SVFloat32_t(
+// CHECK-CXX-SAME: <vscale x 4 x float> [[OP:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x float> [[OP]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
+//
+svmfloat8_t test_svreinterpret_mf8_f32(svfloat32_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_mf8, _f32)(op);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svreinterpret_mf8_f64(
+// CHECK-SAME: <vscale x 2 x double> [[OP:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x double> [[OP]] to <vscale x 16 x i8>
+// CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 16 x i8> @_Z26test_svreinterpret_mf8_f64u13__SVFloat64_t(
+// CHECK-CXX-SAME: <vscale x 2 x double> [[OP:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x double> [[OP]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
+//
+svmfloat8_t test_svreinterpret_mf8_f64(svfloat64_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_mf8, _f64)(op);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 8 x i16> @test_svreinterpret_s16_mf8(
+// CHECK-SAME: <vscale x 16 x i8> [[OP:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i8> [[OP]] to <vscale x 8 x i16>
+// CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 8 x i16> @_Z26test_svreinterpret_s16_mf8u13__SVMfloat8_t(
+// CHECK-CXX-SAME: <vscale x 16 x i8> [[OP:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i8> [[OP]] to <vscale x 8 x i16>
+// CHECK-CXX-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
+//
+svint16_t test_svreinterpret_s16_mf8(svmfloat8_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_s16, _mf8)(op);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 8 x i16> @test_svreinterpret_u16_mf8(
+// CHECK-SAME: <vscale x 16 x i8> [[OP:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i8> [[OP]] to <vscale x 8 x i16>
+// CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 8 x i16> @_Z26test_svreinterpret_u16_mf8u13__SVMfloat8_t(
+// CHECK-CXX-SAME: <vscale x 16 x i8> [[OP:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i8> [[OP]] to <vscale x 8 x i16>
+// CHECK-CXX-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
+//
+svuint16_t test_svreinterpret_u16_mf8(svmfloat8_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_u16, _mf8)(op);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 4 x i32> @test_svreinterpret_s32_mf8(
+// CHECK-SAME: <vscale x 16 x i8> [[OP:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i8> [[OP]] to <vscale x 4 x i32>
+// CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP0]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 4 x i32> @_Z26test_svreinterpret_s32_mf8u13__SVMfloat8_t(
+// CHECK-CXX-SAME: <vscale x 16 x i8> [[OP:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i8> [[OP]] to <vscale x 4 x i32>
+// CHECK-CXX-NEXT:    ret <vscale x 4 x i32> [[TMP0]]
+//
+svint32_t test_svreinterpret_s32_mf8(svmfloat8_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_s32, _mf8)(op);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 4 x i32> @test_svreinterpret_u32_mf8(
+// CHECK-SAME: <vscale x 16 x i8> [[OP:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i8> [[OP]] to <vscale x 4 x i32>
+// CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP0]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 4 x i32> @_Z26test_svreinterpret_u32_mf8u13__SVMfloat8_t(
+// CHECK-CXX-SAME: <vscale x 16 x i8> [[OP:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i8> [[OP]] to <vscale x 4 x i32>
+// CHECK-CXX-NEXT:    ret <vscale x 4 x i32> [[TMP0]]
+//
+svuint32_t test_svreinterpret_u32_mf8(svmfloat8_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_u32, _mf8)(op);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 2 x i64> @test_svreinterpret_s64_mf8(
+// CHECK-SAME: <vscale x 16 x i8> [[OP:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i8> [[OP]] to <vscale x 2 x i64>
+// CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP0]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 2 x i64> @_Z26test_svreinterpret_s64_mf8u13__SVMfloat8_t(
+// CHECK-CXX-SAME: <vscale x 16 x i8> [[OP:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i8> [[OP]] to <vscale x 2 x i64>
+// CHECK-CXX-NEXT:    ret <vscale x 2 x i64> [[TMP0]]
+//
+svint64_t test_svreinterpret_s64_mf8(svmfloat8_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_s64, _mf8)(op);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 2 x i64> @test_svreinterpret_u64_mf8(
+// CHECK-SAME: <vscale x 16 x i8> [[OP:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i8> [[OP]] to <vscale x 2 x i64>
+// CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP0]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 2 x i64> @_Z26test_svreinterpret_u64_mf8u13__SVMfloat8_t(
+// CHECK-CXX-SAME: <vscale x 16 x i8> [[OP:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i8> [[OP]] to <vscale x 2 x i64>
+// CHECK-CXX-NEXT:    ret <vscale x 2 x i64> [[TMP0]]
+//
+svuint64_t test_svreinterpret_u64_mf8(svmfloat8_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_u64, _mf8)(op);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 8 x half> @test_svreinterpret_f16_mf8(
+// CHECK-SAME: <vscale x 16 x i8> [[OP:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i8> [[OP]] to <vscale x 8 x half>
+// CHECK-NEXT:    ret <vscale x 8 x half> [[TMP0]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 8 x half> @_Z26test_svreinterpret_f16_mf8u13__SVMfloat8_t(
+// CHECK-CXX-SAME: <vscale x 16 x i8> [[OP:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i8> [[OP]] to <vscale x 8 x half>
+// CHECK-CXX-NEXT:    ret <vscale x 8 x half> [[TMP0]]
+//
+svfloat16_t test_svreinterpret_f16_mf8(svmfloat8_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_f16, _mf8)(op);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 8 x bfloat> @test_svreinterpret_bf16_mf8(
+// CHECK-SAME: <vscale x 16 x i8> [[OP:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i8> [[OP]] to <vscale x 8 x bfloat>
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 8 x bfloat> @_Z27test_svreinterpret_bf16_mf8u13__SVMfloat8_t(
+// CHECK-CXX-SAME: <vscale x 16 x i8> [[OP:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i8> [[OP]] to <vscale x 8 x bfloat>
+// CHECK-CXX-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
+//
+svbfloat16_t test_svreinterpret_bf16_mf8(svmfloat8_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_bf16, _mf8)(op);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 4 x float> @test_svreinterpret_f32_mf8(
+// CHECK-SAME: <vscale x 16 x i8> [[OP:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i8> [[OP]] to <vscale x 4 x float>
+// CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 4 x float> @_Z26test_svreinterpret_f32_mf8u13__SVMfloat8_t(
+// CHECK-CXX-SAME: <vscale x 16 x i8> [[OP:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i8> [[OP]] to <vscale x 4 x float>
+// CHECK-CXX-NEXT:    ret <vscale x 4 x float> [[TMP0]]
+//
+svfloat32_t test_svreinterpret_f32_mf8(svmfloat8_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_f32, _mf8)(op);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 2 x double> @test_svreinterpret_f64_mf8(
+// CHECK-SAME: <vscale x 16 x i8> [[OP:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i8> [[OP]] to <vscale x 2 x double>
+// CHECK-NEXT:    ret <vscale x 2 x double> [[TMP0]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 2 x double> @_Z26test_svreinterpret_f64_mf8u13__SVMfloat8_t(
+// CHECK-CXX-SAME: <vscale x 16 x i8> [[OP:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i8> [[OP]] to <vscale x 2 x double>
+// CHECK-CXX-NEXT:    ret <vscale x 2 x double> [[TMP0]]
+//
+svfloat64_t test_svreinterpret_f64_mf8(svmfloat8_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_f64, _mf8)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8> } @test_svreinterpret_s8_mf8_x2(
+// CHECK-SAME: <vscale x 16 x i8> [[OP_COERCE0:%.*]], <vscale x 16 x i8> [[OP_COERCE1:%.*]]) #[[ATTR0]] {
CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue { , } [[TMP3]], [[TMP4]], 1 +// CHECK-NEXT: ret { , } [[TMP5]] +// +// CHECK-CXX-LABEL: define dso_local { , } @_Z28test_svreinterpret_s8_mf8_x213svmfloat8x2_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = insertvalue { , } [[TMP3]], [[TMP4]], 1 +// CHECK-CXX-NEXT: ret { , } [[TMP5]] +// +svint8x2_t test_svreinterpret_s8_mf8_x2(svmfloat8x2_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_s8, _mf8_x2)(op); +} + +// CHECK-LABEL: define dso_local { , } @test_svreinterpret_u8_mf8_x2( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue { , } [[TMP3]], [[TMP4]], 1 +// CHECK-NEXT: ret { , } [[TMP5]] +// +// CHECK-CXX-LABEL: define dso_local { , } @_Z28test_svreinterpret_u8_mf8_x213svmfloat8x2_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = insertvalue { , } [[TMP3]], [[TMP4]], 1 +// CHECK-CXX-NEXT: ret { , } [[TMP5]] +// +svuint8x2_t test_svreinterpret_u8_mf8_x2(svmfloat8x2_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_u8, _mf8_x2)(op); +} + +// CHECK-LABEL: define dso_local { , } @test_svreinterpret_mf8_s8_x2( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue { , } [[TMP3]], [[TMP4]], 1 +// CHECK-NEXT: ret { , } [[TMP5]] +// +// CHECK-CXX-LABEL: define dso_local { , } @_Z28test_svreinterpret_mf8_s8_x210svint8x2_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: 
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[OP_COERCE0]], 0
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], <vscale x 16 x i8> [[OP_COERCE1]], 1
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 0
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP2]], 0
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 1
+// CHECK-CXX-NEXT: [[TMP5:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], <vscale x 16 x i8> [[TMP4]], 1
+// CHECK-CXX-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP5]]
+//
+svmfloat8x2_t test_svreinterpret_mf8_s8_x2(svint8x2_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_mf8, _s8_x2)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8> } @test_svreinterpret_mf8_u8_x2(
+// CHECK-SAME: <vscale x 16 x i8> [[OP_COERCE0:%.*]], <vscale x 16 x i8> [[OP_COERCE1:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[OP_COERCE0]], 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], <vscale x 16 x i8> [[OP_COERCE1]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP2]], 0
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], <vscale x 16 x i8> [[TMP4]], 1
+// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP5]]
+//
+// CHECK-CXX-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z28test_svreinterpret_mf8_u8_x211svuint8x2_t(
+// CHECK-CXX-SAME: <vscale x 16 x i8> [[OP_COERCE0:%.*]], <vscale x 16 x i8> [[OP_COERCE1:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[OP_COERCE0]], 0
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], <vscale x 16 x i8> [[OP_COERCE1]], 1
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 0
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP2]], 0
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 1
+// CHECK-CXX-NEXT: [[TMP5:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], <vscale x 16 x i8> [[TMP4]], 1
+// CHECK-CXX-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP5]]
+//
+svmfloat8x2_t test_svreinterpret_mf8_u8_x2(svuint8x2_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_mf8, _u8_x2)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8> } @test_svreinterpret_mf8_mf8_x2(
+// CHECK-SAME: <vscale x 16 x i8> [[OP_COERCE0:%.*]], <vscale x 16 x i8> [[OP_COERCE1:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[OP_COERCE0]], 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], <vscale x 16 x i8> [[OP_COERCE1]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP2]], 0
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], <vscale x 16 x i8> [[TMP4]], 1
+// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP5]]
+//
+// CHECK-CXX-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z29test_svreinterpret_mf8_mf8_x213svmfloat8x2_t(
+// CHECK-CXX-SAME: <vscale x 16 x i8> [[OP_COERCE0:%.*]], <vscale x 16 x i8> [[OP_COERCE1:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[OP_COERCE0]], 0
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], <vscale x 16 x i8> [[OP_COERCE1]], 1
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 0
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP2]], 0
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 1
+// CHECK-CXX-NEXT: [[TMP5:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], <vscale x 16 x i8> [[TMP4]], 1
+// CHECK-CXX-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP5]]
+//
+svmfloat8x2_t test_svreinterpret_mf8_mf8_x2(svmfloat8x2_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_mf8, _mf8_x2)(op);
+}
+
+//
+// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8> } @test_svreinterpret_mf8_s16_x2(
+// CHECK-SAME: <vscale x 8 x i16> [[OP_COERCE0:%.*]], <vscale x 8 x i16> [[OP_COERCE1:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } poison, <vscale x 8 x i16> [[OP_COERCE0]], 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], <vscale x 8 x i16> [[OP_COERCE1]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <vscale x 8 x i16> [[TMP2]] to <vscale x 16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP3]], 0
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <vscale x 8 x i16> [[TMP5]] to <vscale x 16 x i8>
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP4]], <vscale x 16 x i8> [[TMP6]], 1
+// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP7]]
+//
+// CHECK-CXX-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z29test_svreinterpret_mf8_s16_x211svint16x2_t(
+// CHECK-CXX-SAME: <vscale x 8 x i16> [[OP_COERCE0:%.*]], <vscale x 8 x i16> [[OP_COERCE1:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } poison, <vscale x 8 x i16> [[OP_COERCE0]], 0
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], <vscale x 8 x i16> [[OP_COERCE1]], 1
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 0
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = bitcast <vscale x 8 x i16> [[TMP2]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP3]], 0
+// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 1
+// CHECK-CXX-NEXT: [[TMP6:%.*]] = bitcast <vscale x 8 x i16> [[TMP5]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT: [[TMP7:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP4]], <vscale x 16 x i8> [[TMP6]], 1
+// CHECK-CXX-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP7]]
+//
+svmfloat8x2_t test_svreinterpret_mf8_s16_x2(svint16x2_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_mf8, _s16_x2)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8> } @test_svreinterpret_mf8_u16_x2(
+// CHECK-SAME: <vscale x 8 x i16> [[OP_COERCE0:%.*]], <vscale x 8 x i16> [[OP_COERCE1:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } poison, <vscale x 8 x i16> [[OP_COERCE0]], 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], <vscale x 8 x i16> [[OP_COERCE1]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <vscale x 8 x i16> [[TMP2]] to <vscale x 16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP3]], 0
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <vscale x 8 x i16> [[TMP5]] to <vscale x 16 x i8>
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP4]], <vscale x 16 x i8> [[TMP6]], 1
+// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP7]]
+//
+// CHECK-CXX-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z29test_svreinterpret_mf8_u16_x212svuint16x2_t(
+// CHECK-CXX-SAME: <vscale x 8 x i16> [[OP_COERCE0:%.*]], <vscale x 8 x i16> [[OP_COERCE1:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } poison, <vscale x 8 x i16> [[OP_COERCE0]], 0
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], <vscale x 8 x i16> [[OP_COERCE1]], 1
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 0
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = bitcast <vscale x 8 x i16> [[TMP2]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP3]], 0
+// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 1
+// CHECK-CXX-NEXT: [[TMP6:%.*]] = bitcast <vscale x 8 x i16> [[TMP5]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT: [[TMP7:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP4]], <vscale x 16 x i8> [[TMP6]], 1
+// CHECK-CXX-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP7]]
+//
+svmfloat8x2_t test_svreinterpret_mf8_u16_x2(svuint16x2_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_mf8, _u16_x2)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8> } @test_svreinterpret_mf8_s32_x2(
+// CHECK-SAME: <vscale x 4 x i32> [[OP_COERCE0:%.*]], <vscale x 4 x i32> [[OP_COERCE1:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } poison, <vscale x 4 x i32> [[OP_COERCE0]], 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], <vscale x 4 x i32> [[OP_COERCE1]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <vscale x 4 x i32> [[TMP2]] to <vscale x 16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP3]], 0
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <vscale x 4 x i32> [[TMP5]] to <vscale x 16 x i8>
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP4]], <vscale x 16 x i8> [[TMP6]], 1
+// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP7]]
+//
+// CHECK-CXX-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z29test_svreinterpret_mf8_s32_x211svint32x2_t(
+// CHECK-CXX-SAME: <vscale x 4 x i32> [[OP_COERCE0:%.*]], <vscale x 4 x i32> [[OP_COERCE1:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } poison, <vscale x 4 x i32> [[OP_COERCE0]], 0
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], <vscale x 4 x i32> [[OP_COERCE1]], 1
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 0
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = bitcast <vscale x 4 x i32> [[TMP2]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP3]], 0
+// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 1
+// CHECK-CXX-NEXT: [[TMP6:%.*]] = bitcast <vscale x 4 x i32> [[TMP5]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT: [[TMP7:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP4]], <vscale x 16 x i8> [[TMP6]], 1
+// CHECK-CXX-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP7]]
+//
+svmfloat8x2_t test_svreinterpret_mf8_s32_x2(svint32x2_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_mf8, _s32_x2)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8> } @test_svreinterpret_mf8_u32_x2(
+// CHECK-SAME: <vscale x 4 x i32> [[OP_COERCE0:%.*]], <vscale x 4 x i32> [[OP_COERCE1:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } poison, <vscale x 4 x i32> [[OP_COERCE0]], 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], <vscale x 4 x i32> [[OP_COERCE1]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <vscale x 4 x i32> [[TMP2]] to <vscale x 16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP3]], 0
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <vscale x 4 x i32> [[TMP5]] to <vscale x 16 x i8>
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP4]], <vscale x 16 x i8> [[TMP6]], 1
+// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP7]]
+//
+// CHECK-CXX-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z29test_svreinterpret_mf8_u32_x212svuint32x2_t(
+// CHECK-CXX-SAME: <vscale x 4 x i32> [[OP_COERCE0:%.*]], <vscale x 4 x i32> [[OP_COERCE1:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } poison, <vscale x 4 x i32> [[OP_COERCE0]], 0
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], <vscale x 4 x i32> [[OP_COERCE1]], 1
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 0
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = bitcast <vscale x 4 x i32> [[TMP2]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP3]], 0
+// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 1
+// CHECK-CXX-NEXT: [[TMP6:%.*]] = bitcast <vscale x 4 x i32> [[TMP5]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT: [[TMP7:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP4]], <vscale x 16 x i8> [[TMP6]], 1
+// CHECK-CXX-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP7]]
+//
+svmfloat8x2_t test_svreinterpret_mf8_u32_x2(svuint32x2_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_mf8, _u32_x2)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8> } @test_svreinterpret_mf8_s64_x2(
+// CHECK-SAME: <vscale x 2 x i64> [[OP_COERCE0:%.*]], <vscale x 2 x i64> [[OP_COERCE1:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } poison, <vscale x 2 x i64> [[OP_COERCE0]], 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], <vscale x 2 x i64> [[OP_COERCE1]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <vscale x 2 x i64> [[TMP2]] to <vscale x 16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP3]], 0
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <vscale x 2 x i64> [[TMP5]] to <vscale x 16 x i8>
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP4]], <vscale x 16 x i8> [[TMP6]], 1
+// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP7]]
+//
+// CHECK-CXX-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z29test_svreinterpret_mf8_s64_x211svint64x2_t(
+// CHECK-CXX-SAME: <vscale x 2 x i64> [[OP_COERCE0:%.*]], <vscale x 2 x i64> [[OP_COERCE1:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } poison, <vscale x 2 x i64> [[OP_COERCE0]], 0
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], <vscale x 2 x i64> [[OP_COERCE1]], 1
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 0
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = bitcast <vscale x 2 x i64> [[TMP2]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP3]], 0
+// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 1
+// CHECK-CXX-NEXT: [[TMP6:%.*]] = bitcast <vscale x 2 x i64> [[TMP5]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT: [[TMP7:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP4]], <vscale x 16 x i8> [[TMP6]], 1
+// CHECK-CXX-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP7]]
+//
+svmfloat8x2_t test_svreinterpret_mf8_s64_x2(svint64x2_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_mf8, _s64_x2)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8> } @test_svreinterpret_mf8_u64_x2(
+// CHECK-SAME: <vscale x 2 x i64> [[OP_COERCE0:%.*]], <vscale x 2 x i64> [[OP_COERCE1:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } poison, <vscale x 2 x i64> [[OP_COERCE0]], 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], <vscale x 2 x i64> [[OP_COERCE1]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <vscale x 2 x i64> [[TMP2]] to <vscale x 16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP3]], 0
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <vscale x 2 x i64> [[TMP5]] to <vscale x 16 x i8>
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP4]], <vscale x 16 x i8> [[TMP6]], 1
+// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP7]]
+//
+// CHECK-CXX-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z29test_svreinterpret_mf8_u64_x212svuint64x2_t(
+// CHECK-CXX-SAME: <vscale x 2 x i64> [[OP_COERCE0:%.*]], <vscale x 2 x i64> [[OP_COERCE1:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } poison, <vscale x 2 x i64> [[OP_COERCE0]], 0
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], <vscale x 2 x i64> [[OP_COERCE1]], 1
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 0
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = bitcast <vscale x 2 x i64> [[TMP2]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP3]], 0
+// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 1
+// CHECK-CXX-NEXT: [[TMP6:%.*]] = bitcast <vscale x 2 x i64> [[TMP5]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT: [[TMP7:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP4]], <vscale x 16 x i8> [[TMP6]], 1
+// CHECK-CXX-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP7]]
+//
+svmfloat8x2_t test_svreinterpret_mf8_u64_x2(svuint64x2_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_mf8, _u64_x2)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8> } @test_svreinterpret_mf8_f16_x2(
+// CHECK-SAME: <vscale x 8 x half> [[OP_COERCE0:%.*]], <vscale x 8 x half> [[OP_COERCE1:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { <vscale x 8 x half>, <vscale x 8 x half> } poison, <vscale x 8 x half> [[OP_COERCE0]], 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], <vscale x 8 x half> [[OP_COERCE1]], 1
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <vscale x 8 x half> [[TMP2]] to <vscale x 16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP3]], 0
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP1]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <vscale x 8 x half> [[TMP5]] to <vscale x 16 x i8>
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP4]], <vscale x 16 x i8> [[TMP6]], 1
+// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP7]]
+//
+// CHECK-CXX-LABEL:
define dso_local { , } @_Z29test_svreinterpret_mf8_f16_x213svfloat16x2_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CHECK-CXX-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CHECK-CXX-NEXT: ret { , } [[TMP7]] +// +svmfloat8x2_t test_svreinterpret_mf8_f16_x2(svfloat16x2_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _f16_x2)(op); +} + +// CHECK-LABEL: define dso_local { , } @test_svreinterpret_mf8_bf16_x2( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CHECK-NEXT: ret { , } [[TMP7]] +// +// CHECK-CXX-LABEL: define dso_local { , } @_Z30test_svreinterpret_mf8_bf16_x214svbfloat16x2_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CHECK-CXX-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CHECK-CXX-NEXT: ret { , } [[TMP7]] +// +svmfloat8x2_t test_svreinterpret_mf8_bf16_x2(svbfloat16x2_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _bf16_x2)(op); +} + +// CHECK-LABEL: define dso_local { , } @test_svreinterpret_mf8_f32_x2( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CHECK-NEXT: ret { , } [[TMP7]] +// +// CHECK-CXX-LABEL: define dso_local { , } @_Z29test_svreinterpret_mf8_f32_x213svfloat32x2_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { 
, } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CHECK-CXX-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CHECK-CXX-NEXT: ret { , } [[TMP7]] +// +svmfloat8x2_t test_svreinterpret_mf8_f32_x2(svfloat32x2_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _f32_x2)(op); +} + +// CHECK-LABEL: define dso_local { , } @test_svreinterpret_mf8_f64_x2( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CHECK-NEXT: ret { , } [[TMP7]] +// +// CHECK-CXX-LABEL: define dso_local { , } @_Z29test_svreinterpret_mf8_f64_x213svfloat64x2_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CHECK-CXX-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CHECK-CXX-NEXT: ret { , } [[TMP7]] +// +svmfloat8x2_t test_svreinterpret_mf8_f64_x2(svfloat64x2_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _f64_x2)(op); +} + +// CHECK-LABEL: define dso_local { , } @test_svreinterpret_s16_mf8_x2( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CHECK-NEXT: ret { , } [[TMP7]] +// +// CHECK-CXX-LABEL: define dso_local { , } @_Z29test_svreinterpret_s16_mf8_x213svmfloat8x2_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// 
CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CHECK-CXX-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CHECK-CXX-NEXT: ret { , } [[TMP7]] +// +svint16x2_t test_svreinterpret_s16_mf8_x2(svmfloat8x2_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_s16, _mf8_x2)(op); +} + +// CHECK-LABEL: define dso_local { , } @test_svreinterpret_u16_mf8_x2( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CHECK-NEXT: ret { , } [[TMP7]] +// +// CHECK-CXX-LABEL: define dso_local { , } @_Z29test_svreinterpret_u16_mf8_x213svmfloat8x2_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CHECK-CXX-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CHECK-CXX-NEXT: ret { , } [[TMP7]] +// +svuint16x2_t test_svreinterpret_u16_mf8_x2(svmfloat8x2_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_u16, _mf8_x2)(op); +} + +// CHECK-LABEL: define dso_local { , } @test_svreinterpret_s32_mf8_x2( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CHECK-NEXT: ret { , } [[TMP7]] +// +// CHECK-CXX-LABEL: define dso_local { , } @_Z29test_svreinterpret_s32_mf8_x213svmfloat8x2_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CHECK-CXX-NEXT: [[TMP7:%.*]] = insertvalue { 
, } [[TMP4]], [[TMP6]], 1 +// CHECK-CXX-NEXT: ret { , } [[TMP7]] +// +svint32x2_t test_svreinterpret_s32_mf8_x2(svmfloat8x2_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_s32, _mf8_x2)(op); +} + +// CHECK-LABEL: define dso_local { , } @test_svreinterpret_u32_mf8_x2( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CHECK-NEXT: ret { , } [[TMP7]] +// +// CHECK-CXX-LABEL: define dso_local { , } @_Z29test_svreinterpret_u32_mf8_x213svmfloat8x2_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CHECK-CXX-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CHECK-CXX-NEXT: ret { , } [[TMP7]] +// +svuint32x2_t test_svreinterpret_u32_mf8_x2(svmfloat8x2_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_u32, _mf8_x2)(op); +} + +// CHECK-LABEL: define dso_local { , } @test_svreinterpret_s64_mf8_x2( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CHECK-NEXT: ret { , } [[TMP7]] +// +// CHECK-CXX-LABEL: define dso_local { , } @_Z29test_svreinterpret_s64_mf8_x213svmfloat8x2_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CHECK-CXX-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CHECK-CXX-NEXT: ret { , } [[TMP7]] +// +svint64x2_t test_svreinterpret_s64_mf8_x2(svmfloat8x2_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_s64, _mf8_x2)(op); +} + +// CHECK-LABEL: define dso_local { 
, } @test_svreinterpret_u64_mf8_x2( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CHECK-NEXT: ret { , } [[TMP7]] +// +// CHECK-CXX-LABEL: define dso_local { , } @_Z29test_svreinterpret_u64_mf8_x213svmfloat8x2_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CHECK-CXX-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CHECK-CXX-NEXT: ret { , } [[TMP7]] +// +svuint64x2_t test_svreinterpret_u64_mf8_x2(svmfloat8x2_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_u64, _mf8_x2)(op); +} + +// CHECK-LABEL: define dso_local { , } @test_svreinterpret_f16_mf8_x2( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CHECK-NEXT: ret { , } [[TMP7]] +// +// CHECK-CXX-LABEL: define dso_local { , } @_Z29test_svreinterpret_f16_mf8_x213svmfloat8x2_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CHECK-CXX-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CHECK-CXX-NEXT: ret { , } [[TMP7]] +// +svfloat16x2_t test_svreinterpret_f16_mf8_x2(svmfloat8x2_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_f16, _mf8_x2)(op); +} + +// CHECK-LABEL: define dso_local { , } @test_svreinterpret_bf16_mf8_x2( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: 
[[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CHECK-NEXT: ret { , } [[TMP7]] +// +// CHECK-CXX-LABEL: define dso_local { , } @_Z30test_svreinterpret_bf16_mf8_x213svmfloat8x2_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CHECK-CXX-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CHECK-CXX-NEXT: ret { , } [[TMP7]] +// +svbfloat16x2_t test_svreinterpret_bf16_mf8_x2(svmfloat8x2_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_bf16, _mf8_x2)(op); +} + +// CHECK-LABEL: define dso_local { , } @test_svreinterpret_f32_mf8_x2( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CHECK-NEXT: ret { , } [[TMP7]] +// +// CHECK-CXX-LABEL: define dso_local { , } @_Z29test_svreinterpret_f32_mf8_x213svmfloat8x2_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CHECK-CXX-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CHECK-CXX-NEXT: ret { , } [[TMP7]] +// +svfloat32x2_t test_svreinterpret_f32_mf8_x2(svmfloat8x2_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_f32, _mf8_x2)(op); +} + +// CHECK-LABEL: define dso_local { , } @test_svreinterpret_f64_mf8_x2( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, 
[[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CHECK-NEXT: ret { , } [[TMP7]] +// +// CHECK-CXX-LABEL: define dso_local { , } @_Z29test_svreinterpret_f64_mf8_x213svmfloat8x2_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = extractvalue { , } [[TMP1]], 0 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to +// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { , } poison, [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP1]], 1 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = bitcast [[TMP5]] to +// CHECK-CXX-NEXT: [[TMP7:%.*]] = insertvalue { , } [[TMP4]], [[TMP6]], 1 +// CHECK-CXX-NEXT: ret { , } [[TMP7]] +// +svfloat64x2_t test_svreinterpret_f64_mf8_x2(svmfloat8x2_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_f64, _mf8_x2)(op); +} + +// CHECK-LABEL: define dso_local { , , } @test_svreinterpret_s8_mf8_x3( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { , , } poison, [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , , } [[TMP4]], [[TMP5]], 1 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP6]], [[TMP7]], 2 +// CHECK-NEXT: ret { , , } [[TMP8]] +// +// CHECK-CXX-LABEL: define dso_local { , , } @_Z28test_svreinterpret_s8_mf8_x313svmfloat8x3_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { , , } poison, [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { , , } [[TMP4]], [[TMP5]], 1 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP6]], [[TMP7]], 2 +// CHECK-CXX-NEXT: ret { , , } [[TMP8]] +// +svint8x3_t test_svreinterpret_s8_mf8_x3(svmfloat8x3_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_s8, _mf8_x3)(op); +} + +// CHECK-LABEL: define dso_local { , , } @test_svreinterpret_u8_mf8_x3( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// 
CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { , , } poison, [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , , } [[TMP4]], [[TMP5]], 1 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP6]], [[TMP7]], 2 +// CHECK-NEXT: ret { , , } [[TMP8]] +// +// CHECK-CXX-LABEL: define dso_local { , , } @_Z28test_svreinterpret_u8_mf8_x313svmfloat8x3_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { , , } poison, [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { , , } [[TMP4]], [[TMP5]], 1 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP6]], [[TMP7]], 2 +// CHECK-CXX-NEXT: ret { , , } [[TMP8]] +// +svuint8x3_t test_svreinterpret_u8_mf8_x3(svmfloat8x3_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_u8, _mf8_x3)(op); +} + +// CHECK-LABEL: define dso_local { , , } @test_svreinterpret_mf8_s8_x3( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { , , } poison, [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , , } [[TMP4]], [[TMP5]], 1 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP6]], [[TMP7]], 2 +// CHECK-NEXT: ret { , , } [[TMP8]] +// +// CHECK-CXX-LABEL: define dso_local { , , } @_Z28test_svreinterpret_mf8_s8_x310svint8x3_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { , , } poison, [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { , , } [[TMP4]], [[TMP5]], 1 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP6]], [[TMP7]], 2 +// CHECK-CXX-NEXT: ret { , , } [[TMP8]] +// +svmfloat8x3_t test_svreinterpret_mf8_s8_x3(svint8x3_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _s8_x3)(op); +} + +// CHECK-LABEL: define dso_local { , , } @test_svreinterpret_mf8_u8_x3( 
+// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { , , } poison, [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , , } [[TMP4]], [[TMP5]], 1 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP6]], [[TMP7]], 2 +// CHECK-NEXT: ret { , , } [[TMP8]] +// +// CHECK-CXX-LABEL: define dso_local { , , } @_Z28test_svreinterpret_mf8_u8_x311svuint8x3_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { , , } poison, [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { , , } [[TMP4]], [[TMP5]], 1 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP6]], [[TMP7]], 2 +// CHECK-CXX-NEXT: ret { , , } [[TMP8]] +// +svmfloat8x3_t test_svreinterpret_mf8_u8_x3(svuint8x3_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _u8_x3)(op); +} + +// CHECK-LABEL: define dso_local { , , } @test_svreinterpret_mf8_mf8_x3( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { , , } poison, [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , , } [[TMP4]], [[TMP5]], 1 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP6]], [[TMP7]], 2 +// CHECK-NEXT: ret { , , } [[TMP8]] +// +// CHECK-CXX-LABEL: define dso_local { , , } @_Z29test_svreinterpret_mf8_mf8_x313svmfloat8x3_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = insertvalue { , , } poison, [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { , , } [[TMP4]], [[TMP5]], 1 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = 
extractvalue { , , } [[TMP2]], 2 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP6]], [[TMP7]], 2 +// CHECK-CXX-NEXT: ret { , , } [[TMP8]] +// +svmfloat8x3_t test_svreinterpret_mf8_mf8_x3(svmfloat8x3_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _mf8_x3)(op); +} + +// CHECK-LABEL: define dso_local { , , } @test_svreinterpret_mf8_s16_x3( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CHECK-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CHECK-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CHECK-NEXT: ret { , , } [[TMP11]] +// +// CHECK-CXX-LABEL: define dso_local { , , } @_Z29test_svreinterpret_mf8_s16_x311svint16x3_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CHECK-CXX-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CHECK-CXX-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CHECK-CXX-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CHECK-CXX-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CHECK-CXX-NEXT: ret { , , } [[TMP11]] +// +svmfloat8x3_t test_svreinterpret_mf8_s16_x3(svint16x3_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _s16_x3)(op); +} + +// CHECK-LABEL: define dso_local { , , } @test_svreinterpret_mf8_u16_x3( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CHECK-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CHECK-NEXT: [[TMP11:%.*]] = insertvalue { , , } 
[[TMP8]], [[TMP10]], 2 +// CHECK-NEXT: ret { , , } [[TMP11]] +// +// CHECK-CXX-LABEL: define dso_local { , , } @_Z29test_svreinterpret_mf8_u16_x312svuint16x3_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CHECK-CXX-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CHECK-CXX-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CHECK-CXX-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CHECK-CXX-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CHECK-CXX-NEXT: ret { , , } [[TMP11]] +// +svmfloat8x3_t test_svreinterpret_mf8_u16_x3(svuint16x3_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _u16_x3)(op); +} + +// CHECK-LABEL: define dso_local { , , } @test_svreinterpret_mf8_s32_x3( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CHECK-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CHECK-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CHECK-NEXT: ret { , , } [[TMP11]] +// +// CHECK-CXX-LABEL: define dso_local { , , } @_Z29test_svreinterpret_mf8_s32_x311svint32x3_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CHECK-CXX-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CHECK-CXX-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CHECK-CXX-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CHECK-CXX-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CHECK-CXX-NEXT: ret { , , } [[TMP11]] +// +svmfloat8x3_t test_svreinterpret_mf8_s32_x3(svint32x3_t op) STREAMING { + return 
SVE_ACLE_FUNC(svreinterpret_mf8, _s32_x3)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @test_svreinterpret_mf8_u32_x3(
+// CHECK-SAME: <vscale x 4 x i32> [[OP_COERCE0:%.*]], <vscale x 4 x i32> [[OP_COERCE1:%.*]], <vscale x 4 x i32> [[OP_COERCE2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } poison, <vscale x 4 x i32> [[OP_COERCE0]], 0
+// CHECK-NEXT:    [[TMP1:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], <vscale x 4 x i32> [[OP_COERCE1]], 1
+// CHECK-NEXT:    [[TMP2:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], <vscale x 4 x i32> [[OP_COERCE2]], 2
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 0
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <vscale x 4 x i32> [[TMP3]] to <vscale x 16 x i8>
+// CHECK-NEXT:    [[TMP5:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP4]], 0
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <vscale x 4 x i32> [[TMP6]] to <vscale x 16 x i8>
+// CHECK-NEXT:    [[TMP8:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP5]], <vscale x 16 x i8> [[TMP7]], 1
+// CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 2
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <vscale x 4 x i32> [[TMP9]] to <vscale x 16 x i8>
+// CHECK-NEXT:    [[TMP11:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP8]], <vscale x 16 x i8> [[TMP10]], 2
+// CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP11]]
+//
+// CHECK-CXX-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z29test_svreinterpret_mf8_u32_x312svuint32x3_t(
+// CHECK-CXX-SAME: <vscale x 4 x i32> [[OP_COERCE0:%.*]], <vscale x 4 x i32> [[OP_COERCE1:%.*]], <vscale x 4 x i32> [[OP_COERCE2:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } poison, <vscale x 4 x i32> [[OP_COERCE0]], 0
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], <vscale x 4 x i32> [[OP_COERCE1]], 1
+// CHECK-CXX-NEXT:    [[TMP2:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], <vscale x 4 x i32> [[OP_COERCE2]], 2
+// CHECK-CXX-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 0
+// CHECK-CXX-NEXT:    [[TMP4:%.*]] = bitcast <vscale x 4 x i32> [[TMP3]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT:    [[TMP5:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP4]], 0
+// CHECK-CXX-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 1
+// CHECK-CXX-NEXT:    [[TMP7:%.*]] = bitcast <vscale x 4 x i32> [[TMP6]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT:    [[TMP8:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP5]], <vscale x 16 x i8> [[TMP7]], 1
+// CHECK-CXX-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 2
+// CHECK-CXX-NEXT:    [[TMP10:%.*]] = bitcast <vscale x 4 x i32> [[TMP9]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT:    [[TMP11:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP8]], <vscale x 16 x i8> [[TMP10]], 2
+// CHECK-CXX-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP11]]
+//
+svmfloat8x3_t test_svreinterpret_mf8_u32_x3(svuint32x3_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_mf8, _u32_x3)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @test_svreinterpret_mf8_s64_x3(
+// CHECK-SAME: <vscale x 2 x i64> [[OP_COERCE0:%.*]], <vscale x 2 x i64> [[OP_COERCE1:%.*]], <vscale x 2 x i64> [[OP_COERCE2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = insertvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } poison, <vscale x 2 x i64> [[OP_COERCE0]], 0
+// CHECK-NEXT:    [[TMP1:%.*]] = insertvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], <vscale x 2 x i64> [[OP_COERCE1]], 1
+// CHECK-NEXT:    [[TMP2:%.*]] = insertvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], <vscale x 2 x i64> [[OP_COERCE2]], 2
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 0
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <vscale x 2 x i64> [[TMP3]] to <vscale x 16 x i8>
+// CHECK-NEXT:    [[TMP5:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP4]], 0
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <vscale x 2 x i64> [[TMP6]] to <vscale x 16 x i8>
+// CHECK-NEXT:    [[TMP8:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP5]], <vscale x 16 x i8> [[TMP7]], 1
+// CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 2
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <vscale x 2 x i64> [[TMP9]] to <vscale x 16 x i8>
+// CHECK-NEXT:    [[TMP11:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP8]], <vscale x 16 x i8> [[TMP10]], 2
+// CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP11]]
+//
+// CHECK-CXX-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z29test_svreinterpret_mf8_s64_x311svint64x3_t(
+// CHECK-CXX-SAME: <vscale x 2 x i64> [[OP_COERCE0:%.*]], <vscale x 2 x i64> [[OP_COERCE1:%.*]], <vscale x 2 x i64> [[OP_COERCE2:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = insertvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } poison, <vscale x 2 x i64> [[OP_COERCE0]], 0
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = insertvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], <vscale x 2 x i64> [[OP_COERCE1]], 1
+// CHECK-CXX-NEXT:    [[TMP2:%.*]] = insertvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], <vscale x 2 x i64> [[OP_COERCE2]], 2
+// CHECK-CXX-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 0
+// CHECK-CXX-NEXT:    [[TMP4:%.*]] = bitcast <vscale x 2 x i64> [[TMP3]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT:    [[TMP5:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP4]], 0
+// CHECK-CXX-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 1
+// CHECK-CXX-NEXT:    [[TMP7:%.*]] = bitcast <vscale x 2 x i64> [[TMP6]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT:    [[TMP8:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP5]], <vscale x 16 x i8> [[TMP7]], 1
+// CHECK-CXX-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 2
+// CHECK-CXX-NEXT:    [[TMP10:%.*]] = bitcast <vscale x 2 x i64> [[TMP9]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT:    [[TMP11:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP8]], <vscale x 16 x i8> [[TMP10]], 2
+// CHECK-CXX-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP11]]
+//
+svmfloat8x3_t test_svreinterpret_mf8_s64_x3(svint64x3_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_mf8, _s64_x3)(op);
+}
+
+//
+// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @test_svreinterpret_mf8_u64_x3(
+// CHECK-SAME: <vscale x 2 x i64> [[OP_COERCE0:%.*]], <vscale x 2 x i64> [[OP_COERCE1:%.*]], <vscale x 2 x i64> [[OP_COERCE2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = insertvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } poison, <vscale x 2 x i64> [[OP_COERCE0]], 0
+// CHECK-NEXT:    [[TMP1:%.*]] = insertvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], <vscale x 2 x i64> [[OP_COERCE1]], 1
+// CHECK-NEXT:    [[TMP2:%.*]] = insertvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], <vscale x 2 x i64> [[OP_COERCE2]], 2
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 0
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <vscale x 2 x i64> [[TMP3]] to <vscale x 16 x i8>
+// CHECK-NEXT:    [[TMP5:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP4]], 0
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <vscale x 2 x i64> [[TMP6]] to <vscale x 16 x i8>
+// CHECK-NEXT:    [[TMP8:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP5]], <vscale x 16 x i8> [[TMP7]], 1
+// CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 2
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <vscale x 2 x i64> [[TMP9]] to <vscale x 16 x i8>
+// CHECK-NEXT:    [[TMP11:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP8]], <vscale x 16 x i8> [[TMP10]], 2
+// CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP11]]
+//
+// CHECK-CXX-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z29test_svreinterpret_mf8_u64_x312svuint64x3_t(
+// CHECK-CXX-SAME: <vscale x 2 x i64> [[OP_COERCE0:%.*]], <vscale x 2 x i64> [[OP_COERCE1:%.*]], <vscale x 2 x i64> [[OP_COERCE2:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = insertvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } poison, <vscale x 2 x i64> [[OP_COERCE0]], 0
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = insertvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], <vscale x 2 x i64> [[OP_COERCE1]], 1
+// CHECK-CXX-NEXT:    [[TMP2:%.*]] = insertvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], <vscale x 2 x i64> [[OP_COERCE2]], 2
+// CHECK-CXX-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 0
+// CHECK-CXX-NEXT:    [[TMP4:%.*]] = bitcast <vscale x 2 x i64> [[TMP3]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT:    [[TMP5:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP4]], 0
+// CHECK-CXX-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 1
+// CHECK-CXX-NEXT:    [[TMP7:%.*]] = bitcast <vscale x 2 x i64> [[TMP6]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT:    [[TMP8:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP5]], <vscale x 16 x i8> [[TMP7]], 1
+// CHECK-CXX-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 2
+// CHECK-CXX-NEXT:    [[TMP10:%.*]] = bitcast <vscale x 2 x i64> [[TMP9]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT:    [[TMP11:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP8]], <vscale x 16 x i8> [[TMP10]], 2
+// CHECK-CXX-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP11]]
+//
+svmfloat8x3_t test_svreinterpret_mf8_u64_x3(svuint64x3_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_mf8, _u64_x3)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @test_svreinterpret_mf8_f16_x3(
+// CHECK-SAME: <vscale x 8 x half> [[OP_COERCE0:%.*]], <vscale x 8 x half> [[OP_COERCE1:%.*]], <vscale x 8 x half> [[OP_COERCE2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = insertvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } poison, <vscale x 8 x half> [[OP_COERCE0]], 0
+// CHECK-NEXT:    [[TMP1:%.*]] = insertvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], <vscale x 8 x half> [[OP_COERCE1]], 1
+// CHECK-NEXT:    [[TMP2:%.*]] = insertvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP1]], <vscale x 8 x half> [[OP_COERCE2]], 2
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP2]], 0
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <vscale x 8 x half> [[TMP3]] to <vscale x 16 x i8>
+// CHECK-NEXT:    [[TMP5:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP4]], 0
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP2]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <vscale x 8 x half> [[TMP6]] to <vscale x 16 x i8>
+// CHECK-NEXT:    [[TMP8:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP5]], <vscale x 16 x i8> [[TMP7]], 1
+// CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP2]], 2
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <vscale x 8 x half> [[TMP9]] to <vscale x 16 x i8>
+// CHECK-NEXT:    [[TMP11:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP8]], <vscale x 16 x i8> [[TMP10]], 2
+// CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP11]]
+//
+// CHECK-CXX-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z29test_svreinterpret_mf8_f16_x313svfloat16x3_t(
+// CHECK-CXX-SAME: <vscale x 8 x half> [[OP_COERCE0:%.*]], <vscale x 8 x half> [[OP_COERCE1:%.*]], <vscale x 8 x half> [[OP_COERCE2:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = insertvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } poison, <vscale x 8 x half> [[OP_COERCE0]], 0
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = insertvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], <vscale x 8 x half> [[OP_COERCE1]], 1
+// CHECK-CXX-NEXT:    [[TMP2:%.*]] = insertvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP1]], <vscale x 8 x half> [[OP_COERCE2]], 2
+// CHECK-CXX-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP2]], 0
+// CHECK-CXX-NEXT:    [[TMP4:%.*]] = bitcast <vscale x 8 x half> [[TMP3]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT:    [[TMP5:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP4]], 0
+// CHECK-CXX-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP2]], 1
+// CHECK-CXX-NEXT:    [[TMP7:%.*]] = bitcast <vscale x 8 x half> [[TMP6]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT:    [[TMP8:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP5]], <vscale x 16 x i8> [[TMP7]], 1
+// CHECK-CXX-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP2]], 2
+// CHECK-CXX-NEXT:    [[TMP10:%.*]] = bitcast <vscale x 8 x half> [[TMP9]] to <vscale x 16 x i8>
+// CHECK-CXX-NEXT:    [[TMP11:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP8]], <vscale x 16 x i8> [[TMP10]], 2
+// CHECK-CXX-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP11]]
+//
+svmfloat8x3_t test_svreinterpret_mf8_f16_x3(svfloat16x3_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_mf8, _f16_x3)(op);
+}
+
+// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @test_svreinterpret_mf8_bf16_x3(
+// CHECK-SAME: <vscale x 8 x bfloat> [[OP_COERCE0:%.*]], <vscale x 8 x bfloat> [[OP_COERCE1:%.*]], <vscale x 8 x bfloat> [[OP_COERCE2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = insertvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } poison, <vscale x 8 x bfloat> [[OP_COERCE0]], 0
+// CHECK-NEXT:    [[TMP1:%.*]] = insertvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], <vscale x 8 x bfloat> [[OP_COERCE1]], 1
+// CHECK-NEXT:    [[TMP2:%.*]] = insertvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP1]], <vscale x 8 x bfloat> [[OP_COERCE2]], 2
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP2]], 0
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <vscale x 8 x bfloat> [[TMP3]] to <vscale x 16 x i8>
+// CHECK-NEXT:    [[TMP5:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> [[TMP4]], 0
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP2]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <vscale x 8 x bfloat> [[TMP6]] to <vscale x 16 x i8>
+// CHECK-NEXT:    [[TMP8:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP5]], <vscale x 16 x i8> [[TMP7]], 1
+// CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP2]], 2
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <vscale x 8 x bfloat> [[TMP9]] to <vscale x 16 x i8>
+// CHECK-NEXT:    [[TMP11:%.*]] = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP8]], <vscale x 16 x i8> [[TMP10]], 2
+// CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP11]]
+//
+// CHECK-CXX-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z30test_svreinterpret_mf8_bf16_x314svbfloat16x3_t(
+// CHECK-CXX-SAME: <vscale x 8 x bfloat> [[OP_COERCE0:%.*]], <vscale x 8 x bfloat> [[OP_COERCE1:%.*]], <vscale x 8 x bfloat> [[OP_COERCE2:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = insertvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } poison, <vscale x 8 x bfloat> [[OP_COERCE0]], 0
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = insertvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], <vscale x 8 x bfloat> [[OP_COERCE1]], 1
+//
CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CHECK-CXX-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CHECK-CXX-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CHECK-CXX-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CHECK-CXX-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CHECK-CXX-NEXT: ret { , , } [[TMP11]] +// +svmfloat8x3_t test_svreinterpret_mf8_bf16_x3(svbfloat16x3_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _bf16_x3)(op); +} + +// CHECK-LABEL: define dso_local { , , } @test_svreinterpret_mf8_f32_x3( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CHECK-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CHECK-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CHECK-NEXT: ret { , , } [[TMP11]] +// +// CHECK-CXX-LABEL: define dso_local { , , } @_Z29test_svreinterpret_mf8_f32_x313svfloat32x3_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CHECK-CXX-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CHECK-CXX-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CHECK-CXX-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CHECK-CXX-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CHECK-CXX-NEXT: ret { , , } [[TMP11]] +// +svmfloat8x3_t test_svreinterpret_mf8_f32_x3(svfloat32x3_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _f32_x3)(op); +} + +// CHECK-LABEL: define dso_local { , , } @test_svreinterpret_mf8_f64_x3( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } 
[[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CHECK-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CHECK-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CHECK-NEXT: ret { , , } [[TMP11]] +// +// CHECK-CXX-LABEL: define dso_local { , , } @_Z29test_svreinterpret_mf8_f64_x313svfloat64x3_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CHECK-CXX-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CHECK-CXX-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CHECK-CXX-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CHECK-CXX-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CHECK-CXX-NEXT: ret { , , } [[TMP11]] +// +svmfloat8x3_t test_svreinterpret_mf8_f64_x3(svfloat64x3_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _f64_x3)(op); +} + +// CHECK-LABEL: define dso_local { , , } @test_svreinterpret_s16_mf8_x3( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CHECK-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CHECK-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CHECK-NEXT: ret { , , } [[TMP11]] +// +// CHECK-CXX-LABEL: define dso_local { , , } @_Z29test_svreinterpret_s16_mf8_x313svmfloat8x3_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CHECK-CXX-NEXT: [[TMP5:%.*]] = 
insertvalue { , , } poison, [[TMP4]], 0 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CHECK-CXX-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CHECK-CXX-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CHECK-CXX-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CHECK-CXX-NEXT: ret { , , } [[TMP11]] +// +svint16x3_t test_svreinterpret_s16_mf8_x3(svmfloat8x3_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_s16, _mf8_x3)(op); +} + +// CHECK-LABEL: define dso_local { , , } @test_svreinterpret_u16_mf8_x3( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CHECK-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CHECK-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CHECK-NEXT: ret { , , } [[TMP11]] +// +// CHECK-CXX-LABEL: define dso_local { , , } @_Z29test_svreinterpret_u16_mf8_x313svmfloat8x3_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CHECK-CXX-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CHECK-CXX-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CHECK-CXX-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CHECK-CXX-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CHECK-CXX-NEXT: ret { , , } [[TMP11]] +// +svuint16x3_t test_svreinterpret_u16_mf8_x3(svmfloat8x3_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_u16, _mf8_x3)(op); +} + +// CHECK-LABEL: define dso_local { , , } @test_svreinterpret_s32_mf8_x3( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] 
= extractvalue { , , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CHECK-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CHECK-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CHECK-NEXT: ret { , , } [[TMP11]] +// +// CHECK-CXX-LABEL: define dso_local { , , } @_Z29test_svreinterpret_s32_mf8_x313svmfloat8x3_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CHECK-CXX-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CHECK-CXX-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CHECK-CXX-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CHECK-CXX-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CHECK-CXX-NEXT: ret { , , } [[TMP11]] +// +svint32x3_t test_svreinterpret_s32_mf8_x3(svmfloat8x3_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_s32, _mf8_x3)(op); +} + +// CHECK-LABEL: define dso_local { , , } @test_svreinterpret_u32_mf8_x3( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CHECK-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CHECK-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CHECK-NEXT: ret { , , } [[TMP11]] +// +// CHECK-CXX-LABEL: define dso_local { , , } @_Z29test_svreinterpret_u32_mf8_x313svmfloat8x3_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CHECK-CXX-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CHECK-CXX-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// 
CHECK-CXX-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CHECK-CXX-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CHECK-CXX-NEXT: ret { , , } [[TMP11]] +// +svuint32x3_t test_svreinterpret_u32_mf8_x3(svmfloat8x3_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_u32, _mf8_x3)(op); +} + +// CHECK-LABEL: define dso_local { , , } @test_svreinterpret_s64_mf8_x3( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CHECK-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CHECK-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CHECK-NEXT: ret { , , } [[TMP11]] +// +// CHECK-CXX-LABEL: define dso_local { , , } @_Z29test_svreinterpret_s64_mf8_x313svmfloat8x3_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CHECK-CXX-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CHECK-CXX-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CHECK-CXX-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CHECK-CXX-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CHECK-CXX-NEXT: ret { , , } [[TMP11]] +// +svint64x3_t test_svreinterpret_s64_mf8_x3(svmfloat8x3_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_s64, _mf8_x3)(op); +} + +// CHECK-LABEL: define dso_local { , , } @test_svreinterpret_u64_mf8_x3( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CHECK-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-NEXT: 
[[TMP10:%.*]] = bitcast [[TMP9]] to +// CHECK-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CHECK-NEXT: ret { , , } [[TMP11]] +// +// CHECK-CXX-LABEL: define dso_local { , , } @_Z29test_svreinterpret_u64_mf8_x313svmfloat8x3_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CHECK-CXX-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CHECK-CXX-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CHECK-CXX-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CHECK-CXX-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CHECK-CXX-NEXT: ret { , , } [[TMP11]] +// +svuint64x3_t test_svreinterpret_u64_mf8_x3(svmfloat8x3_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_u64, _mf8_x3)(op); +} + +// CHECK-LABEL: define dso_local { , , } @test_svreinterpret_f16_mf8_x3( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CHECK-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CHECK-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CHECK-NEXT: ret { , , } [[TMP11]] +// +// CHECK-CXX-LABEL: define dso_local { , , } @_Z29test_svreinterpret_f16_mf8_x313svmfloat8x3_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CHECK-CXX-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CHECK-CXX-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CHECK-CXX-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CHECK-CXX-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CHECK-CXX-NEXT: ret { , , } [[TMP11]] 
+// +svfloat16x3_t test_svreinterpret_f16_mf8_x3(svmfloat8x3_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_f16, _mf8_x3)(op); +} + +// CHECK-LABEL: define dso_local { , , } @test_svreinterpret_bf16_mf8_x3( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CHECK-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CHECK-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CHECK-NEXT: ret { , , } [[TMP11]] +// +// CHECK-CXX-LABEL: define dso_local { , , } @_Z30test_svreinterpret_bf16_mf8_x313svmfloat8x3_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CHECK-CXX-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CHECK-CXX-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CHECK-CXX-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CHECK-CXX-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CHECK-CXX-NEXT: ret { , , } [[TMP11]] +// +svbfloat16x3_t test_svreinterpret_bf16_mf8_x3(svmfloat8x3_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_bf16, _mf8_x3)(op); +} + +// CHECK-LABEL: define dso_local { , , } @test_svreinterpret_f32_mf8_x3( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CHECK-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CHECK-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CHECK-NEXT: ret { , , } [[TMP11]] +// +// CHECK-CXX-LABEL: define dso_local { , , } 
@_Z29test_svreinterpret_f32_mf8_x313svmfloat8x3_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CHECK-CXX-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CHECK-CXX-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CHECK-CXX-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CHECK-CXX-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CHECK-CXX-NEXT: ret { , , } [[TMP11]] +// +svfloat32x3_t test_svreinterpret_f32_mf8_x3(svmfloat8x3_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_f32, _mf8_x3)(op); +} + +// CHECK-LABEL: define dso_local { , , } @test_svreinterpret_f64_mf8_x3( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CHECK-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CHECK-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CHECK-NEXT: ret { , , } [[TMP11]] +// +// CHECK-CXX-LABEL: define dso_local { , , } @_Z29test_svreinterpret_f64_mf8_x313svmfloat8x3_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[TMP2]], 0 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = bitcast [[TMP3]] to +// CHECK-CXX-NEXT: [[TMP5:%.*]] = insertvalue { , , } poison, [[TMP4]], 0 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = extractvalue { , , } [[TMP2]], 1 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = bitcast [[TMP6]] to +// CHECK-CXX-NEXT: [[TMP8:%.*]] = insertvalue { , , } [[TMP5]], [[TMP7]], 1 +// CHECK-CXX-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[TMP2]], 2 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = bitcast [[TMP9]] to +// CHECK-CXX-NEXT: [[TMP11:%.*]] = insertvalue { , , } [[TMP8]], [[TMP10]], 2 +// CHECK-CXX-NEXT: ret { , , } [[TMP11]] +// +svfloat64x3_t test_svreinterpret_f64_mf8_x3(svmfloat8x3_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_f64, _mf8_x3)(op); +} + +// CHECK-LABEL: define dso_local { , , , } 
@test_svreinterpret_s8_mf8_x4( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue { , , , } poison, [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { , , , } [[TMP5]], [[TMP6]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP7]], [[TMP8]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-NEXT: [[TMP11:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP10]], 3 +// CHECK-NEXT: ret { , , , } [[TMP11]] +// +// CHECK-CXX-LABEL: define dso_local { , , , } @_Z28test_svreinterpret_s8_mf8_x413svmfloat8x4_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = insertvalue { , , , } poison, [[TMP4]], 0 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = insertvalue { , , , } [[TMP5]], [[TMP6]], 1 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-CXX-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP7]], [[TMP8]], 2 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-CXX-NEXT: [[TMP11:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP10]], 3 +// CHECK-CXX-NEXT: ret { , , , } [[TMP11]] +// +svint8x4_t test_svreinterpret_s8_mf8_x4(svmfloat8x4_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_s8, _mf8_x4)(op); +} + +// CHECK-LABEL: define dso_local { , , , } @test_svreinterpret_u8_mf8_x4( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue { , , , } poison, [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { , , , } [[TMP5]], [[TMP6]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP7]], [[TMP8]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-NEXT: [[TMP11:%.*]] = insertvalue { , , , } [[TMP9]], 
[[TMP10]], 3 +// CHECK-NEXT: ret { , , , } [[TMP11]] +// +// CHECK-CXX-LABEL: define dso_local { , , , } @_Z28test_svreinterpret_u8_mf8_x413svmfloat8x4_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = insertvalue { , , , } poison, [[TMP4]], 0 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = insertvalue { , , , } [[TMP5]], [[TMP6]], 1 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-CXX-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP7]], [[TMP8]], 2 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-CXX-NEXT: [[TMP11:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP10]], 3 +// CHECK-CXX-NEXT: ret { , , , } [[TMP11]] +// +svuint8x4_t test_svreinterpret_u8_mf8_x4(svmfloat8x4_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_u8, _mf8_x4)(op); +} + +// CHECK-LABEL: define dso_local { , , , } @test_svreinterpret_mf8_s8_x4( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue { , , , } poison, [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { , , , } [[TMP5]], [[TMP6]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP7]], [[TMP8]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-NEXT: [[TMP11:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP10]], 3 +// CHECK-NEXT: ret { , , , } [[TMP11]] +// +// CHECK-CXX-LABEL: define dso_local { , , , } @_Z28test_svreinterpret_mf8_s8_x410svint8x4_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = insertvalue { , , , } poison, [[TMP4]], 0 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = insertvalue { , , , } [[TMP5]], [[TMP6]], 1 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-CXX-NEXT: [[TMP9:%.*]] = insertvalue 
{ , , , } [[TMP7]], [[TMP8]], 2 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-CXX-NEXT: [[TMP11:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP10]], 3 +// CHECK-CXX-NEXT: ret { , , , } [[TMP11]] +// +svmfloat8x4_t test_svreinterpret_mf8_s8_x4(svint8x4_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _s8_x4)(op); +} + +// CHECK-LABEL: define dso_local { , , , } @test_svreinterpret_mf8_u8_x4( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue { , , , } poison, [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { , , , } [[TMP5]], [[TMP6]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP7]], [[TMP8]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-NEXT: [[TMP11:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP10]], 3 +// CHECK-NEXT: ret { , , , } [[TMP11]] +// +// CHECK-CXX-LABEL: define dso_local { , , , } @_Z28test_svreinterpret_mf8_u8_x411svuint8x4_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = insertvalue { , , , } poison, [[TMP4]], 0 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = insertvalue { , , , } [[TMP5]], [[TMP6]], 1 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-CXX-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP7]], [[TMP8]], 2 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-CXX-NEXT: [[TMP11:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP10]], 3 +// CHECK-CXX-NEXT: ret { , , , } [[TMP11]] +// +svmfloat8x4_t test_svreinterpret_mf8_u8_x4(svuint8x4_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _u8_x4)(op); +} + +// CHECK-LABEL: define dso_local { , , , } @test_svreinterpret_mf8_mf8_x4( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue { , , , } poison, [[TMP4]], 0 +// 
CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { , , , } [[TMP5]], [[TMP6]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP7]], [[TMP8]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-NEXT: [[TMP11:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP10]], 3 +// CHECK-NEXT: ret { , , , } [[TMP11]] +// +// CHECK-CXX-LABEL: define dso_local { , , , } @_Z29test_svreinterpret_mf8_mf8_x413svmfloat8x4_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = insertvalue { , , , } poison, [[TMP4]], 0 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = insertvalue { , , , } [[TMP5]], [[TMP6]], 1 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-CXX-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP7]], [[TMP8]], 2 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-CXX-NEXT: [[TMP11:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP10]], 3 +// CHECK-CXX-NEXT: ret { , , , } [[TMP11]] +// +svmfloat8x4_t test_svreinterpret_mf8_mf8_x4(svmfloat8x4_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _mf8_x4)(op); +} + +// CHECK-LABEL: define dso_local { , , , } @test_svreinterpret_mf8_s16_x4( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-NEXT: ret { , , , } [[TMP15]] +// +// CHECK-CXX-LABEL: define dso_local { , , , } @_Z29test_svreinterpret_mf8_s16_x411svint16x4_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = 
insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-CXX-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-CXX-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-CXX-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-CXX-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-CXX-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-CXX-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-CXX-NEXT: ret { , , , } [[TMP15]] +// +svmfloat8x4_t test_svreinterpret_mf8_s16_x4(svint16x4_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _s16_x4)(op); +} + +// CHECK-LABEL: define dso_local { , , , } @test_svreinterpret_mf8_u16_x4( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-NEXT: ret { , , , } [[TMP15]] +// +// CHECK-CXX-LABEL: define dso_local { , , , } @_Z29test_svreinterpret_mf8_u16_x412svuint16x4_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-CXX-NEXT: [[TMP9:%.*]] = insertvalue { , , , } 
[[TMP6]], [[TMP8]], 1 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-CXX-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-CXX-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-CXX-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-CXX-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-CXX-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-CXX-NEXT: ret { , , , } [[TMP15]] +// +svmfloat8x4_t test_svreinterpret_mf8_u16_x4(svuint16x4_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _u16_x4)(op); +} + +// CHECK-LABEL: define dso_local { , , , } @test_svreinterpret_mf8_s32_x4( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-NEXT: ret { , , , } [[TMP15]] +// +// CHECK-CXX-LABEL: define dso_local { , , , } @_Z29test_svreinterpret_mf8_s32_x411svint32x4_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-CXX-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-CXX-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-CXX-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-CXX-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-CXX-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-CXX-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-CXX-NEXT: ret { , , , } [[TMP15]] +// +svmfloat8x4_t test_svreinterpret_mf8_s32_x4(svint32x4_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, 
_s32_x4)(op); +} + +// CHECK-LABEL: define dso_local { , , , } @test_svreinterpret_mf8_u32_x4( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-NEXT: ret { , , , } [[TMP15]] +// +// CHECK-CXX-LABEL: define dso_local { , , , } @_Z29test_svreinterpret_mf8_u32_x412svuint32x4_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-CXX-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-CXX-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-CXX-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-CXX-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-CXX-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-CXX-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-CXX-NEXT: ret { , , , } [[TMP15]] +// +svmfloat8x4_t test_svreinterpret_mf8_u32_x4(svuint32x4_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _u32_x4)(op); +} + +// CHECK-LABEL: define dso_local { , , , } @test_svreinterpret_mf8_s64_x4( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , 
, } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-NEXT: ret { , , , } [[TMP15]] +// +// CHECK-CXX-LABEL: define dso_local { , , , } @_Z29test_svreinterpret_mf8_s64_x411svint64x4_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-CXX-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-CXX-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-CXX-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-CXX-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-CXX-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-CXX-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-CXX-NEXT: ret { , , , } [[TMP15]] +// +svmfloat8x4_t test_svreinterpret_mf8_s64_x4(svint64x4_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _s64_x4)(op); +} + +// CHECK-LABEL: define dso_local { , , , } @test_svreinterpret_mf8_u64_x4( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// 
CHECK-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-NEXT: ret { , , , } [[TMP15]] +// +// CHECK-CXX-LABEL: define dso_local { , , , } @_Z29test_svreinterpret_mf8_u64_x412svuint64x4_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-CXX-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-CXX-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-CXX-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-CXX-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-CXX-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-CXX-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-CXX-NEXT: ret { , , , } [[TMP15]] +// +svmfloat8x4_t test_svreinterpret_mf8_u64_x4(svuint64x4_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _u64_x4)(op); +} + +// +// CHECK-LABEL: define dso_local { , , , } @test_svreinterpret_mf8_f16_x4( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-NEXT: ret { , , , } [[TMP15]] +// +// CHECK-CXX-LABEL: define dso_local { , , , } @_Z29test_svreinterpret_mf8_f16_x413svfloat16x4_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// 
CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-CXX-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-CXX-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-CXX-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-CXX-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-CXX-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-CXX-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-CXX-NEXT: ret { , , , } [[TMP15]] +// +svmfloat8x4_t test_svreinterpret_mf8_f16_x4(svfloat16x4_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _f16_x4)(op); +} + +// CHECK-LABEL: define dso_local { , , , } @test_svreinterpret_mf8_bf16_x4( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-NEXT: ret { , , , } [[TMP15]] +// +// CHECK-CXX-LABEL: define dso_local { , , , } @_Z30test_svreinterpret_mf8_bf16_x414svbfloat16x4_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-CXX-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-CXX-NEXT: 
[[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-CXX-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-CXX-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-CXX-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-CXX-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-CXX-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-CXX-NEXT: ret { , , , } [[TMP15]] +// +svmfloat8x4_t test_svreinterpret_mf8_bf16_x4(svbfloat16x4_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _bf16_x4)(op); +} + +// CHECK-LABEL: define dso_local { , , , } @test_svreinterpret_mf8_f32_x4( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-NEXT: ret { , , , } [[TMP15]] +// +// CHECK-CXX-LABEL: define dso_local { , , , } @_Z29test_svreinterpret_mf8_f32_x413svfloat32x4_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-CXX-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-CXX-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-CXX-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-CXX-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-CXX-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-CXX-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-CXX-NEXT: ret { , , , } [[TMP15]] +// +svmfloat8x4_t test_svreinterpret_mf8_f32_x4(svfloat32x4_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _f32_x4)(op); +} + +// CHECK-LABEL: 
define dso_local { , , , } @test_svreinterpret_mf8_f64_x4( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-NEXT: ret { , , , } [[TMP15]] +// +// CHECK-CXX-LABEL: define dso_local { , , , } @_Z29test_svreinterpret_mf8_f64_x413svfloat64x4_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-CXX-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-CXX-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-CXX-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-CXX-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-CXX-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-CXX-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-CXX-NEXT: ret { , , , } [[TMP15]] +// +svmfloat8x4_t test_svreinterpret_mf8_f64_x4(svfloat64x4_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_mf8, _f64_x4)(op); +} + +// CHECK-LABEL: define dso_local { , , , } @test_svreinterpret_s16_mf8_x4( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-NEXT: 
[[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-NEXT: ret { , , , } [[TMP15]] +// +// CHECK-CXX-LABEL: define dso_local { , , , } @_Z29test_svreinterpret_s16_mf8_x413svmfloat8x4_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-CXX-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-CXX-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-CXX-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-CXX-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-CXX-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-CXX-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-CXX-NEXT: ret { , , , } [[TMP15]] +// +svint16x4_t test_svreinterpret_s16_mf8_x4(svmfloat8x4_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_s16, _mf8_x4)(op); +} + +// CHECK-LABEL: define dso_local { , , , } @test_svreinterpret_u16_mf8_x4( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast 
[[TMP13]] to +// CHECK-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-NEXT: ret { , , , } [[TMP15]] +// +// CHECK-CXX-LABEL: define dso_local { , , , } @_Z29test_svreinterpret_u16_mf8_x413svmfloat8x4_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-CXX-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-CXX-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-CXX-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-CXX-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-CXX-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-CXX-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-CXX-NEXT: ret { , , , } [[TMP15]] +// +svuint16x4_t test_svreinterpret_u16_mf8_x4(svmfloat8x4_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_u16, _mf8_x4)(op); +} + +// CHECK-LABEL: define dso_local { , , , } @test_svreinterpret_s32_mf8_x4( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-NEXT: ret { , , , } [[TMP15]] +// +// CHECK-CXX-LABEL: define dso_local { , , , } @_Z29test_svreinterpret_s32_mf8_x413svmfloat8x4_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , , 
} [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-CXX-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-CXX-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-CXX-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-CXX-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-CXX-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-CXX-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-CXX-NEXT: ret { , , , } [[TMP15]] +// +svint32x4_t test_svreinterpret_s32_mf8_x4(svmfloat8x4_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_s32, _mf8_x4)(op); +} + +// CHECK-LABEL: define dso_local { , , , } @test_svreinterpret_u32_mf8_x4( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-NEXT: ret { , , , } [[TMP15]] +// +// CHECK-CXX-LABEL: define dso_local { , , , } @_Z29test_svreinterpret_u32_mf8_x413svmfloat8x4_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-CXX-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// 
CHECK-CXX-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-CXX-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-CXX-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-CXX-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-CXX-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-CXX-NEXT: ret { , , , } [[TMP15]] +// +svuint32x4_t test_svreinterpret_u32_mf8_x4(svmfloat8x4_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_u32, _mf8_x4)(op); +} + +// CHECK-LABEL: define dso_local { , , , } @test_svreinterpret_s64_mf8_x4( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-NEXT: ret { , , , } [[TMP15]] +// +// CHECK-CXX-LABEL: define dso_local { , , , } @_Z29test_svreinterpret_s64_mf8_x413svmfloat8x4_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-CXX-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-CXX-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-CXX-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-CXX-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-CXX-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-CXX-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-CXX-NEXT: ret { , , , } [[TMP15]] +// +svint64x4_t test_svreinterpret_s64_mf8_x4(svmfloat8x4_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_s64, _mf8_x4)(op); +} + +// CHECK-LABEL: define dso_local { , , , } @test_svreinterpret_u64_mf8_x4( 
+// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-NEXT: ret { , , , } [[TMP15]] +// +// CHECK-CXX-LABEL: define dso_local { , , , } @_Z29test_svreinterpret_u64_mf8_x413svmfloat8x4_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-CXX-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-CXX-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-CXX-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-CXX-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-CXX-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-CXX-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-CXX-NEXT: ret { , , , } [[TMP15]] +// +svuint64x4_t test_svreinterpret_u64_mf8_x4(svmfloat8x4_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_u64, _mf8_x4)(op); +} + +// CHECK-LABEL: define dso_local { , , , } @test_svreinterpret_f16_mf8_x4( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-NEXT: 
[[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-NEXT: ret { , , , } [[TMP15]] +// +// CHECK-CXX-LABEL: define dso_local { , , , } @_Z29test_svreinterpret_f16_mf8_x413svmfloat8x4_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-CXX-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-CXX-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-CXX-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-CXX-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-CXX-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-CXX-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-CXX-NEXT: ret { , , , } [[TMP15]] +// +svfloat16x4_t test_svreinterpret_f16_mf8_x4(svmfloat8x4_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_f16, _mf8_x4)(op); +} + +// CHECK-LABEL: define dso_local { , , , } @test_svreinterpret_bf16_mf8_x4( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-NEXT: [[TMP15:%.*]] = 
insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-NEXT: ret { , , , } [[TMP15]] +// +// CHECK-CXX-LABEL: define dso_local { , , , } @_Z30test_svreinterpret_bf16_mf8_x413svmfloat8x4_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-CXX-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-CXX-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-CXX-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-CXX-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-CXX-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-CXX-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-CXX-NEXT: ret { , , , } [[TMP15]] +// +svbfloat16x4_t test_svreinterpret_bf16_mf8_x4(svmfloat8x4_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_bf16, _mf8_x4)(op); +} + +// CHECK-LABEL: define dso_local { , , , } @test_svreinterpret_f32_mf8_x4( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-NEXT: ret { , , , } [[TMP15]] +// +// CHECK-CXX-LABEL: define dso_local { , , , } @_Z29test_svreinterpret_f32_mf8_x413svmfloat8x4_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// 
CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-CXX-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-CXX-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-CXX-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-CXX-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-CXX-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-CXX-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-CXX-NEXT: ret { , , , } [[TMP15]] +// +svfloat32x4_t test_svreinterpret_f32_mf8_x4(svmfloat8x4_t op) STREAMING { + return SVE_ACLE_FUNC(svreinterpret_f32, _mf8_x4)(op); +} + +// CHECK-LABEL: define dso_local { , , , } @test_svreinterpret_f64_mf8_x4( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-NEXT: [[TMP11:%.*]] = bitcast [[TMP10]] to +// CHECK-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2 +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast [[TMP13]] to +// CHECK-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3 +// CHECK-NEXT: ret { , , , } [[TMP15]] +// +// CHECK-CXX-LABEL: define dso_local { , , , } @_Z29test_svreinterpret_f64_mf8_x413svmfloat8x4_t( +// CHECK-CXX-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]], [[OP_COERCE2:%.*]], [[OP_COERCE3:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = insertvalue { , , , } poison, [[OP_COERCE0]], 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = insertvalue { , , , } [[TMP0]], [[OP_COERCE1]], 1 +// CHECK-CXX-NEXT: [[TMP2:%.*]] = insertvalue { , , , } [[TMP1]], [[OP_COERCE2]], 2 +// CHECK-CXX-NEXT: [[TMP3:%.*]] = insertvalue { , , , } [[TMP2]], [[OP_COERCE3]], 3 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[TMP3]], 0 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +// CHECK-CXX-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[TMP5]], 0 +// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP3]], 1 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = bitcast [[TMP7]] to +// CHECK-CXX-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[TMP8]], 1 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP3]], 2 +// CHECK-CXX-NEXT: [[TMP11:%.*]] = bitcast 
[[TMP10]] to
+// CHECK-CXX-NEXT:    [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP11]], 2
+// CHECK-CXX-NEXT:    [[TMP13:%.*]] = extractvalue { , , , } [[TMP3]], 3
+// CHECK-CXX-NEXT:    [[TMP14:%.*]] = bitcast [[TMP13]] to
+// CHECK-CXX-NEXT:    [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[TMP14]], 3
+// CHECK-CXX-NEXT:    ret { , , , } [[TMP15]]
+//
+svfloat64x4_t test_svreinterpret_f64_mf8_x4(svfloat64x4_t op) STREAMING {
+  return SVE_ACLE_FUNC(svreinterpret_f64, _mf8_x4)(op);
+}
diff --git a/clang/utils/TableGen/SveEmitter.cpp b/clang/utils/TableGen/SveEmitter.cpp
index 97b768db3a313..35477cfc3cf45 100644
--- a/clang/utils/TableGen/SveEmitter.cpp
+++ b/clang/utils/TableGen/SveEmitter.cpp
@@ -295,7 +295,7 @@ class SVEEmitter {
     const char *Suffix;
   };
 
-  static const std::array<ReinterpretTypeInfo, 12> Reinterprets;
+  static const std::array<ReinterpretTypeInfo, 13> Reinterprets;
 
   const RecordKeeper &Records;
   StringMap<uint64_t> EltTypes;
@@ -418,9 +418,10 @@ class SVEEmitter {
                        SmallVectorImpl<std::unique_ptr<Intrinsic>> &Out);
 };
 
-const std::array<SVEEmitter::ReinterpretTypeInfo, 12> SVEEmitter::Reinterprets =
+const std::array<SVEEmitter::ReinterpretTypeInfo, 13> SVEEmitter::Reinterprets =
     {{{SVEType("c", 'd'), "s8"},
       {SVEType("Uc", 'd'), "u8"},
+      {SVEType("m", 'd'), "mf8"},
       {SVEType("s", 'd'), "s16"},
       {SVEType("Us", 'd'), "u16"},
       {SVEType("i", 'd'), "s32"},

From 5f18abe8300fb13b2bc4264ddaca1744a2e61d75 Mon Sep 17 00:00:00 2001
From: Alexey Bataev
Date: Mon, 13 Jan 2025 10:51:26 -0800
Subject: [PATCH 095/102] [SLP]Correctly set vector operand for extracts with
 poisons

When extracts are vectorized and some of the scalars are poison values
instead of instructions, we need to correctly set the vectorized operand
not as poison, but as the main vector operand of the main extract
instruction.

Fixes #122583

---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp        | 11 +++++++++++
 .../X86/extractelemets-extended-by-poison.ll           |  8 ++++++--
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 4b0ed5b30179b..2742c3777c1ed 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2441,6 +2441,17 @@ class BoUpSLP {
           // operations or alternating sequences (e.g., +, -), we can safely
           // tell the inverse operations by checking commutativity.
          if (isa<PoisonValue>(VL[Lane])) {
+            if (auto *EI = dyn_cast<ExtractElementInst>(VL0)) {
+              if (OpIdx == 0) {
+                OpsVec[OpIdx][Lane] = {EI->getVectorOperand(), true, false};
+                continue;
+              }
+            } else if (auto *EV = dyn_cast<ExtractValueInst>(VL0)) {
+              if (OpIdx == 0) {
+                OpsVec[OpIdx][Lane] = {EV->getAggregateOperand(), true, false};
+                continue;
+              }
+            }
            OpsVec[OpIdx][Lane] = {
                PoisonValue::get(VL0->getOperand(OpIdx)->getType()), true,
                false};
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extractelemets-extended-by-poison.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractelemets-extended-by-poison.ll
index 6af59aee54e55..71390b643f43d 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/extractelemets-extended-by-poison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extractelemets-extended-by-poison.ll
@@ -5,18 +5,22 @@ define i32 @test() {
 ; CHECK-LABEL: define i32 @test() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i64>, ptr null, align 16
-; CHECK-NEXT:    [[TMP1:%.*]] = or i64 poison, 0
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <8 x i32>
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x i64> [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP13:%.*]] = or i64 [[TMP12]], 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <8 x i32>
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <8 x i32>
 ; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i64> [[TMP3]], <8 x i64> , <8 x i32>
 ; CHECK-NEXT:    [[TMP5:%.*]] = call <8 x i64> @llvm.vector.insert.v8i64.v4i64(<8 x i64> [[TMP4]], <4 x i64> [[TMP0]], i64 0)
 ; CHECK-NEXT:    [[TMP6:%.*]] = trunc <8 x i64> [[TMP5]] to <8 x i32>
 ; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> poison, <16 x i32>
+; CHECK-NEXT:    [[TMP14:%.*]] = trunc <8 x i64> [[TMP1]] to <8 x i32>
+; CHECK-NEXT:    [[TMP15:%.*]] = add <8 x i32> [[TMP14]], zeroinitializer
 ; CHECK-NEXT:    [[TMP8:%.*]] = add <16 x i32> [[TMP7]], zeroinitializer
 ; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0
 ; CHECK-NEXT:    [[INC_3_3_I_1:%.*]] = or i64 [[TMP9]], 0
 ; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> [[TMP8]])
-; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> poison)
+; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> [[TMP15]])
 ; CHECK-NEXT:    [[OP_RDX:%.*]] = or i32 [[TMP10]], [[TMP11]]
 ; CHECK-NEXT:    ret i32 [[OP_RDX]]
 ;

From dc56c6248d0363a65d00d0460b90d62fbe70e509 Mon Sep 17 00:00:00 2001
From: Sean Perry
Date: Mon, 13 Jan 2025 14:02:20 -0500
Subject: [PATCH 096/102] [libc++][z/OS] __cxx03 subdir was added by mistake
 (#122763)

The header is a system header. It's not part of the headers in __cxx03.
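To illustrate the fix (an editorial sketch, not part of the original commit
message), the z/OS-only block in __config ends up naming the system header
directly, since <features.h> ships with the platform toolchain rather than
with libc++'s frozen C++03 headers:

    #  if defined(__MVS__)
    #    include <features.h> // system header providing __NATIVE_ASCII_F
    #  endif

Spelling it <__cxx03/features.h> would instead look for a copy inside
libc++'s __cxx03 include tree, where no such header exists.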
---
 libcxx/include/__cxx03/__config | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libcxx/include/__cxx03/__config b/libcxx/include/__cxx03/__config
index 3e8f181664c97..880d14a50a052 100644
--- a/libcxx/include/__cxx03/__config
+++ b/libcxx/include/__cxx03/__config
@@ -230,7 +230,7 @@ _LIBCPP_HARDENING_MODE_DEBUG
 #  endif
 
 #  if defined(__MVS__)
-#    include <__cxx03/features.h> // for __NATIVE_ASCII_F
+#    include <features.h> // for __NATIVE_ASCII_F
 #  endif
 
 #  if defined(_WIN32)

From 4b9d7d0a4b280c44569d6188959315bf1c3c39f4 Mon Sep 17 00:00:00 2001
From: Kirill Stoimenov
Date: Mon, 13 Jan 2025 19:03:40 +0000
Subject: [PATCH 097/102] Revert "[aarch64][win] Add support for import call
 optimization (equivalent to MSVC /d2ImportCallOptimization) (#121516)"

Breaks sanitizer build:
https://lab.llvm.org/buildbot/#/builders/52/builds/5179

This reverts commits:
5ee0a71df919a328c714e25f0935c21e586cc18b
d997a722c194feec5f3a94dec5acdce59ac5e55b

---
 llvm/include/llvm/CodeGen/MIRYamlMapping.h    |  45 ++-----
 llvm/include/llvm/CodeGen/MachineFunction.h   |  25 ----
 llvm/include/llvm/CodeGen/SelectionDAG.h      |  14 ---
 llvm/include/llvm/MC/MCObjectFileInfo.h       |   5 -
 llvm/include/llvm/MC/MCStreamer.h             |   8 --
 llvm/include/llvm/MC/MCWinCOFFObjectWriter.h  |   1 -
 llvm/include/llvm/MC/MCWinCOFFStreamer.h      |   2 -
 llvm/lib/CodeGen/MIRParser/MIRParser.cpp      |  74 ++----------
 llvm/lib/CodeGen/MIRPrinter.cpp               |  33 +----
 .../SelectionDAG/ScheduleDAGSDNodes.cpp       |   4 -
 llvm/lib/MC/MCAsmStreamer.cpp                 |  14 ---
 llvm/lib/MC/MCObjectFileInfo.cpp              |   5 -
 llvm/lib/MC/MCParser/COFFAsmParser.cpp        |  34 ------
 llvm/lib/MC/MCStreamer.cpp                    |   4 -
 llvm/lib/MC/MCWinCOFFStreamer.cpp             | 114 ------------------
 llvm/lib/MC/WinCOFFObjectWriter.cpp           |  27 ++---
 llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp |  72 -----------
 .../Target/AArch64/AArch64ISelLowering.cpp    |  14 +--
 .../win-import-call-optimization-nocalls.ll   |  18 ---
 .../AArch64/win-import-call-optimization.ll   |  48 --------
 .../CodeGen/MIR/AArch64/called-globals.mir    |  61 ----------
 .../CodeGen/MIR/X86/call-site-info-error1.mir |   2 +-
 .../CodeGen/MIR/X86/call-site-info-error2.mir |   2 +-
 .../MC/AArch64/win-import-call-optimization.s |  72 -----------
 llvm/test/MC/COFF/bad-parse.s                 |  13 --
 25 files changed, 38 insertions(+), 673 deletions(-)
 delete mode 100644 llvm/test/CodeGen/AArch64/win-import-call-optimization-nocalls.ll
 delete mode 100644 llvm/test/CodeGen/AArch64/win-import-call-optimization.ll
 delete mode 100644 llvm/test/CodeGen/MIR/AArch64/called-globals.mir
 delete mode 100644 llvm/test/MC/AArch64/win-import-call-optimization.s
 delete mode 100644 llvm/test/MC/COFF/bad-parse.s

diff --git a/llvm/include/llvm/CodeGen/MIRYamlMapping.h b/llvm/include/llvm/CodeGen/MIRYamlMapping.h
index dbad3469d047d..09a6ca936fe1f 100644
--- a/llvm/include/llvm/CodeGen/MIRYamlMapping.h
+++ b/llvm/include/llvm/CodeGen/MIRYamlMapping.h
@@ -457,16 +457,6 @@ template <> struct ScalarTraits {
   static QuotingType mustQuote(StringRef S) { return needsQuotes(S); }
 };
 
-/// Identifies call instruction location in machine function.
-struct MachineInstrLoc {
-  unsigned BlockNum;
-  unsigned Offset;
-
-  bool operator==(const MachineInstrLoc &Other) const {
-    return BlockNum == Other.BlockNum && Offset == Other.Offset;
-  }
-};
-
 /// Serializable representation of CallSiteInfo.
 struct CallSiteInfo {
   // Representation of call argument and register which is used to
@@ -480,6 +470,16 @@ struct CallSiteInfo {
     }
   };
 
+ struct MachineInstrLoc { + unsigned BlockNum; + unsigned Offset; + + bool operator==(const MachineInstrLoc &Other) const { + return BlockNum == Other.BlockNum && Offset == Other.Offset; + } + }; + MachineInstrLoc CallLocation; std::vector ArgForwardingRegs; @@ -595,26 +595,6 @@ template <> struct MappingTraits { } }; -struct CalledGlobal { - MachineInstrLoc CallSite; - StringValue Callee; - unsigned Flags; - - bool operator==(const CalledGlobal &Other) const { - return CallSite == Other.CallSite && Callee == Other.Callee && - Flags == Other.Flags; - } -}; - -template <> struct MappingTraits { - static void mapping(IO &YamlIO, CalledGlobal &CG) { - YamlIO.mapRequired("bb", CG.CallSite.BlockNum); - YamlIO.mapRequired("offset", CG.CallSite.Offset); - YamlIO.mapRequired("callee", CG.Callee); - YamlIO.mapRequired("flags", CG.Flags); - } -}; - } // end namespace yaml } // end namespace llvm @@ -626,7 +606,6 @@ LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::yaml::FixedMachineStackObject) LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::yaml::CallSiteInfo) LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::yaml::MachineConstantPoolValue) LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::yaml::MachineJumpTable::Entry) -LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::yaml::CalledGlobal) namespace llvm { namespace yaml { @@ -785,7 +764,6 @@ struct MachineFunction { std::vector DebugValueSubstitutions; MachineJumpTable JumpTableInfo; std::vector MachineMetadataNodes; - std::vector CalledGlobals; BlockStringValue Body; }; @@ -844,9 +822,6 @@ template <> struct MappingTraits { if (!YamlIO.outputting() || !MF.MachineMetadataNodes.empty()) YamlIO.mapOptional("machineMetadataNodes", MF.MachineMetadataNodes, std::vector()); - if (!YamlIO.outputting() || !MF.CalledGlobals.empty()) - YamlIO.mapOptional("calledGlobals", MF.CalledGlobals, - std::vector()); YamlIO.mapOptional("body", MF.Body, BlockStringValue()); } }; diff --git a/llvm/include/llvm/CodeGen/MachineFunction.h b/llvm/include/llvm/CodeGen/MachineFunction.h index 282aee2a69c4d..d696add8a1af5 100644 --- a/llvm/include/llvm/CodeGen/MachineFunction.h +++ b/llvm/include/llvm/CodeGen/MachineFunction.h @@ -354,11 +354,6 @@ class LLVM_ABI MachineFunction { /// a table of valid targets for Windows EHCont Guard. std::vector CatchretTargets; - /// Mapping of call instruction to the global value and target flags that it - /// calls, if applicable. - DenseMap> - CalledGlobalsMap; - /// \name Exception Handling /// \{ @@ -1187,26 +1182,6 @@ class LLVM_ABI MachineFunction { CatchretTargets.push_back(Target); } - /// Tries to get the global and target flags for a call site, if the - /// instruction is a call to a global. - std::pair - tryGetCalledGlobal(const MachineInstr *MI) const { - return CalledGlobalsMap.lookup(MI); - } - - /// Notes the global and target flags for a call site. - void addCalledGlobal(const MachineInstr *MI, - std::pair Details) { - assert(MI && "MI must not be null"); - assert(Details.first && "Global must not be null"); - CalledGlobalsMap.insert({MI, Details}); - } - - /// Iterates over the full set of call sites and their associated globals. 
-  auto getCalledGlobals() const {
-    return llvm::make_range(CalledGlobalsMap.begin(), CalledGlobalsMap.end());
-  }
-
   /// \name Exception Handling
   /// \{
 
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index b31ad11c3ee0e..ff7caec41855f 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -293,7 +293,6 @@ class SelectionDAG {
     MDNode *HeapAllocSite = nullptr;
     MDNode *PCSections = nullptr;
     MDNode *MMRA = nullptr;
-    std::pair<const GlobalValue *, unsigned> CalledGlobal{};
     bool NoMerge = false;
   };
   /// Out-of-line extra information for SDNodes.
@@ -2374,19 +2373,6 @@ class SelectionDAG {
     auto It = SDEI.find(Node);
     return It != SDEI.end() ? It->second.MMRA : nullptr;
   }
-  /// Set CalledGlobal to be associated with Node.
-  void addCalledGlobal(const SDNode *Node, const GlobalValue *GV,
-                       unsigned OpFlags) {
-    SDEI[Node].CalledGlobal = {GV, OpFlags};
-  }
-  /// Return CalledGlobal associated with Node, or a nullopt if none exists.
-  std::optional<std::pair<const GlobalValue *, unsigned>>
-  getCalledGlobal(const SDNode *Node) {
-    auto I = SDEI.find(Node);
-    return I != SDEI.end()
-               ? std::make_optional(std::move(I->second).CalledGlobal)
-               : std::nullopt;
-  }
   /// Set NoMergeSiteInfo to be associated with Node if NoMerge is true.
   void addNoMergeSiteInfo(const SDNode *Node, bool NoMerge) {
     if (NoMerge)
diff --git a/llvm/include/llvm/MC/MCObjectFileInfo.h b/llvm/include/llvm/MC/MCObjectFileInfo.h
index fb575fe721015..e2a2c84e47910 100644
--- a/llvm/include/llvm/MC/MCObjectFileInfo.h
+++ b/llvm/include/llvm/MC/MCObjectFileInfo.h
@@ -73,10 +73,6 @@ class MCObjectFileInfo {
   /// to emit them into.
   MCSection *CompactUnwindSection = nullptr;
 
-  /// If import call optimization is supported by the target, this is the
-  /// section to emit import call data to.
-  MCSection *ImportCallSection = nullptr;
-
   // Dwarf sections for debug info. If a target supports debug info, these must
   // be set.
   MCSection *DwarfAbbrevSection = nullptr;
@@ -273,7 +269,6 @@ class MCObjectFileInfo {
   MCSection *getBSSSection() const { return BSSSection; }
   MCSection *getReadOnlySection() const { return ReadOnlySection; }
   MCSection *getLSDASection() const { return LSDASection; }
-  MCSection *getImportCallSection() const { return ImportCallSection; }
   MCSection *getCompactUnwindSection() const { return CompactUnwindSection; }
   MCSection *getDwarfAbbrevSection() const { return DwarfAbbrevSection; }
   MCSection *getDwarfInfoSection() const { return DwarfInfoSection; }
diff --git a/llvm/include/llvm/MC/MCStreamer.h b/llvm/include/llvm/MC/MCStreamer.h
index 558b14cebfd3d..21da4dac4872b 100644
--- a/llvm/include/llvm/MC/MCStreamer.h
+++ b/llvm/include/llvm/MC/MCStreamer.h
@@ -569,14 +569,6 @@ class MCStreamer {
   /// \param Symbol - Symbol the image relative relocation should point to.
   virtual void emitCOFFImgRel32(MCSymbol const *Symbol, int64_t Offset);
 
-  /// Emits the physical number of the section containing the given symbol as
-  /// assigned during object writing (i.e., this is not a runtime relocation).
-  virtual void emitCOFFSecNumber(MCSymbol const *Symbol);
-
-  /// Emits the offset of the symbol from the beginning of the section during
-  /// object writing (i.e., this is not a runtime relocation).
-  virtual void emitCOFFSecOffset(MCSymbol const *Symbol);
-
   /// Emits an lcomm directive with XCOFF csect information.
   ///
   /// \param LabelSym - Label on the block of storage.
diff --git a/llvm/include/llvm/MC/MCWinCOFFObjectWriter.h b/llvm/include/llvm/MC/MCWinCOFFObjectWriter.h index 13d8c7d060c9e..a4ede61e45099 100644 --- a/llvm/include/llvm/MC/MCWinCOFFObjectWriter.h +++ b/llvm/include/llvm/MC/MCWinCOFFObjectWriter.h @@ -72,7 +72,6 @@ class WinCOFFObjectWriter final : public MCObjectWriter { const MCFixup &Fixup, MCValue Target, uint64_t &FixedValue) override; uint64_t writeObject(MCAssembler &Asm) override; - int getSectionNumber(const MCSection &Section) const; }; /// Construct a new Win COFF writer instance. diff --git a/llvm/include/llvm/MC/MCWinCOFFStreamer.h b/llvm/include/llvm/MC/MCWinCOFFStreamer.h index 2425abe51e6dd..5c39d80538944 100644 --- a/llvm/include/llvm/MC/MCWinCOFFStreamer.h +++ b/llvm/include/llvm/MC/MCWinCOFFStreamer.h @@ -58,8 +58,6 @@ class MCWinCOFFStreamer : public MCObjectStreamer { void emitCOFFSectionIndex(MCSymbol const *Symbol) override; void emitCOFFSecRel32(MCSymbol const *Symbol, uint64_t Offset) override; void emitCOFFImgRel32(MCSymbol const *Symbol, int64_t Offset) override; - void emitCOFFSecNumber(MCSymbol const *Symbol) override; - void emitCOFFSecOffset(MCSymbol const *Symbol) override; void emitCommonSymbol(MCSymbol *Symbol, uint64_t Size, Align ByteAlignment) override; void emitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size, diff --git a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp index de2fe925c2d5c..e2543f883f91c 100644 --- a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp +++ b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp @@ -158,9 +158,6 @@ class MIRParserImpl { MachineFunction &MF, const yaml::MachineFunction &YMF); - bool parseCalledGlobals(PerFunctionMIParsingState &PFS, MachineFunction &MF, - const yaml::MachineFunction &YMF); - private: bool parseMDNode(PerFunctionMIParsingState &PFS, MDNode *&Node, const yaml::StringValue &Source); @@ -186,9 +183,6 @@ class MIRParserImpl { void setupDebugValueTracking(MachineFunction &MF, PerFunctionMIParsingState &PFS, const yaml::MachineFunction &YamlMF); - - bool parseMachineInst(MachineFunction &MF, yaml::MachineInstrLoc MILoc, - MachineInstr const *&MI); }; } // end namespace llvm @@ -463,34 +457,24 @@ bool MIRParserImpl::computeFunctionProperties( return false; } -bool MIRParserImpl::parseMachineInst(MachineFunction &MF, - yaml::MachineInstrLoc MILoc, - MachineInstr const *&MI) { - if (MILoc.BlockNum >= MF.size()) { - return error(Twine(MF.getName()) + - Twine(" instruction block out of range.") + - " Unable to reference bb:" + Twine(MILoc.BlockNum)); - } - auto BB = std::next(MF.begin(), MILoc.BlockNum); - if (MILoc.Offset >= BB->size()) - return error( - Twine(MF.getName()) + Twine(" instruction offset out of range.") + - " Unable to reference instruction at bb: " + Twine(MILoc.BlockNum) + - " at offset:" + Twine(MILoc.Offset)); - MI = &*std::next(BB->instr_begin(), MILoc.Offset); - return false; -} - bool MIRParserImpl::initializeCallSiteInfo( PerFunctionMIParsingState &PFS, const yaml::MachineFunction &YamlMF) { MachineFunction &MF = PFS.MF; SMDiagnostic Error; const TargetMachine &TM = MF.getTarget(); for (auto &YamlCSInfo : YamlMF.CallSitesInfo) { - yaml::MachineInstrLoc MILoc = YamlCSInfo.CallLocation; - const MachineInstr *CallI; - if (parseMachineInst(MF, MILoc, CallI)) - return true; + yaml::CallSiteInfo::MachineInstrLoc MILoc = YamlCSInfo.CallLocation; + if (MILoc.BlockNum >= MF.size()) + return error(Twine(MF.getName()) + + Twine(" call instruction block out of range.") + + " Unable to reference bb:" + Twine(MILoc.BlockNum)); 
+ auto CallB = std::next(MF.begin(), MILoc.BlockNum); + if (MILoc.Offset >= CallB->size()) + return error(Twine(MF.getName()) + + Twine(" call instruction offset out of range.") + + " Unable to reference instruction at bb: " + + Twine(MILoc.BlockNum) + " at offset:" + Twine(MILoc.Offset)); + auto CallI = std::next(CallB->instr_begin(), MILoc.Offset); if (!CallI->isCall(MachineInstr::IgnoreBundle)) return error(Twine(MF.getName()) + Twine(" call site info should reference call " @@ -657,9 +641,6 @@ MIRParserImpl::initializeMachineFunction(const yaml::MachineFunction &YamlMF, if (initializeCallSiteInfo(PFS, YamlMF)) return true; - if (parseCalledGlobals(PFS, MF, YamlMF)) - return true; - setupDebugValueTracking(MF, PFS, YamlMF); MF.getSubtarget().mirFileLoaded(MF); @@ -1130,37 +1111,6 @@ bool MIRParserImpl::parseMachineMetadataNodes( return false; } -bool MIRParserImpl::parseCalledGlobals(PerFunctionMIParsingState &PFS, - MachineFunction &MF, - const yaml::MachineFunction &YMF) { - Function &F = MF.getFunction(); - for (const auto &YamlCG : YMF.CalledGlobals) { - yaml::MachineInstrLoc MILoc = YamlCG.CallSite; - const MachineInstr *CallI; - if (parseMachineInst(MF, MILoc, CallI)) - return true; - if (!CallI->isCall(MachineInstr::IgnoreBundle)) - return error(Twine(MF.getName()) + - Twine(" called global should reference call " - "instruction. Instruction at bb:") + - Twine(MILoc.BlockNum) + " at offset:" + Twine(MILoc.Offset) + - " is not a call instruction"); - - auto Callee = - F.getParent()->getValueSymbolTable().lookup(YamlCG.Callee.Value); - if (!Callee) - return error(YamlCG.Callee.SourceRange.Start, - "use of undefined global '" + YamlCG.Callee.Value + "'"); - if (!isa(Callee)) - return error(YamlCG.Callee.SourceRange.Start, - "use of non-global value '" + YamlCG.Callee.Value + "'"); - - MF.addCalledGlobal(CallI, {cast(Callee), YamlCG.Flags}); - } - - return false; -} - SMDiagnostic MIRParserImpl::diagFromMIStringDiag(const SMDiagnostic &Error, SMRange SourceRange) { assert(SourceRange.isValid() && "Invalid source range"); diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp index b6da495590fe1..c8f6341c1224d 100644 --- a/llvm/lib/CodeGen/MIRPrinter.cpp +++ b/llvm/lib/CodeGen/MIRPrinter.cpp @@ -133,9 +133,6 @@ class MIRPrinter { void convertMachineMetadataNodes(yaml::MachineFunction &YMF, const MachineFunction &MF, MachineModuleSlotTracker &MST); - void convertCalledGlobals(yaml::MachineFunction &YMF, - const MachineFunction &MF, - MachineModuleSlotTracker &MST); private: void initRegisterMaskIds(const MachineFunction &MF); @@ -272,8 +269,6 @@ void MIRPrinter::print(const MachineFunction &MF) { // function. convertMachineMetadataNodes(YamlMF, MF, MST); - convertCalledGlobals(YamlMF, MF, MST); - yaml::Output Out(OS); if (!SimplifyMIR) Out.setWriteDefaultValues(true); @@ -560,7 +555,7 @@ void MIRPrinter::convertCallSiteObjects(yaml::MachineFunction &YMF, const auto *TRI = MF.getSubtarget().getRegisterInfo(); for (auto CSInfo : MF.getCallSitesInfo()) { yaml::CallSiteInfo YmlCS; - yaml::MachineInstrLoc CallLocation; + yaml::CallSiteInfo::MachineInstrLoc CallLocation; // Prepare instruction position. 
MachineBasicBlock::const_instr_iterator CallI = CSInfo.first->getIterator(); @@ -601,32 +596,6 @@ void MIRPrinter::convertMachineMetadataNodes(yaml::MachineFunction &YMF, } } -void MIRPrinter::convertCalledGlobals(yaml::MachineFunction &YMF, - const MachineFunction &MF, - MachineModuleSlotTracker &MST) { - for (const auto &[CallInst, CG] : MF.getCalledGlobals()) { - // If the call instruction was dropped, then we don't need to print it. - auto BB = CallInst->getParent(); - if (BB) { - yaml::MachineInstrLoc CallSite; - CallSite.BlockNum = CallInst->getParent()->getNumber(); - CallSite.Offset = std::distance(CallInst->getParent()->instr_begin(), - CallInst->getIterator()); - - yaml::CalledGlobal YamlCG{CallSite, CG.first->getName().str(), CG.second}; - YMF.CalledGlobals.push_back(YamlCG); - } - } - - // Sort by position of call instructions. - llvm::sort(YMF.CalledGlobals.begin(), YMF.CalledGlobals.end(), - [](yaml::CalledGlobal A, yaml::CalledGlobal B) { - if (A.CallSite.BlockNum == B.CallSite.BlockNum) - return A.CallSite.Offset < B.CallSite.Offset; - return A.CallSite.BlockNum < B.CallSite.BlockNum; - }); -} - void MIRPrinter::convert(yaml::MachineFunction &MF, const MachineConstantPool &ConstantPool) { unsigned ID = 0; diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp index bafe26ff7d6b7..dff7243b0a99c 100644 --- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp @@ -908,10 +908,6 @@ EmitSchedule(MachineBasicBlock::iterator &InsertPos) { It->setMMRAMetadata(MF, MMRA); } - if (auto CalledGlobal = DAG->getCalledGlobal(Node)) - if (CalledGlobal->first) - MF.addCalledGlobal(MI, *CalledGlobal); - return MI; }; diff --git a/llvm/lib/MC/MCAsmStreamer.cpp b/llvm/lib/MC/MCAsmStreamer.cpp index dd8058c6d5cd8..01fe11ed20501 100644 --- a/llvm/lib/MC/MCAsmStreamer.cpp +++ b/llvm/lib/MC/MCAsmStreamer.cpp @@ -209,8 +209,6 @@ class MCAsmStreamer final : public MCStreamer { void emitCOFFSectionIndex(MCSymbol const *Symbol) override; void emitCOFFSecRel32(MCSymbol const *Symbol, uint64_t Offset) override; void emitCOFFImgRel32(MCSymbol const *Symbol, int64_t Offset) override; - void emitCOFFSecNumber(MCSymbol const *Symbol) override; - void emitCOFFSecOffset(MCSymbol const *Symbol) override; void emitXCOFFLocalCommonSymbol(MCSymbol *LabelSym, uint64_t Size, MCSymbol *CsectSym, Align Alignment) override; void emitXCOFFSymbolLinkageWithVisibility(MCSymbol *Symbol, @@ -895,18 +893,6 @@ void MCAsmStreamer::emitCOFFImgRel32(MCSymbol const *Symbol, int64_t Offset) { EmitEOL(); } -void MCAsmStreamer::emitCOFFSecNumber(MCSymbol const *Symbol) { - OS << "\t.secnum\t"; - Symbol->print(OS, MAI); - EmitEOL(); -} - -void MCAsmStreamer::emitCOFFSecOffset(MCSymbol const *Symbol) { - OS << "\t.secoffset\t"; - Symbol->print(OS, MAI); - EmitEOL(); -} - // We need an XCOFF-specific version of this directive as the AIX syntax // requires a QualName argument identifying the csect name and storage mapping // class to appear before the alignment if we are specifying it. 
diff --git a/llvm/lib/MC/MCObjectFileInfo.cpp b/llvm/lib/MC/MCObjectFileInfo.cpp index 150e38a94db6a..f37e138edc36b 100644 --- a/llvm/lib/MC/MCObjectFileInfo.cpp +++ b/llvm/lib/MC/MCObjectFileInfo.cpp @@ -596,11 +596,6 @@ void MCObjectFileInfo::initCOFFMCObjectFileInfo(const Triple &T) { COFF::IMAGE_SCN_MEM_READ); } - if (T.getArch() == Triple::aarch64) { - ImportCallSection = - Ctx->getCOFFSection(".impcall", COFF::IMAGE_SCN_LNK_INFO); - } - // Debug info. COFFDebugSymbolsSection = Ctx->getCOFFSection(".debug$S", (COFF::IMAGE_SCN_MEM_DISCARDABLE | diff --git a/llvm/lib/MC/MCParser/COFFAsmParser.cpp b/llvm/lib/MC/MCParser/COFFAsmParser.cpp index dd5ce9964a194..4d95a72085283 100644 --- a/llvm/lib/MC/MCParser/COFFAsmParser.cpp +++ b/llvm/lib/MC/MCParser/COFFAsmParser.cpp @@ -70,8 +70,6 @@ class COFFAsmParser : public MCAsmParserExtension { addDirectiveHandler<&COFFAsmParser::parseDirectiveSymbolAttribute>( ".weak_anti_dep"); addDirectiveHandler<&COFFAsmParser::parseDirectiveCGProfile>(".cg_profile"); - addDirectiveHandler<&COFFAsmParser::parseDirectiveSecNum>(".secnum"); - addDirectiveHandler<&COFFAsmParser::parseDirectiveSecOffset>(".secoffset"); // Win64 EH directives. addDirectiveHandler<&COFFAsmParser::parseSEHDirectiveStartProc>( @@ -128,8 +126,6 @@ class COFFAsmParser : public MCAsmParserExtension { bool parseDirectiveLinkOnce(StringRef, SMLoc); bool parseDirectiveRVA(StringRef, SMLoc); bool parseDirectiveCGProfile(StringRef, SMLoc); - bool parseDirectiveSecNum(StringRef, SMLoc); - bool parseDirectiveSecOffset(StringRef, SMLoc); // Win64 EH directives. bool parseSEHDirectiveStartProc(StringRef, SMLoc); @@ -581,36 +577,6 @@ bool COFFAsmParser::parseDirectiveSymIdx(StringRef, SMLoc) { return false; } -bool COFFAsmParser::parseDirectiveSecNum(StringRef, SMLoc) { - StringRef SymbolID; - if (getParser().parseIdentifier(SymbolID)) - return TokError("expected identifier in directive"); - - if (getLexer().isNot(AsmToken::EndOfStatement)) - return TokError("unexpected token in directive"); - - MCSymbol *Symbol = getContext().getOrCreateSymbol(SymbolID); - - Lex(); - getStreamer().emitCOFFSecNumber(Symbol); - return false; -} - -bool COFFAsmParser::parseDirectiveSecOffset(StringRef, SMLoc) { - StringRef SymbolID; - if (getParser().parseIdentifier(SymbolID)) - return TokError("expected identifier in directive"); - - if (getLexer().isNot(AsmToken::EndOfStatement)) - return TokError("unexpected token in directive"); - - MCSymbol *Symbol = getContext().getOrCreateSymbol(SymbolID); - - Lex(); - getStreamer().emitCOFFSecOffset(Symbol); - return false; -} - /// ::= [ identifier ] bool COFFAsmParser::parseCOMDATType(COFF::COMDATType &Type) { StringRef TypeId = getTok().getIdentifier(); diff --git a/llvm/lib/MC/MCStreamer.cpp b/llvm/lib/MC/MCStreamer.cpp index e690723c0e502..ccf65df150e78 100644 --- a/llvm/lib/MC/MCStreamer.cpp +++ b/llvm/lib/MC/MCStreamer.cpp @@ -1023,10 +1023,6 @@ void MCStreamer::emitCOFFSecRel32(MCSymbol const *Symbol, uint64_t Offset) {} void MCStreamer::emitCOFFImgRel32(MCSymbol const *Symbol, int64_t Offset) {} -void MCStreamer::emitCOFFSecNumber(MCSymbol const *Symbol) {} - -void MCStreamer::emitCOFFSecOffset(MCSymbol const *Symbol) {} - /// EmitRawText - If this file is backed by an assembly streamer, this dumps /// the specified string in the output .s file. This capability is /// indicated by the hasRawTextSupport() predicate. 
diff --git a/llvm/lib/MC/MCWinCOFFStreamer.cpp b/llvm/lib/MC/MCWinCOFFStreamer.cpp index 8fd46bc8b0255..395d4db3103d7 100644 --- a/llvm/lib/MC/MCWinCOFFStreamer.cpp +++ b/llvm/lib/MC/MCWinCOFFStreamer.cpp @@ -29,7 +29,6 @@ #include "llvm/MC/MCSectionCOFF.h" #include "llvm/MC/MCSymbolCOFF.h" #include "llvm/MC/MCTargetOptions.h" -#include "llvm/MC/MCValue.h" #include "llvm/MC/MCWinCOFFObjectWriter.h" #include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" @@ -44,91 +43,6 @@ using namespace llvm; #define DEBUG_TYPE "WinCOFFStreamer" -/// MCExpr that represents the physical number for the sections that contains -/// a symbol. -class MCCOFFSectionNumberTargetExpr final : public MCTargetExpr { - const MCSymbol &SectionSymbol; - const WinCOFFObjectWriter &Writer; - - MCCOFFSectionNumberTargetExpr(const MCSymbol &SectionSymbol_, - const WinCOFFObjectWriter &Writer_) - : SectionSymbol(SectionSymbol_), Writer(Writer_) {} - -public: - static MCCOFFSectionNumberTargetExpr * - create(const MCSymbol &SectionSymbol, const WinCOFFObjectWriter &Writer, - MCContext &Ctx) { - return new (Ctx) MCCOFFSectionNumberTargetExpr(SectionSymbol, Writer); - } - - void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override { - OS << ":secnum:"; - SectionSymbol.print(OS, MAI); - } - - bool evaluateAsRelocatableImpl(MCValue &Res, const MCAssembler *Asm, - const MCFixup *Fixup) const override { - auto sectionNumber = Writer.getSectionNumber(SectionSymbol.getSection()); - assert(sectionNumber != 0 && - "Containing section was not assigned a number"); - Res = MCValue::get(sectionNumber); - return true; - } - - void visitUsedExpr(MCStreamer &Streamer) const override { - // Contains no sub-expressions. - } - - MCFragment *findAssociatedFragment() const override { - return SectionSymbol.getFragment(); - } - - void fixELFSymbolsInTLSFixups(MCAssembler &) const override { - llvm_unreachable("Not supported for ELF"); - } -}; - -/// MCExpr that represents the offset to a symbol from the beginning of its -/// section. -class MCCOFFSectionOffsetTargetExpr final : public MCTargetExpr { - const MCSymbol &Symbol; - - MCCOFFSectionOffsetTargetExpr(const MCSymbol &Symbol_) : Symbol(Symbol_) {} - -public: - static MCCOFFSectionOffsetTargetExpr *create(const MCSymbol &Symbol, - MCContext &Ctx) { - return new (Ctx) MCCOFFSectionOffsetTargetExpr(Symbol); - } - - void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override { - OS << ":secoffset:"; - Symbol.print(OS, MAI); - } - - bool evaluateAsRelocatableImpl(MCValue &Res, const MCAssembler *Asm, - const MCFixup *Fixup) const override { - uint64_t CallsiteOffset = 0; - if (!Asm->getSymbolOffset(Symbol, CallsiteOffset)) { - return true; - } - Res = MCValue::get(CallsiteOffset); - return true; - } - - void visitUsedExpr(MCStreamer &Streamer) const override { - // Contains no sub-expressions. - } - - MCFragment *findAssociatedFragment() const override { - return Symbol.getFragment(); - } - - void fixELFSymbolsInTLSFixups(MCAssembler &) const override { - llvm_unreachable("Not supported for ELF"); - } -}; - MCWinCOFFStreamer::MCWinCOFFStreamer(MCContext &Context, std::unique_ptr MAB, std::unique_ptr CE, @@ -366,34 +280,6 @@ void MCWinCOFFStreamer::emitCOFFImgRel32(const MCSymbol *Symbol, DF->appendContents(4, 0); } -void MCWinCOFFStreamer::emitCOFFSecNumber(MCSymbol const *Symbol) { - visitUsedSymbol(*Symbol); - MCDataFragment *DF = getOrCreateDataFragment(); - // Create Symbol for section number. 
- const MCExpr *MCE = MCCOFFSectionNumberTargetExpr::create( - *Symbol, this->getWriter(), getContext()); - // Build the relocation. - MCFixup Fixup = MCFixup::create(DF->getContents().size(), MCE, FK_Data_4); - // Record the relocation. - DF->getFixups().push_back(Fixup); - // Emit 4 bytes (zeros) to the object file. - DF->appendContents(4, 0); -} - -void MCWinCOFFStreamer::emitCOFFSecOffset(MCSymbol const *Symbol) { - visitUsedSymbol(*Symbol); - MCDataFragment *DF = getOrCreateDataFragment(); - // Create Symbol for section offset. - const MCExpr *MCE = - MCCOFFSectionOffsetTargetExpr::create(*Symbol, getContext()); - // Build the relocation. - MCFixup Fixup = MCFixup::create(DF->getContents().size(), MCE, FK_Data_4); - // Record the relocation. - DF->getFixups().push_back(Fixup); - // Emit 4 bytes (zeros) to the object file. - DF->appendContents(4, 0); -} - void MCWinCOFFStreamer::emitCommonSymbol(MCSymbol *S, uint64_t Size, Align ByteAlignment) { auto *Symbol = cast(S); diff --git a/llvm/lib/MC/WinCOFFObjectWriter.cpp b/llvm/lib/MC/WinCOFFObjectWriter.cpp index 39e02d0522bcf..09d2b08e43050 100644 --- a/llvm/lib/MC/WinCOFFObjectWriter.cpp +++ b/llvm/lib/MC/WinCOFFObjectWriter.cpp @@ -163,7 +163,6 @@ class llvm::WinCOFFWriter { const MCFixup &Fixup, MCValue Target, uint64_t &FixedValue); uint64_t writeObject(MCAssembler &Asm); - int getSectionNumber(const MCSection &Section) const; private: COFFSymbol *createSymbol(StringRef Name); @@ -819,15 +818,6 @@ void WinCOFFWriter::executePostLayoutBinding(MCAssembler &Asm) { if (!Symbol.isTemporary() || cast(Symbol).getClass() == COFF::IMAGE_SYM_CLASS_STATIC) defineSymbol(Asm, Symbol); - - UseBigObj = Sections.size() > COFF::MaxNumberOfSections16; - Header.NumberOfSections = Sections.size(); - Header.NumberOfSymbols = 0; - if (Sections.size() > INT32_MAX) - report_fatal_error( - "PE COFF object files can't have more than 2147483647 sections"); - - assignSectionNumbers(); } void WinCOFFWriter::recordRelocation(MCAssembler &Asm, @@ -990,7 +980,16 @@ static std::time_t getTime() { uint64_t WinCOFFWriter::writeObject(MCAssembler &Asm) { uint64_t StartOffset = W.OS.tell(); + if (Sections.size() > INT32_MAX) + report_fatal_error( + "PE COFF object files can't have more than 2147483647 sections"); + + UseBigObj = Sections.size() > COFF::MaxNumberOfSections16; + Header.NumberOfSections = Sections.size(); + Header.NumberOfSymbols = 0; + setWeakDefaultNames(); + assignSectionNumbers(); if (Mode != DwoOnly) createFileSymbols(Asm); @@ -1144,10 +1143,6 @@ uint64_t WinCOFFWriter::writeObject(MCAssembler &Asm) { return W.OS.tell() - StartOffset; } -int WinCOFFWriter::getSectionNumber(const MCSection &Section) const { - return SectionMap.at(&Section)->Number; -} - //------------------------------------------------------------------------------ // WinCOFFObjectWriter class implementation @@ -1199,10 +1194,6 @@ uint64_t WinCOFFObjectWriter::writeObject(MCAssembler &Asm) { return TotalSize; } -int WinCOFFObjectWriter::getSectionNumber(const MCSection &Section) const { - return ObjWriter->getSectionNumber(Section); -} - MCWinCOFFObjectTargetWriter::MCWinCOFFObjectTargetWriter(unsigned Machine_) : Machine(Machine_) {} diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp index 27e65d60122fd..9d9d9889b3858 100644 --- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -24,7 +24,6 @@ #include "MCTargetDesc/AArch64TargetStreamer.h" #include 
"TargetInfo/AArch64TargetInfo.h" #include "Utils/AArch64BaseInfo.h" -#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" @@ -78,11 +77,6 @@ static cl::opt PtrauthAuthChecks( cl::desc("Check pointer authentication auth/resign failures"), cl::init(Default)); -static cl::opt EnableImportCallOptimization( - "aarch64-win-import-call-optimization", cl::Hidden, - cl::desc("Enable import call optimization for AArch64 Windows"), - cl::init(false)); - #define DEBUG_TYPE "asm-printer" namespace { @@ -95,8 +89,6 @@ class AArch64AsmPrinter : public AsmPrinter { #ifndef NDEBUG unsigned InstsEmitted; #endif - DenseMap>> - SectionToImportedFunctionCalls; public: AArch64AsmPrinter(TargetMachine &TM, std::unique_ptr Streamer) @@ -301,11 +293,6 @@ class AArch64AsmPrinter : public AsmPrinter { MCSymbol *LazyPointer) override; void emitMachOIFuncStubHelperBody(Module &M, const GlobalIFunc &GI, MCSymbol *LazyPointer) override; - - /// Checks if this instruction is part of a sequence that is eligle for import - /// call optimization and, if so, records it to be emitted in the import call - /// section. - void recordIfImportCall(const MachineInstr *BranchInst); }; } // end anonymous namespace @@ -943,38 +930,6 @@ void AArch64AsmPrinter::emitEndOfAsmFile(Module &M) { // Emit stack and fault map information. FM.serializeToFaultMapSection(); - // If import call optimization is enabled, emit the appropriate section. - // We do this whether or not we recorded any import calls. - if (EnableImportCallOptimization && TT.isOSBinFormatCOFF()) { - OutStreamer->switchSection(getObjFileLowering().getImportCallSection()); - - // Section always starts with some magic. - constexpr char ImpCallMagic[12] = "Imp_Call_V1"; - OutStreamer->emitBytes(StringRef{ImpCallMagic, sizeof(ImpCallMagic)}); - - // Layout of this section is: - // Per section that contains calls to imported functions: - // uint32_t SectionSize: Size in bytes for information in this section. - // uint32_t Section Number - // Per call to imported function in section: - // uint32_t Kind: the kind of imported function. - // uint32_t BranchOffset: the offset of the branch instruction in its - // parent section. - // uint32_t TargetSymbolId: the symbol id of the called function. - for (auto &[Section, CallsToImportedFuncs] : - SectionToImportedFunctionCalls) { - unsigned SectionSize = - sizeof(uint32_t) * (2 + 3 * CallsToImportedFuncs.size()); - OutStreamer->emitInt32(SectionSize); - OutStreamer->emitCOFFSecNumber(Section->getBeginSymbol()); - for (auto &[CallsiteSymbol, CalledSymbol] : CallsToImportedFuncs) { - // Kind is always IMAGE_REL_ARM64_DYNAMIC_IMPORT_CALL (0x13). 
- OutStreamer->emitInt32(0x13); - OutStreamer->emitCOFFSecOffset(CallsiteSymbol); - OutStreamer->emitCOFFSymbolIndex(CalledSymbol); - } - } - } } void AArch64AsmPrinter::emitLOHs() { @@ -2748,7 +2703,6 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) { case AArch64::TCRETURNriALL: { emitPtrauthTailCallHardening(MI); - recordIfImportCall(MI); MCInst TmpInst; TmpInst.setOpcode(AArch64::BR); TmpInst.addOperand(MCOperand::createReg(MI->getOperand(0).getReg())); @@ -2760,7 +2714,6 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) { MCOperand Dest; MCInstLowering.lowerOperand(MI->getOperand(0), Dest); - recordIfImportCall(MI); MCInst TmpInst; TmpInst.setOpcode(AArch64::B); TmpInst.addOperand(Dest); @@ -3091,14 +3044,6 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) { TS->emitARM64WinCFISaveAnyRegQPX(MI->getOperand(0).getImm(), -MI->getOperand(2).getImm()); return; - - case AArch64::BLR: - case AArch64::BR: - recordIfImportCall(MI); - MCInst TmpInst; - MCInstLowering.Lower(MI, TmpInst); - EmitToStreamer(*OutStreamer, TmpInst); - return; } // Finally, do the automated lowerings for everything else. @@ -3107,23 +3052,6 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) { EmitToStreamer(*OutStreamer, TmpInst); } -void AArch64AsmPrinter::recordIfImportCall( - const llvm::MachineInstr *BranchInst) { - if (!EnableImportCallOptimization || - !TM.getTargetTriple().isOSBinFormatCOFF()) - return; - - auto [GV, OpFlags] = BranchInst->getMF()->tryGetCalledGlobal(BranchInst); - if (GV && GV->hasDLLImportStorageClass()) { - auto *CallSiteSymbol = MMI->getContext().createNamedTempSymbol("impcall"); - OutStreamer->emitLabel(CallSiteSymbol); - - auto *CalledSymbol = MCInstLowering.GetGlobalValueSymbol(GV, OpFlags); - SectionToImportedFunctionCalls[OutStreamer->getCurrentSectionOnly()] - .push_back({CallSiteSymbol, CalledSymbol}); - } -} - void AArch64AsmPrinter::emitMachOIFuncStubBody(Module &M, const GlobalIFunc &GI, MCSymbol *LazyPointer) { // _ifunc: diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 278dd95cd969d..7e82a433a85ad 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -9450,14 +9450,12 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol // node so that legalize doesn't hack it. 
- const GlobalValue *CalledGlobal = nullptr; - unsigned OpFlags = 0; if (auto *G = dyn_cast(Callee)) { - CalledGlobal = G->getGlobal(); - OpFlags = Subtarget->classifyGlobalFunctionReference(CalledGlobal, - getTargetMachine()); + auto GV = G->getGlobal(); + unsigned OpFlags = + Subtarget->classifyGlobalFunctionReference(GV, getTargetMachine()); if (OpFlags & AArch64II::MO_GOT) { - Callee = DAG.getTargetGlobalAddress(CalledGlobal, DL, PtrVT, 0, OpFlags); + Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags); Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee); } else { const GlobalValue *GV = G->getGlobal(); @@ -9577,8 +9575,6 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge); DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo)); - if (CalledGlobal) - DAG.addCalledGlobal(Ret.getNode(), CalledGlobal, OpFlags); return Ret; } @@ -9590,8 +9586,6 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge); InGlue = Chain.getValue(1); DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo)); - if (CalledGlobal) - DAG.addCalledGlobal(Chain.getNode(), CalledGlobal, OpFlags); uint64_t CalleePopBytes = DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0; diff --git a/llvm/test/CodeGen/AArch64/win-import-call-optimization-nocalls.ll b/llvm/test/CodeGen/AArch64/win-import-call-optimization-nocalls.ll deleted file mode 100644 index 81d6d6369dcbf..0000000000000 --- a/llvm/test/CodeGen/AArch64/win-import-call-optimization-nocalls.ll +++ /dev/null @@ -1,18 +0,0 @@ -; RUN: llc -mtriple=aarch64-pc-windows-msvc -aarch64-win-import-call-optimization < %s | FileCheck %s - -define dso_local void @normal_call() local_unnamed_addr { -entry: - call void @a() - ret void -} -; CHECK-LABEL: normal_call: -; CHECK: bl a - -declare void @a() local_unnamed_addr - -; Even if there are no calls to imported functions, we still need to emit the -; .impcall section. 
- -; CHECK-LABEL .section .impcall,"yi" -; CHECK-NEXT .asciz "Imp_Call_V1" -; CHECK-NOT .secnum diff --git a/llvm/test/CodeGen/AArch64/win-import-call-optimization.ll b/llvm/test/CodeGen/AArch64/win-import-call-optimization.ll deleted file mode 100644 index 6bb118ba1e159..0000000000000 --- a/llvm/test/CodeGen/AArch64/win-import-call-optimization.ll +++ /dev/null @@ -1,48 +0,0 @@ -; RUN: llc -mtriple=aarch64-pc-windows-msvc -aarch64-win-import-call-optimization < %s | FileCheck %s --check-prefix=CHECK-ENABLED -; RUN: llc -mtriple=aarch64-pc-windows-msvc < %s | FileCheck %s --check-prefix=CHECK-DISABLED - -; CHECK-DISABLED-NOT: .section .impcall - -define dso_local void @normal_call() local_unnamed_addr section "nc_sect" { -entry: - call void @a() - call void @a() - ret void -} -; CHECK-ENABLED-LABEL: normal_call: -; CHECK-ENABLED: adrp [[ADRPREG:x[0-9]+]], __imp_a -; CHECK-ENABLED-NEXT: ldr [[LDRREG:x[0-9]+]], [[[ADRPREG]], :lo12:__imp_a] -; CHECK-ENABLED-NEXT: .Limpcall0: -; CHECK-ENABLED-NEXT: blr [[LDRREG]] -; CHECK-ENABLED-NEXT: .Limpcall1: -; CHECK-ENABLED-NEXT: blr [[LDRREG]] - -define dso_local void @tail_call() local_unnamed_addr section "tc_sect" { -entry: - tail call void @b() - ret void -} -; CHECK-ENABLED-LABEL: tail_call: -; CHECK-ENABLED: adrp [[ADRPREG:x[0-9]+]], __imp_b -; CHECK-ENABLED-NEXT: ldr [[LDRREG:x[0-9]+]], [[[ADRPREG]], :lo12:__imp_b] -; CHECK-ENABLED-NEXT: .Limpcall2: -; CHECK-ENABLED-NEXT: br [[LDRREG]] - -declare dllimport void @a() local_unnamed_addr -declare dllimport void @b() local_unnamed_addr - -; CHECK-ENABLED-LABEL .section .impcall,"yi" -; CHECK-ENABLED-NEXT .asciz "Imp_Call_V1" -; CHECK-ENABLED-NEXT .word 32 -; CHECK-ENABLED-NEXT .secnum nc_sect -; CHECK-ENABLED-NEXT .word 19 -; CHECK-ENABLED-NEXT .secoffset .Limpcall0 -; CHECK-ENABLED-NEXT .symidx __imp_a -; CHECK-ENABLED-NEXT .word 19 -; CHECK-ENABLED-NEXT .secoffset .Limpcall1 -; CHECK-ENABLED-NEXT .symidx __imp_a -; CHECK-ENABLED-NEXT .word 20 -; CHECK-ENABLED-NEXT .secnum tc_sect -; CHECK-ENABLED-NEXT .word 19 -; CHECK-ENABLED-NEXT .secoffset .Limpcall2 -; CHECK-ENABLED-NEXT .symidx __imp_b diff --git a/llvm/test/CodeGen/MIR/AArch64/called-globals.mir b/llvm/test/CodeGen/MIR/AArch64/called-globals.mir deleted file mode 100644 index cf0f0a23e2d91..0000000000000 --- a/llvm/test/CodeGen/MIR/AArch64/called-globals.mir +++ /dev/null @@ -1,61 +0,0 @@ -# RUN: llc -mtriple=aarch64-none-linux-gnu -run-pass none -o - %s | FileCheck %s - ---- | - declare dllimport void @callee_func() local_unnamed_addr - - define dso_local void @caller() local_unnamed_addr { - entry: - call void @callee_func() - call void @callee_func() - ret void - } -... 
---- -name: caller -stack: - - { id: 0, name: '', type: spill-slot, offset: -8, size: 8, alignment: 8, - stack-id: default, callee-saved-register: '$lr', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 1, name: '', type: spill-slot, offset: -16, size: 8, alignment: 8, - stack-id: default, callee-saved-register: '$x19', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } -calledGlobals: - - bb: 0 - offset: 7 - callee: callee_func - flags: 144 - - bb: 0 - offset: 8 - callee: callee_func - flags: 144 -body: | - bb.0.entry: - liveins: $x19, $lr - - early-clobber $sp = frame-setup STRXpre killed $x19, $sp, -16 :: (store (s64) into %stack.1) - frame-setup SEH_SaveReg_X 19, -16 - frame-setup STRXui killed $lr, $sp, 1 :: (store (s64) into %stack.0) - frame-setup SEH_SaveReg 30, 8 - frame-setup SEH_PrologEnd - $x19 = ADRP target-flags(aarch64-page, aarch64-got, aarch64-dllimport) @callee_func - renamable $x19 = LDRXui killed $x19, target-flags(aarch64-pageoff, aarch64-got, aarch64-nc, aarch64-dllimport) @callee_func - BLR renamable $x19, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp - BLR killed renamable $x19, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp - frame-destroy SEH_EpilogStart - $lr = frame-destroy LDRXui $sp, 1 :: (load (s64) from %stack.0) - frame-destroy SEH_SaveReg 30, 8 - early-clobber $sp, $x19 = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.1) - frame-destroy SEH_SaveReg_X 19, -16 - frame-destroy SEH_EpilogEnd - RET undef $lr -... - -# CHECK-LABEL: calledGlobals: -# CHECK-NEXT: - bb: 0 -# CHECK-NEXT: offset: 7 -# CHECK-NEXT: callee: callee_func -# CHECK-NEXT: flags: 144 -# CHECK-NEXT: - bb: 0 -# CHECK-NEXT: offset: 8 -# CHECK-NEXT: callee: callee_func -# CHECK-NEXT: flags: 144 diff --git a/llvm/test/CodeGen/MIR/X86/call-site-info-error1.mir b/llvm/test/CodeGen/MIR/X86/call-site-info-error1.mir index e4dab779216a8..096a80f77dbb6 100644 --- a/llvm/test/CodeGen/MIR/X86/call-site-info-error1.mir +++ b/llvm/test/CodeGen/MIR/X86/call-site-info-error1.mir @@ -1,5 +1,5 @@ # RUN: not llc -mtriple=x86_64-- -run-pass none -debug-entry-values %s -o - 2>&1 | FileCheck %s -# CHECK: baa instruction block out of range. Unable to reference bb:1 +# CHECK: baa call instruction block out of range. Unable to reference bb:1 --- | define dso_local i32 @baa(i32 %a) local_unnamed_addr { entry: diff --git a/llvm/test/CodeGen/MIR/X86/call-site-info-error2.mir b/llvm/test/CodeGen/MIR/X86/call-site-info-error2.mir index 183610b326eeb..bd5b2451a8d76 100644 --- a/llvm/test/CodeGen/MIR/X86/call-site-info-error2.mir +++ b/llvm/test/CodeGen/MIR/X86/call-site-info-error2.mir @@ -1,5 +1,5 @@ # RUN: not llc -mtriple=x86_64-- -run-pass none -debug-entry-values %s -o - 2>&1 | FileCheck %s -# CHECK: baa instruction offset out of range. Unable to reference instruction at bb: 0 at offset:1 +# CHECK: baa call instruction offset out of range. 
Unable to reference instruction at bb: 0 at offset:1 --- | define dso_local i32 @baa(i32 %a) local_unnamed_addr { entry: diff --git a/llvm/test/MC/AArch64/win-import-call-optimization.s b/llvm/test/MC/AArch64/win-import-call-optimization.s deleted file mode 100644 index f26e17b9b62cc..0000000000000 --- a/llvm/test/MC/AArch64/win-import-call-optimization.s +++ /dev/null @@ -1,72 +0,0 @@ -// RUN: llvm-mc -triple aarch64-windows-msvc -filetype obj -o %t.obj %s -// RUN: llvm-readobj --sections --sd --relocs %t.obj | FileCheck %s - -.section nc_sect,"xr" -normal_call: - str x30, [sp, #-16]! // 8-byte Folded Spill - adrp x8, __imp_a - ldr x8, [x8, :lo12:__imp_a] -.Limpcall0: - blr x8 - ldr x30, [sp], #16 // 8-byte Folded Reload - ret - -.section tc_sect,"xr" -tail_call: - adrp x8, __imp_b - ldr x8, [x8, :lo12:__imp_b] -.Limpcall1: - br x8 - -.section .impcall,"yi" -.asciz "Imp_Call_V1" -.word 20 -.secnum nc_sect -.word 19 -.secoffset .Limpcall0 -.symidx __imp_a -.word 20 -.secnum tc_sect -.word 19 -.secoffset .Limpcall1 -.symidx __imp_b - -// CHECK-LABEL: Name: .impcall (2E 69 6D 70 63 61 6C 6C) -// CHECK-NEXT: VirtualSize: 0x0 -// CHECK-NEXT: VirtualAddress: 0x0 -// CHECK-NEXT: RawDataSize: 52 -// CHECK-NEXT: PointerToRawData: 0x150 -// CHECK-NEXT: PointerToRelocations: 0x0 -// CHECK-NEXT: PointerToLineNumbers: 0x0 -// CHECK-NEXT: RelocationCount: 0 -// CHECK-NEXT: LineNumberCount: 0 -// CHECK-NEXT: Characteristics [ -// CHECK-NEXT: IMAGE_SCN_ALIGN_4BYTES -// CHECK-NEXT: IMAGE_SCN_LNK_INFO -// CHECK-NEXT: ] -// CHECK-NEXT: SectionData ( -// CHECK-NEXT: 0000: 496D705F 43616C6C 5F563100 14000000 |Imp_Call_V1.....| -// CHECK-NEXT: 0010: -// CHECK-SAME: [[#%.2X,NCSECT:]]000000 -// CHECK-SAME: 13000000 -// CHECK-SAME: [[#%.2X,NCOFFSET:]]000000 -// CHECK-SAME: [[#%.2X,NCSYM:]]000000 -// CHECK-NEXT: 0020: -// CHECK-SAME: 14000000 -// CHECK-SAME: [[#%.2X,TCSECT:]]000000 -// CHECK-SAME: 13000000 -// CHECK-SAME: [[#%.2X,TCOFFSET:]]000000 -// CHECK-NEXT: 0030: -// CHECK-SAME: [[#%.2X,TCSYM:]]000000 -// CHECK-NEXT: ) - -// CHECK-LABEL: Relocations [ -// CHECK-NEXT: Section ([[#%u,NCSECT]]) nc_sect { -// CHECK-NEXT: 0x[[#%x,NCOFFSET - 8]] IMAGE_REL_ARM64_PAGEBASE_REL21 __imp_a ([[#%u,NCSYM]]) -// CHECK-NEXT: 0x[[#%x,NCOFFSET - 4]] IMAGE_REL_ARM64_PAGEOFFSET_12L __imp_a ([[#%u,NCSYM]]) -// CHECK-NEXT: } -// CHECK-NEXT: Section ([[#%u,TCSECT]]) tc_sect { -// CHECK-NEXT: 0x[[#%x,TCOFFSET - 8]] IMAGE_REL_ARM64_PAGEBASE_REL21 __imp_b ([[#%u,TCSYM]]) -// CHECK-NEXT: 0x[[#%x,TCOFFSET - 4]] IMAGE_REL_ARM64_PAGEOFFSET_12L __imp_b ([[#%u,TCSYM]]) -// CHECK-NEXT: } -// CHECK-NEXT: ] diff --git a/llvm/test/MC/COFF/bad-parse.s b/llvm/test/MC/COFF/bad-parse.s deleted file mode 100644 index 2491f41abeb4e..0000000000000 --- a/llvm/test/MC/COFF/bad-parse.s +++ /dev/null @@ -1,13 +0,0 @@ -// RUN: not llvm-mc -filetype=obj -triple i386-pc-win32 %s 2>&1 | FileCheck %s - - .data - -// CHECK: [[@LINE+1]]:{{[0-9]+}}: error: expected identifier in directive - .secnum -// CHECK: [[@LINE+1]]:{{[0-9]+}}: error: unexpected token in directive - .secnum section extra - -// CHECK: [[@LINE+1]]:{{[0-9]+}}: error: expected identifier in directive - .secoffset -// CHECK: [[@LINE+1]]:{{[0-9]+}}: error: unexpected token in directive - .secoffset section extra From 9bc9486a2af53377fe3b93bdf8b71e21d03b424e Mon Sep 17 00:00:00 2001 From: Brad Smith Date: Mon, 13 Jan 2025 14:37:06 -0500 Subject: [PATCH 098/102] [lldb] Add amd64 ArchSpec (#122533) amd64 is used on OpenBSD. 
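For illustration, a minimal sketch of the behavior this enables, mirroring the ArchSpecTest case added below (the `Ok` variable is illustrative; the triple spelling is the one OpenBSD toolchains report):

    #include "lldb/Utility/ArchSpec.h"
    using namespace lldb_private;

    // "amd64" now resolves to the x86_64 architecture while keeping its
    // own core name, and cores_match() treats that core as compatible
    // with plain x86_64 unless an exact match is enforced.
    ArchSpec AS;
    bool Ok = AS.SetTriple("amd64-unknown-openbsd"); // now succeeds
    // AS.GetTriple().getArch()  == llvm::Triple::x86_64
    // AS.GetArchitectureName() == "amd64"
    // AS.GetCore()             == ArchSpec::eCore_x86_64_amd64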
--- lldb/include/lldb/Utility/ArchSpec.h | 2 ++ lldb/source/Utility/ArchSpec.cpp | 4 ++++ lldb/unittests/Utility/ArchSpecTest.cpp | 6 ++++++ 3 files changed, 12 insertions(+) diff --git a/lldb/include/lldb/Utility/ArchSpec.h b/lldb/include/lldb/Utility/ArchSpec.h index 2a74058673bae..7e9bc23a75acb 100644 --- a/lldb/include/lldb/Utility/ArchSpec.h +++ b/lldb/include/lldb/Utility/ArchSpec.h @@ -215,6 +215,8 @@ class ArchSpec { eCore_x86_64_x86_64, eCore_x86_64_x86_64h, // Haswell enabled x86_64 + eCore_x86_64_amd64, + eCore_hexagon_generic, eCore_hexagon_hexagonv4, eCore_hexagon_hexagonv5, diff --git a/lldb/source/Utility/ArchSpec.cpp b/lldb/source/Utility/ArchSpec.cpp index 85bb85044ec15..b13e8ff1ec373 100644 --- a/lldb/source/Utility/ArchSpec.cpp +++ b/lldb/source/Utility/ArchSpec.cpp @@ -218,6 +218,9 @@ static const CoreDefinition g_core_definitions[] = { ArchSpec::eCore_x86_64_x86_64, "x86_64"}, {eByteOrderLittle, 8, 1, 15, llvm::Triple::x86_64, ArchSpec::eCore_x86_64_x86_64h, "x86_64h"}, + {eByteOrderLittle, 8, 1, 15, llvm::Triple::x86_64, + ArchSpec::eCore_x86_64_amd64, "amd64"}, + {eByteOrderLittle, 4, 4, 4, llvm::Triple::hexagon, ArchSpec::eCore_hexagon_generic, "hexagon"}, {eByteOrderLittle, 4, 4, 4, llvm::Triple::hexagon, @@ -1227,6 +1230,7 @@ static bool cores_match(const ArchSpec::Core core1, const ArchSpec::Core core2, break; case ArchSpec::eCore_x86_64_x86_64h: + case ArchSpec::eCore_x86_64_amd64: if (!enforce_exact_match) { try_inverse = false; if (core2 == ArchSpec::eCore_x86_64_x86_64) diff --git a/lldb/unittests/Utility/ArchSpecTest.cpp b/lldb/unittests/Utility/ArchSpecTest.cpp index de3590b73bbaa..74a4b48456b01 100644 --- a/lldb/unittests/Utility/ArchSpecTest.cpp +++ b/lldb/unittests/Utility/ArchSpecTest.cpp @@ -129,6 +129,12 @@ TEST(ArchSpecTest, TestSetTriple) { EXPECT_STREQ("msp430", AS.GetArchitectureName()); EXPECT_EQ(ArchSpec::eCore_msp430, AS.GetCore()); + AS = ArchSpec(); + EXPECT_TRUE(AS.SetTriple("amd64-unknown-openbsd")); + EXPECT_EQ(llvm::Triple::x86_64, AS.GetTriple().getArch()); + EXPECT_STREQ("amd64", AS.GetArchitectureName()); + EXPECT_EQ(ArchSpec::eCore_x86_64_amd64, AS.GetCore()); + // Various flavors of invalid triples. AS = ArchSpec(); EXPECT_FALSE(AS.SetTriple("unknown-unknown-unknown")); From 92d5c6294ade3f225e54e623970dff4dd23f79c0 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Mon, 13 Jan 2025 11:37:37 -0800 Subject: [PATCH 099/102] [ADT] Deprecate PointerUnion::{is,get} (NFC) (#122623) PointerUnion::{is,get} have been soft deprecated in PointerUnion.h: // FIXME: Replace the uses of is(), get() and dyn_cast() with // isa, cast and the llvm::dyn_cast This patch actually deprecates them with [[deprecated]]. I'm not touching PointerUnion::dyn_cast for now because we have not migrated away from it yet. --- llvm/include/llvm/ADT/PointerUnion.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/llvm/include/llvm/ADT/PointerUnion.h b/llvm/include/llvm/ADT/PointerUnion.h index 7d4ed02b62262..cdbd76d7f505b 100644 --- a/llvm/include/llvm/ADT/PointerUnion.h +++ b/llvm/include/llvm/ADT/PointerUnion.h @@ -147,12 +147,18 @@ class PointerUnion // isa, cast and the llvm::dyn_cast /// Test if the Union currently holds the type matching T. - template inline bool is() const { return isa(*this); } + template + [[deprecated("Use isa instead")]] + inline bool is() const { + return isa(*this); + } /// Returns the value of the specified pointer type. /// /// If the specified pointer type is incorrect, assert. 
- template inline T get() const { + template + [[deprecated("Use cast instead")]] + inline T get() const { assert(isa(*this) && "Invalid accessor called"); return cast(*this); } From 68123b39651c148e85a3d89f6aaee0d137dad720 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Mon, 13 Jan 2025 11:38:00 -0800 Subject: [PATCH 100/102] [AST] Migrate away from PointerUnion::dyn_cast (NFC) (#122651) Note that PointerUnion::dyn_cast has been soft deprecated in PointerUnion.h: // FIXME: Replace the uses of is(), get() and dyn_cast() with // isa, cast and the llvm::dyn_cast Literal migration would result in dyn_cast_if_present (see the definition of PointerUnion::dyn_cast), but this patch uses dyn_cast because we expect Ptr to be nonnull. --- clang/include/clang/AST/DeclBase.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/include/clang/AST/DeclBase.h b/clang/include/clang/AST/DeclBase.h index 82932e098c86f..77abd8b657a61 100644 --- a/clang/include/clang/AST/DeclBase.h +++ b/clang/include/clang/AST/DeclBase.h @@ -1334,7 +1334,7 @@ class DeclListNode { reference operator*() const { assert(Ptr && "dereferencing end() iterator"); - if (DeclListNode *CurNode = Ptr.dyn_cast()) + if (DeclListNode *CurNode = dyn_cast(Ptr)) return CurNode->D; return cast(Ptr); } @@ -1344,7 +1344,7 @@ class DeclListNode { inline iterator &operator++() { // ++It assert(!Ptr.isNull() && "Advancing empty iterator"); - if (DeclListNode *CurNode = Ptr.dyn_cast()) + if (DeclListNode *CurNode = dyn_cast(Ptr)) Ptr = CurNode->Rest; else Ptr = nullptr; From 54b3f3b7ecf34d1b4bd5176119640e75097dd3ac Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Mon, 13 Jan 2025 11:38:20 -0800 Subject: [PATCH 101/102] [Analysis] Migrate away from PointerUnion::dyn_cast (NFC) (#122652) Note that PointerUnion::dyn_cast has been soft deprecated in PointerUnion.h: // FIXME: Replace the uses of is(), get() and dyn_cast() with // isa, cast and the llvm::dyn_cast Literal migration would result in dyn_cast_if_present (see the definition of PointerUnion::dyn_cast), but this patch uses dyn_cast because we expect Ctx->FunArgs to be nonnull. --- clang/lib/Analysis/ThreadSafetyCommon.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/Analysis/ThreadSafetyCommon.cpp b/clang/lib/Analysis/ThreadSafetyCommon.cpp index 050daee1168d4..13cd7e26dc16f 100644 --- a/clang/lib/Analysis/ThreadSafetyCommon.cpp +++ b/clang/lib/Analysis/ThreadSafetyCommon.cpp @@ -336,7 +336,7 @@ til::SExpr *SExprBuilder::translateDeclRefExpr(const DeclRefExpr *DRE, : (cast(D)->getCanonicalDecl() == Canonical)) { // Substitute call arguments for references to function parameters if (const Expr *const *FunArgs = - Ctx->FunArgs.dyn_cast()) { + dyn_cast(Ctx->FunArgs)) { assert(I < Ctx->NumArgs); return translate(FunArgs[I], Ctx->Prev); } From 39bf93460652061f2522a29a7bd6a90ff39b1f00 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Mon, 13 Jan 2025 11:40:24 -0800 Subject: [PATCH 102/102] Address comments. --- clang/lib/CodeGen/CGCall.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index a71af0141709f..e0cf6ca69f0df 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -4507,7 +4507,7 @@ void CodeGenFunction::EmitCallArgs( // First, if a prototype was provided, use those argument types. 
   bool IsVariadic = false;
   if (Prototype.P) {
-    const auto *MD = dyn_cast_if_present<const ObjCMethodDecl *>(Prototype.P);
+    const auto *MD = dyn_cast<const ObjCMethodDecl *>(Prototype.P);
     if (MD) {
       IsVariadic = MD->isVariadic();
       ExplicitCC = getCallingConventionForDecl(
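
For reference, a minimal sketch of the is()/get()/dyn_cast() migration that the PointerUnion patches above describe (the union, `PU`, `Value`, and `demo` are hypothetical names; the deprecated members and their free-function replacements are the ones quoted in the FIXME above):

    #include "llvm/ADT/PointerUnion.h"
    #include "llvm/Support/Casting.h"

    static int Value = 42;

    int demo() {
      llvm::PointerUnion<int *, float *> PU = &Value;

      // Deprecated member-function style (now warns via [[deprecated]]):
      //   if (PU.is<int *>())
      //     return *PU.get<int *>();

      // Preferred free-function style:
      if (llvm::isa<int *>(PU))        // replaces PU.is<int *>()
        return *llvm::cast<int *>(PU); // replaces PU.get<int *>()

      // The literal migration of PU.dyn_cast<int *>() is
      // dyn_cast_if_present, which also tolerates a null union; plain
      // dyn_cast is used when the union is known to be nonnull.
      if (int *IP = llvm::dyn_cast_if_present<int *>(PU))
        return *IP;
      return 0;
    }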