diff --git a/llvm/lib/Target/DirectX/DXILOpLowering.cpp b/llvm/lib/Target/DirectX/DXILOpLowering.cpp
index 9f124394363a3..0df36b326e5dc 100644
--- a/llvm/lib/Target/DirectX/DXILOpLowering.cpp
+++ b/llvm/lib/Target/DirectX/DXILOpLowering.cpp
@@ -531,23 +531,53 @@ class OpLowerer {
         return make_error<StringError>(
             "typedBufferStore data must be a vector of 4 elements",
             inconvertibleErrorCode());
-      Value *Data0 =
-          IRB.CreateExtractElement(Data, ConstantInt::get(Int32Ty, 0));
-      Value *Data1 =
-          IRB.CreateExtractElement(Data, ConstantInt::get(Int32Ty, 1));
-      Value *Data2 =
-          IRB.CreateExtractElement(Data, ConstantInt::get(Int32Ty, 2));
-      Value *Data3 =
-          IRB.CreateExtractElement(Data, ConstantInt::get(Int32Ty, 3));
-
-      std::array<Value *, 8> Args{Handle, Index0, Index1, Data0,
-                                  Data1,  Data2,  Data3,  Mask};
+
+      // Since we're post-scalarizer, we likely have a vector that's constructed
+      // solely for the argument of the store. If so, just use the scalar values
+      // from before they're inserted into the temporary.
+      std::array<Value *, 4> DataElements{nullptr, nullptr, nullptr, nullptr};
+      auto *IEI = dyn_cast<InsertElementInst>(Data);
+      while (IEI) {
+        auto *IndexOp = dyn_cast<ConstantInt>(IEI->getOperand(2));
+        // A non-constant index could target any lane, and an out-of-range one
+        // yields poison. Either way we can't forward past this insert, so
+        // leave the remaining lanes to be extracted from the vector below.
+        if (!IndexOp || IndexOp->getValue().uge(4))
+          break;
+        size_t IndexVal = IndexOp->getZExtValue();
+        // We walk from the last insert back to the first, so the first value
+        // recorded for a lane is the one live in the final vector. Never let
+        // an earlier insert into the same lane overwrite it.
+        if (!DataElements[IndexVal])
+          DataElements[IndexVal] = IEI->getOperand(1);
+        IEI = dyn_cast<InsertElementInst>(IEI->getOperand(0));
+      }
+
+      // If for some reason we weren't able to forward the arguments from the
+      // scalarizer artifact, then we need to actually extract elements from the
+      // vector.
+      for (int I = 0, E = 4; I != E; ++I)
+        if (DataElements[I] == nullptr)
+          DataElements[I] =
+              IRB.CreateExtractElement(Data, ConstantInt::get(Int32Ty, I));
+
+      std::array<Value *, 8> Args{
+          Handle,          Index0,          Index1,          DataElements[0],
+          DataElements[1], DataElements[2], DataElements[3], Mask};
       Expected<CallInst *> OpCall =
           OpBuilder.tryCreateOp(OpCode::BufferStore, Args, CI->getName());
       if (Error E = OpCall.takeError())
         return E;
 
       CI->eraseFromParent();
+      // Clean up any leftover `insertelement`s
+      IEI = dyn_cast<InsertElementInst>(Data);
+      while (IEI && IEI->use_empty()) {
+        InsertElementInst *Tmp = IEI;
+        IEI = dyn_cast<InsertElementInst>(IEI->getOperand(0));
+        Tmp->eraseFromParent();
+      }
+
       return Error::success();
     });
   }
diff --git a/llvm/test/CodeGen/DirectX/BufferStore.ll b/llvm/test/CodeGen/DirectX/BufferStore.ll
index 9ea7735be59c8..81cc5fd328e0a 100644
--- a/llvm/test/CodeGen/DirectX/BufferStore.ll
+++ b/llvm/test/CodeGen/DirectX/BufferStore.ll
@@ -90,3 +90,52 @@ define void @storei16(<4 x i16> %data, i32 %index) {
 
   ret void
 }
+
+define void @store_scalarized_floats(float %data0, float %data1, float %data2, float %data3, i32 %index) {
+
+  ; CHECK: [[BIND:%.*]] = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 217,
+  ; CHECK: [[HANDLE:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[BIND]]
+  %buffer = call target("dx.TypedBuffer", <4 x float>, 1, 0, 0)
+      @llvm.dx.handle.fromBinding.tdx.TypedBuffer_v4f32_1_0_0(
+          i32 0, i32 0, i32 1, i32 0, i1 false)
+
+  ; We shouldn't end up with any inserts/extracts.
+  ; CHECK-NOT: insertelement
+  ; CHECK-NOT: extractelement
+
+  ; CHECK: call void @dx.op.bufferStore.f32(i32 69, %dx.types.Handle [[HANDLE]], i32 %index, i32 undef, float %data0, float %data1, float %data2, float %data3, i8 15)
+  %vec.upto0 = insertelement <4 x float> poison, float %data0, i64 0
+  %vec.upto1 = insertelement <4 x float> %vec.upto0, float %data1, i64 1
+  %vec.upto2 = insertelement <4 x float> %vec.upto1, float %data2, i64 2
+  %vec = insertelement <4 x float> %vec.upto2, float %data3, i64 3
+  call void @llvm.dx.typedBufferStore(
+      target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer,
+      i32 %index, <4 x float> %vec)
+
+  ret void
+}
+
+define void @store_reinserted_lane(float %data0, float %data1, float %data2, float %data3, float %newdata0, i32 %index) {
+
+  ; CHECK: [[BIND:%.*]] = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 217,
+  ; CHECK: [[HANDLE:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[BIND]]
+  %buffer = call target("dx.TypedBuffer", <4 x float>, 1, 0, 0)
+      @llvm.dx.handle.fromBinding.tdx.TypedBuffer_v4f32_1_0_0(
+          i32 0, i32 0, i32 1, i32 0, i1 false)
+
+  ; Lane 0 is written twice; the later insert is the value that must be stored.
+  ; CHECK-NOT: insertelement
+  ; CHECK-NOT: extractelement
+
+  ; CHECK: call void @dx.op.bufferStore.f32(i32 69, %dx.types.Handle [[HANDLE]], i32 %index, i32 undef, float %newdata0, float %data1, float %data2, float %data3, i8 15)
+  %vec.upto0 = insertelement <4 x float> poison, float %data0, i64 0
+  %vec.upto1 = insertelement <4 x float> %vec.upto0, float %data1, i64 1
+  %vec.upto2 = insertelement <4 x float> %vec.upto1, float %data2, i64 2
+  %vec.upto3 = insertelement <4 x float> %vec.upto2, float %data3, i64 3
+  %vec = insertelement <4 x float> %vec.upto3, float %newdata0, i64 0
+  call void @llvm.dx.typedBufferStore(
+      target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer,
+      i32 %index, <4 x float> %vec)
+
+  ret void
+}