diff --git a/llvm/lib/Target/DirectX/DXILCBufferAccess.cpp b/llvm/lib/Target/DirectX/DXILCBufferAccess.cpp index 7559f61b4cfb9..44277971acd60 100644 --- a/llvm/lib/Target/DirectX/DXILCBufferAccess.cpp +++ b/llvm/lib/Target/DirectX/DXILCBufferAccess.cpp @@ -11,9 +11,11 @@ #include "llvm/Frontend/HLSL/CBuffer.h" #include "llvm/Frontend/HLSL/HLSLResource.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/IntrinsicsDirectX.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" +#include "llvm/Support/FormatVariadic.h" #include "llvm/Transforms/Utils/Local.h" #define DEBUG_TYPE "dxil-cbuffer-access" @@ -54,71 +56,106 @@ struct CBufferRowIntrin { } } }; -} // namespace -static size_t getOffsetForCBufferGEP(GEPOperator *GEP, GlobalVariable *Global, - const DataLayout &DL) { - // Since we should always have a constant offset, we should only ever have a - // single GEP of indirection from the Global. - assert(GEP->getPointerOperand() == Global && - "Indirect access to resource handle"); +// Helper for creating CBuffer handles and loading data from them +struct CBufferResource { + GlobalVariable *GVHandle; + GlobalVariable *Member; + size_t MemberOffset; - APInt ConstantOffset(DL.getIndexTypeSizeInBits(GEP->getType()), 0); - bool Success = GEP->accumulateConstantOffset(DL, ConstantOffset); - (void)Success; - assert(Success && "Offsets into cbuffer globals must be constant"); + LoadInst *Handle = nullptr; // Set by createAndSetCurrentHandle() - if (auto *ATy = dyn_cast(Global->getValueType())) - ConstantOffset = hlsl::translateCBufArrayOffset(DL, ConstantOffset, ATy); + CBufferResource(GlobalVariable *GVHandle, GlobalVariable *Member, + size_t MemberOffset) + : GVHandle(GVHandle), Member(Member), MemberOffset(MemberOffset) {} - return ConstantOffset.getZExtValue(); -} + const DataLayout &getDataLayout() { return GVHandle->getDataLayout(); } + Type *getValueType() { return Member->getValueType(); } + iterator_range users() { + return Member->users(); + } + + /// Get the byte offset 
of a Pointer-typed Value * `Val` relative to Member. + /// `Val` can either be Member itself, or a GEP of a constant offset from + /// Member + size_t getOffsetForCBufferGEP(Value *Val) { + assert(isa(Val->getType()) && + "Expected a pointer-typed value"); + + if (Val == Member) + return 0; + + if (auto *GEP = dyn_cast(Val)) { + // Since we should always have a constant offset, we should only ever have + // a single GEP of indirection from the Global. + assert(GEP->getPointerOperand() == Member && + "Indirect access to resource handle"); + + const DataLayout &DL = getDataLayout(); + APInt ConstantOffset(DL.getIndexTypeSizeInBits(GEP->getType()), 0); + bool Success = GEP->accumulateConstantOffset(DL, ConstantOffset); + (void)Success; + assert(Success && "Offsets into cbuffer globals must be constant"); + + if (auto *ATy = dyn_cast(Member->getValueType())) + ConstantOffset = + hlsl::translateCBufArrayOffset(DL, ConstantOffset, ATy); + + return ConstantOffset.getZExtValue(); + } -/// Replace access via cbuffer global with a load from the cbuffer handle -/// itself. 
-static void replaceAccess(LoadInst *LI, GlobalVariable *Global, - GlobalVariable *HandleGV, size_t BaseOffset, - SmallVectorImpl &DeadInsts) { - const DataLayout &DL = HandleGV->getDataLayout(); + llvm_unreachable("Expected Val to be a GlobalVariable or GEP"); + } - size_t Offset = BaseOffset; - if (auto *GEP = dyn_cast(LI->getPointerOperand())) - Offset += getOffsetForCBufferGEP(GEP, Global, DL); - else if (LI->getPointerOperand() != Global) - llvm_unreachable("Load instruction doesn't reference cbuffer global"); + /// Create a handle for this cbuffer resource using the IRBuilder `Builder` + /// and sets the handle as the current one to use for subsequent calls to + /// `loadValue` + void createAndSetCurrentHandle(IRBuilder<> &Builder) { + Handle = Builder.CreateLoad(GVHandle->getValueType(), GVHandle, + GVHandle->getName()); + } - IRBuilder<> Builder(LI); - auto *Handle = Builder.CreateLoad(HandleGV->getValueType(), HandleGV, - HandleGV->getName()); - - Type *Ty = LI->getType(); - CBufferRowIntrin Intrin(DL, Ty->getScalarType()); - // The cbuffer consists of some number of 16-byte rows. - unsigned int CurrentRow = Offset / hlsl::CBufferRowSizeInBytes; - unsigned int CurrentIndex = - (Offset % hlsl::CBufferRowSizeInBytes) / Intrin.EltSize; - - auto *CBufLoad = Builder.CreateIntrinsic( - Intrin.RetTy, Intrin.IID, - {Handle, ConstantInt::get(Builder.getInt32Ty(), CurrentRow)}, nullptr, - LI->getName()); - auto *Elt = - Builder.CreateExtractValue(CBufLoad, {CurrentIndex++}, LI->getName()); - - Value *Result = nullptr; - unsigned int Remaining = - ((DL.getTypeSizeInBits(Ty) / 8) / Intrin.EltSize) - 1; - if (Remaining == 0) { - // We only have a single element, so we're done. - Result = Elt; - - // However, if we loaded a <1 x T>, then we need to adjust the type here. 
- if (auto *VT = dyn_cast(LI->getType())) { - assert(VT->getNumElements() == 1 && "Can't have multiple elements here"); - Result = Builder.CreateInsertElement(PoisonValue::get(VT), Result, - Builder.getInt32(0)); + /// Load a value of type `Ty` at offset `Offset` using the handle from the + /// last call to `createAndSetCurrentHandle` + Value *loadValue(IRBuilder<> &Builder, Type *Ty, size_t Offset, + const Twine &Name = "") { + assert(Handle && + "Expected a handle for this cbuffer global resource to be created " + "before loading a value from it"); + const DataLayout &DL = getDataLayout(); + + size_t TargetOffset = MemberOffset + Offset; + CBufferRowIntrin Intrin(DL, Ty->getScalarType()); + // The cbuffer consists of some number of 16-byte rows. + unsigned int CurrentRow = TargetOffset / hlsl::CBufferRowSizeInBytes; + unsigned int CurrentIndex = + (TargetOffset % hlsl::CBufferRowSizeInBytes) / Intrin.EltSize; + + auto *CBufLoad = Builder.CreateIntrinsic( + Intrin.RetTy, Intrin.IID, + {Handle, ConstantInt::get(Builder.getInt32Ty(), CurrentRow)}, nullptr, + Name + ".load"); + auto *Elt = Builder.CreateExtractValue(CBufLoad, {CurrentIndex++}, + Name + ".extract"); + + Value *Result = nullptr; + unsigned int Remaining = + ((DL.getTypeSizeInBits(Ty) / 8) / Intrin.EltSize) - 1; + + if (Remaining == 0) { + // We only have a single element, so we're done. + Result = Elt; + + // However, if we loaded a <1 x T>, then we need to adjust the type here. + if (auto *VT = dyn_cast(Ty)) { + assert(VT->getNumElements() == 1 && + "Can't have multiple elements here"); + Result = Builder.CreateInsertElement(PoisonValue::get(VT), Result, + Builder.getInt32(0), Name); + } + return Result; } - } else { + // Walk each element and extract it, wrapping to new rows as needed. 
SmallVector Extracts{Elt}; while (Remaining--) { @@ -128,40 +165,138 @@ static void replaceAccess(LoadInst *LI, GlobalVariable *Global, CBufLoad = Builder.CreateIntrinsic( Intrin.RetTy, Intrin.IID, {Handle, ConstantInt::get(Builder.getInt32Ty(), ++CurrentRow)}, - nullptr, LI->getName()); + nullptr, Name + ".load"); Extracts.push_back(Builder.CreateExtractValue(CBufLoad, {CurrentIndex++}, - LI->getName())); + Name + ".extract")); } // Finally, we build up the original loaded value. Result = PoisonValue::get(Ty); for (int I = 0, E = Extracts.size(); I < E; ++I) Result = - Builder.CreateInsertElement(Result, Extracts[I], Builder.getInt32(I)); + Builder.CreateInsertElement(Result, Extracts[I], Builder.getInt32(I), + Name + formatv(".upto{}", I)); + return Result; } +}; +} // namespace + +/// Replace load via cbuffer global with a load from the cbuffer handle itself. +static void replaceLoad(LoadInst *LI, CBufferResource &CBR, + SmallVectorImpl &DeadInsts) { + size_t Offset = CBR.getOffsetForCBufferGEP(LI->getPointerOperand()); + IRBuilder<> Builder(LI); + CBR.createAndSetCurrentHandle(Builder); + Value *Result = CBR.loadValue(Builder, LI->getType(), Offset, LI->getName()); LI->replaceAllUsesWith(Result); DeadInsts.push_back(LI); } -static void replaceAccessesWithHandle(GlobalVariable *Global, - GlobalVariable *HandleGV, - size_t BaseOffset) { +/// This function recursively copies N array elements from the cbuffer resource +/// CBR to the MemCpy Destination. Recursion is used to unravel multidimensional +/// arrays into a sequence of scalar/vector extracts and stores. 
+static void copyArrayElemsForMemCpy(IRBuilder<> &Builder, MemCpyInst *MCI, + CBufferResource &CBR, ArrayType *ArrTy, + size_t ArrOffset, size_t N, + const Twine &Name = "") { + const DataLayout &DL = MCI->getDataLayout(); + Type *ElemTy = ArrTy->getElementType(); + size_t ElemTySize = DL.getTypeAllocSize(ElemTy); + for (unsigned I = 0; I < N; ++I) { + size_t Offset = ArrOffset + I * ElemTySize; + + // Recursively copy nested arrays + if (ArrayType *ElemArrTy = dyn_cast(ElemTy)) { + copyArrayElemsForMemCpy(Builder, MCI, CBR, ElemArrTy, Offset, + ElemArrTy->getNumElements(), Name); + continue; + } + + // Load CBuffer value and store it in Dest + APInt CBufArrayOffset( + DL.getIndexTypeSizeInBits(MCI->getSource()->getType()), Offset); + CBufArrayOffset = + hlsl::translateCBufArrayOffset(DL, CBufArrayOffset, ArrTy); + Value *CBufferVal = + CBR.loadValue(Builder, ElemTy, CBufArrayOffset.getZExtValue(), Name); + Value *GEP = + Builder.CreateInBoundsGEP(Builder.getInt8Ty(), MCI->getDest(), + {Builder.getInt32(Offset)}, Name + ".dest"); + Builder.CreateStore(CBufferVal, GEP, MCI->isVolatile()); + } +} + +/// Replace memcpy from a cbuffer global with loads from the cbuffer handle +/// and stores to the destination. Assumes the cbuffer global is an array, and +/// the length of bytes to copy is divisible by array element allocation size. +/// The memcpy source must also be a direct cbuffer global reference, not a GEP. 
+static void replaceMemCpy(MemCpyInst *MCI, CBufferResource &CBR) { + + ArrayType *ArrTy = dyn_cast(CBR.getValueType()); + assert(ArrTy && "MemCpy lowering is only supported for array types"); + + // This assumption vastly simplifies the implementation + if (MCI->getSource() != CBR.Member) + reportFatalUsageError( + "Expected MemCpy source to be a cbuffer global variable"); + + auto *Length = cast<ConstantInt>(MCI->getLength()); // must be constant + uint64_t ByteLength = Length->getZExtValue(); + + // If length to copy is zero, no memcpy is needed + if (ByteLength == 0) { + MCI->eraseFromParent(); + return; + } + + const DataLayout &DL = CBR.getDataLayout(); + + Type *ElemTy = ArrTy->getElementType(); + size_t ElemSize = DL.getTypeAllocSize(ElemTy); + assert(ByteLength % ElemSize == 0 && + "Length of bytes to MemCpy must be divisible by allocation size of " + "source/destination array elements"); + size_t ElemsToCpy = ByteLength / ElemSize; + + IRBuilder<> Builder(MCI); + CBR.createAndSetCurrentHandle(Builder); + + copyArrayElemsForMemCpy(Builder, MCI, CBR, ArrTy, 0, ElemsToCpy, + "memcpy." + MCI->getDest()->getName() + "." + + MCI->getSource()->getName()); + + MCI->eraseFromParent(); +} + +static void replaceAccessesWithHandle(CBufferResource &CBR) { SmallVector DeadInsts; - SmallVector ToProcess{Global->users()}; + SmallVector ToProcess{CBR.users()}; while (!ToProcess.empty()) { User *Cur = ToProcess.pop_back_val(); // If we have a load instruction, replace the access. if (auto *LI = dyn_cast(Cur)) { - replaceAccess(LI, Global, HandleGV, BaseOffset, DeadInsts); + replaceLoad(LI, CBR, DeadInsts); + continue; + } + + // If we have a memcpy instruction, replace it with multiple accesses and + // subsequent stores to the destination + if (auto *MCI = dyn_cast(Cur)) { + replaceMemCpy(MCI, CBR); continue; } // Otherwise, walk users looking for a load... 
- ToProcess.append(Cur->user_begin(), Cur->user_end()); + if (isa(Cur) || isa(Cur)) { + ToProcess.append(Cur->user_begin(), Cur->user_end()); + continue; + } + + llvm_unreachable("Unexpected user of Global"); } RecursivelyDeleteTriviallyDeadInstructions(DeadInsts); } @@ -173,7 +308,8 @@ static bool replaceCBufferAccesses(Module &M) { for (const hlsl::CBufferMapping &Mapping : *CBufMD) for (const hlsl::CBufferMember &Member : Mapping.Members) { - replaceAccessesWithHandle(Member.GV, Mapping.Handle, Member.Offset); + CBufferResource CBR(Mapping.Handle, Member.GV, Member.Offset); + replaceAccessesWithHandle(CBR); Member.GV->removeFromParent(); } diff --git a/llvm/test/CodeGen/DirectX/CBufferAccess/memcpy.ll b/llvm/test/CodeGen/DirectX/CBufferAccess/memcpy.ll new file mode 100644 index 0000000000000..001f3320137a6 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/CBufferAccess/memcpy.ll @@ -0,0 +1,216 @@ +; RUN: opt -S -dxil-cbuffer-access -mtriple=dxil--shadermodel6.3-library %s | FileCheck %s + +; cbuffer CB : register(b0) { +; float a1[3]; +; double3 a2[2]; +; float16_t a3[2][2]; +; uint64_t a4[3]; +; int2 a5[3][2]; +; uint16_t a6[1]; +; int64_t a7[2]; +; bool a8[4]; +; } +%__cblayout_CB = type <{ [3 x float], [2 x <3 x double>], [2 x [2 x half]], [3 x i64], [3 x [2 x <2 x i32>]], [1 x i16], [2 x i64], [4 x i32] }> + +@CB.cb = local_unnamed_addr global target("dx.CBuffer", target("dx.Layout", %__cblayout_CB, 708, 0, 48, 112, 176, 224, 272, 288, 320)) poison +@a1 = external local_unnamed_addr addrspace(2) global [3 x float], align 4 +@a2 = external local_unnamed_addr addrspace(2) global [2 x <3 x double>], align 32 +@a3 = external local_unnamed_addr addrspace(2) global [2 x [2 x half]], align 2 +@a4 = external local_unnamed_addr addrspace(2) global [3 x i64], align 8 +@a5 = external local_unnamed_addr addrspace(2) global [3 x [2 x <2 x i32>]], align 16 +@a6 = external local_unnamed_addr addrspace(2) global [1 x i16], align 2 +@a7 = external local_unnamed_addr 
addrspace(2) global [2 x i64], align 8 +@a8 = external local_unnamed_addr addrspace(2) global [4 x i32], align 4 + +; CHECK: define void @f( +define void @f(ptr %dst) { +entry: + %CB.cb_h.i.i = tail call target("dx.CBuffer", target("dx.Layout", %__cblayout_CB, 708, 0, 48, 112, 176, 224, 272, 288, 320)) @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, i1 false, ptr null) + store target("dx.CBuffer", target("dx.Layout", %__cblayout_CB, 708, 0, 48, 112, 176, 224, 272, 288, 320)) %CB.cb_h.i.i, ptr @CB.cb, align 4 + + %a1.copy = alloca [3 x float], align 4 + %a2.copy = alloca [2 x <3 x double>], align 32 + %a3.copy = alloca [2 x [2 x half]], align 2 + %a4.copy = alloca [3 x i64], align 8 + %a5.copy = alloca [3 x [2 x <2 x i32>]], align 16 + %a6.copy = alloca [1 x i16], align 2 + %a7.copy = alloca [2 x i64], align 8 + %a8.copy = alloca [4 x i32], align 4 + + ; Try copying no elements +; CHECK-NOT: memcpy + call void @llvm.memcpy.p0.p2.i32(ptr align 4 %a1.copy, ptr addrspace(2) align 4 @a1, i32 0, i1 false) + + ; Try copying only the first element +; CHECK: [[CB:%.*]] = load target("dx.CBuffer", {{.*}})), ptr @CB.cb, align 4 +; CHECK: [[LOAD:%.*]] = call { float, float, float, float } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 0) +; CHECK: [[X:%.*]] = extractvalue { float, float, float, float } [[LOAD]], 0 +; CHECK: [[DEST:%.*]] = getelementptr inbounds i8, ptr [[A1_COPY:%.*]], i32 0 +; CHECK: store float [[X]], ptr [[DEST]], align 4 + call void @llvm.memcpy.p0.p2.i32(ptr align 4 %a1.copy, ptr addrspace(2) align 4 @a1, i32 4, i1 false) + +; CHECK: [[CB:%.*]] = load target("dx.CBuffer", {{.*}})), ptr @CB.cb, align 4 +; CHECK: [[LOAD:%.*]] = call { float, float, float, float } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 0) +; CHECK: [[X:%.*]] = extractvalue { float, float, float, float } [[LOAD]], 0 +; CHECK: [[DEST:%.*]] = getelementptr inbounds i8, ptr [[A1_COPY:%.*]], i32 0 
+; CHECK: store float [[X]], ptr [[DEST]], align 4 +; CHECK: [[LOAD:%.*]] = call { float, float, float, float } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 1) +; CHECK: [[Y:%.*]] = extractvalue { float, float, float, float } [[LOAD]], 0 +; CHECK: [[DEST:%.*]] = getelementptr inbounds i8, ptr [[A1_COPY]], i32 4 +; CHECK: store float [[Y]], ptr [[DEST]], align 4 +; CHECK: [[LOAD:%.*]] = call { float, float, float, float } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 2) +; CHECK: [[Z:%.*]] = extractvalue { float, float, float, float } [[LOAD]], 0 +; CHECK: [[DEST:%.*]] = getelementptr inbounds i8, ptr [[A1_COPY]], i32 8 +; CHECK: store float [[Z]], ptr [[DEST]], align 4 + call void @llvm.memcpy.p0.p2.i32(ptr align 4 %a1.copy, ptr addrspace(2) align 4 @a1, i32 12, i1 false) + +; CHECK: [[CB:%.*]] = load target("dx.CBuffer", {{.*}})), ptr @CB.cb, align 4 +; CHECK: [[LOAD:%.*]] = call { double, double } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 3) +; CHECK: [[X:%.*]] = extractvalue { double, double } [[LOAD]], 0 +; CHECK: [[Y:%.*]] = extractvalue { double, double } [[LOAD]], 1 +; CHECK: [[LOAD:%.*]] = call { double, double } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 4) +; CHECK: [[Z:%.*]] = extractvalue { double, double } [[LOAD]], 0 +; CHECK: [[UPTO0:%.*]] = insertelement <3 x double> poison, double [[X]], i32 0 +; CHECK: [[UPTO1:%.*]] = insertelement <3 x double> [[UPTO0]], double [[Y]], i32 1 +; CHECK: [[UPTO2:%.*]] = insertelement <3 x double> [[UPTO1]], double [[Z]], i32 2 +; CHECK: [[DEST:%.*]] = getelementptr inbounds i8, ptr [[A2_COPY:%.*]], i32 0 +; CHECK: store <3 x double> [[UPTO2]], ptr [[DEST]], align 32 +; CHECK: [[LOAD:%.*]] = call { double, double } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 5) +; CHECK: [[X:%.*]] = extractvalue { double, double } 
[[LOAD]], 0 +; CHECK: [[Y:%.*]] = extractvalue { double, double } [[LOAD]], 1 +; CHECK: [[LOAD:%.*]] = call { double, double } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 6) +; CHECK: [[Z:%.*]] = extractvalue { double, double } [[LOAD]], 0 +; CHECK: [[UPTO0:%.*]] = insertelement <3 x double> poison, double [[X]], i32 0 +; CHECK: [[UPTO1:%.*]] = insertelement <3 x double> [[UPTO0]], double [[Y]], i32 1 +; CHECK: [[UPTO2:%.*]] = insertelement <3 x double> [[UPTO1]], double [[Z]], i32 2 +; CHECK: [[DEST:%.*]] = getelementptr inbounds i8, ptr [[A2_COPY]], i32 32 +; CHECK: store <3 x double> [[UPTO2]], ptr [[DEST]], align 32 + call void @llvm.memcpy.p0.p2.i32(ptr align 32 %a2.copy, ptr addrspace(2) align 32 @a2, i32 64, i1 false) + +; CHECK: [[CB:%.*]] = load target("dx.CBuffer", {{.*}})), ptr @CB.cb, align 4 +; CHECK: [[LOAD:%.*]] = call { half, half, half, half, half, half, half, half } @llvm.dx.resource.load.cbufferrow.8.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 7) +; CHECK: [[X:%.*]] = extractvalue { half, half, half, half, half, half, half, half } [[LOAD]], 0 +; CHECK: [[DEST:%.*]] = getelementptr inbounds i8, ptr [[A3_COPY:%.*]], i32 0 +; CHECK: store half [[X]], ptr [[DEST]], align 2 +; CHECK: [[LOAD:%.*]] = call { half, half, half, half, half, half, half, half } @llvm.dx.resource.load.cbufferrow.8.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 8) +; CHECK: [[Y:%.*]] = extractvalue { half, half, half, half, half, half, half, half } [[LOAD]], 0 +; CHECK: [[DEST:%.*]] = getelementptr inbounds i8, ptr [[A3_COPY]], i32 2 +; CHECK: store half [[Y]], ptr [[DEST]], align 2 +; CHECK: [[LOAD:%.*]] = call { half, half, half, half, half, half, half, half } @llvm.dx.resource.load.cbufferrow.8.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 9) +; CHECK: [[X:%.*]] = extractvalue { half, half, half, half, half, half, half, half } [[LOAD]], 0 +; CHECK: [[DEST:%.*]] = getelementptr inbounds i8, ptr [[A3_COPY]], i32 4 +; CHECK: 
store half [[X]], ptr [[DEST]], align 2 +; CHECK: [[LOAD:%.*]] = call { half, half, half, half, half, half, half, half } @llvm.dx.resource.load.cbufferrow.8.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 10) +; CHECK: [[Y:%.*]] = extractvalue { half, half, half, half, half, half, half, half } [[LOAD]], 0 +; CHECK: [[DEST:%.*]] = getelementptr inbounds i8, ptr [[A3_COPY]], i32 6 +; CHECK: store half [[Y]], ptr [[DEST]], align 2 + call void @llvm.memcpy.p0.p2.i32(ptr align 2 %a3.copy, ptr addrspace(2) align 2 @a3, i32 8, i1 false) + +; CHECK: [[CB:%.*]] = load target("dx.CBuffer", {{.*}})), ptr @CB.cb, align 4 +; CHECK: [[LOAD:%.*]] = call { i64, i64 } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 11) +; CHECK: [[X:%.*]] = extractvalue { i64, i64 } [[LOAD]], 0 +; CHECK: [[DEST:%.*]] = getelementptr inbounds i8, ptr [[A4_COPY:%.*]], i32 0 +; CHECK: store i64 [[X]], ptr [[DEST]], align 8 +; CHECK: [[LOAD:%.*]] = call { i64, i64 } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 12) +; CHECK: [[Y:%.*]] = extractvalue { i64, i64 } [[LOAD]], 0 +; CHECK: [[DEST:%.*]] = getelementptr inbounds i8, ptr [[A4_COPY]], i32 8 +; CHECK: store i64 [[Y]], ptr [[DEST]], align 8 +; CHECK: [[LOAD:%.*]] = call { i64, i64 } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 13) +; CHECK: [[Z:%.*]] = extractvalue { i64, i64 } [[LOAD]], 0 +; CHECK: [[DEST:%.*]] = getelementptr inbounds i8, ptr [[A4_COPY]], i32 16 +; CHECK: store i64 [[Z]], ptr [[DEST]], align 8 + call void @llvm.memcpy.p0.p2.i32(ptr align 8 %a4.copy, ptr addrspace(2) align 8 @a4, i32 24, i1 false) + +; CHECK: [[CB:%.*]] = load target("dx.CBuffer", {{.*}})), ptr @CB.cb, align 4 +; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 14) +; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 0 +; CHECK: [[Y:%.*]] = 
extractvalue { i32, i32, i32, i32 } [[LOAD]], 1 +; CHECK: [[UPTO0:%.*]] = insertelement <2 x i32> poison, i32 [[X]], i32 0 +; CHECK: [[UPTO1:%.*]] = insertelement <2 x i32> [[UPTO0]], i32 [[Y]], i32 1 +; CHECK: [[DEST:%.*]] = getelementptr inbounds i8, ptr [[A5_COPY:%.*]], i32 0 +; CHECK: store <2 x i32> [[UPTO1]], ptr [[DEST]], align 8 +; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 15) +; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 0 +; CHECK: [[Y:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 1 +; CHECK: [[UPTO0:%.*]] = insertelement <2 x i32> poison, i32 [[X]], i32 0 +; CHECK: [[UPTO1:%.*]] = insertelement <2 x i32> [[UPTO0]], i32 [[Y]], i32 1 +; CHECK: [[DEST:%.*]] = getelementptr inbounds i8, ptr [[A5_COPY]], i32 8 +; CHECK: store <2 x i32> [[UPTO1]], ptr [[DEST]], align 8 +; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 16) +; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 0 +; CHECK: [[Y:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 1 +; CHECK: [[UPTO0:%.*]] = insertelement <2 x i32> poison, i32 [[X]], i32 0 +; CHECK: [[UPTO1:%.*]] = insertelement <2 x i32> [[UPTO0]], i32 [[Y]], i32 1 +; CHECK: [[DEST:%.*]] = getelementptr inbounds i8, ptr [[A5_COPY]], i32 16 +; CHECK: store <2 x i32> [[UPTO1]], ptr [[DEST]], align 8 +; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 17) +; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 0 +; CHECK: [[Y:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 1 +; CHECK: [[UPTO0:%.*]] = insertelement <2 x i32> poison, i32 [[X]], i32 0 +; CHECK: [[UPTO1:%.*]] = insertelement <2 x i32> [[UPTO0]], i32 [[Y]], i32 1 +; CHECK: [[DEST:%.*]] = getelementptr inbounds i8, ptr [[A5_COPY]], i32 24 +; CHECK: 
store <2 x i32> [[UPTO1]], ptr [[DEST]], align 8 +; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 18) +; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 0 +; CHECK: [[Y:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 1 +; CHECK: [[UPTO0:%.*]] = insertelement <2 x i32> poison, i32 [[X]], i32 0 +; CHECK: [[UPTO1:%.*]] = insertelement <2 x i32> [[UPTO0]], i32 [[Y]], i32 1 +; CHECK: [[DEST:%.*]] = getelementptr inbounds i8, ptr [[A5_COPY]], i32 32 +; CHECK: store <2 x i32> [[UPTO1]], ptr [[DEST]], align 8 +; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 19) +; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 0 +; CHECK: [[Y:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 1 +; CHECK: [[UPTO0:%.*]] = insertelement <2 x i32> poison, i32 [[X]], i32 0 +; CHECK: [[UPTO1:%.*]] = insertelement <2 x i32> [[UPTO0]], i32 [[Y]], i32 1 +; CHECK: [[DEST:%.*]] = getelementptr inbounds i8, ptr [[A5_COPY]], i32 40 +; CHECK: store <2 x i32> [[UPTO1]], ptr [[DEST]], align 8 + call void @llvm.memcpy.p0.p2.i32(ptr align 16 %a5.copy, ptr addrspace(2) align 16 @a5, i32 48, i1 false) + +; CHECK: [[CB:%.*]] = load target("dx.CBuffer", {{.*}})), ptr @CB.cb, align 4 +; CHECK: [[LOAD:%.*]] = call { i16, i16, i16, i16, i16, i16, i16, i16 } @llvm.dx.resource.load.cbufferrow.8.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 17) +; CHECK: [[X:%.*]] = extractvalue { i16, i16, i16, i16, i16, i16, i16, i16 } [[LOAD]], 0 +; CHECK: [[DEST:%.*]] = getelementptr inbounds i8, ptr [[A6_COPY:%.*]], i32 0 +; CHECK: store i16 [[X]], ptr [[DEST]], align 2 + call void @llvm.memcpy.p0.p2.i32(ptr align 2 %a6.copy, ptr addrspace(2) align 2 @a6, i32 2, i1 false) + +; CHECK: [[CB:%.*]] = load target("dx.CBuffer", {{.*}})), ptr @CB.cb, align 4 +; CHECK: [[LOAD:%.*]] = call { i64, i64 } 
@llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 18) +; CHECK: [[X:%.*]] = extractvalue { i64, i64 } [[LOAD]], 0 +; CHECK: [[DEST:%.*]] = getelementptr inbounds i8, ptr [[A7_COPY:%.*]], i32 0 +; CHECK: store i64 [[X]], ptr [[DEST]], align 8 +; CHECK: [[LOAD:%.*]] = call { i64, i64 } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 19) +; CHECK: [[Y:%.*]] = extractvalue { i64, i64 } [[LOAD]], 0 +; CHECK: [[DEST:%.*]] = getelementptr inbounds i8, ptr [[A7_COPY]], i32 8 +; CHECK: store i64 [[Y]], ptr [[DEST]], align 8 + call void @llvm.memcpy.p0.p2.i32(ptr align 8 %a7.copy, ptr addrspace(2) align 8 @a7, i32 16, i1 false) + +; CHECK: [[CB:%.*]] = load target("dx.CBuffer", {{.*}})), ptr @CB.cb, align 4 +; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 20) +; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 0 +; CHECK: [[DEST:%.*]] = getelementptr inbounds i8, ptr [[A8_COPY:%.*]], i32 0 +; CHECK: store i32 [[X]], ptr [[DEST]], align 4 +; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 21) +; CHECK: [[Y:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 0 +; CHECK: [[DEST:%.*]] = getelementptr inbounds i8, ptr [[A8_COPY]], i32 4 +; CHECK: store i32 [[Y]], ptr [[DEST]], align 4 +; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 22) +; CHECK: [[Z:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 0 +; CHECK: [[DEST:%.*]] = getelementptr inbounds i8, ptr [[A8_COPY]], i32 8 +; CHECK: store i32 [[Z]], ptr [[DEST]], align 4 +; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 23) +; CHECK: [[W:%.*]] = extractvalue { i32, i32, i32, i32 } 
[[LOAD]], 0 +; CHECK: [[DEST:%.*]] = getelementptr inbounds i8, ptr [[A8_COPY]], i32 12 +; CHECK: store i32 [[W]], ptr [[DEST]], align 4 + call void @llvm.memcpy.p0.p2.i32(ptr align 4 %a8.copy, ptr addrspace(2) align 4 @a8, i32 16, i1 false) + + ret void +} + +declare void @llvm.memcpy.p0.p2.i32(ptr noalias writeonly captures(none), ptr addrspace(2) noalias readonly captures(none), i32, i1 immarg) + +; CHECK-NOT: !hlsl.cbs = +!hlsl.cbs = !{!0} + +!0 = !{ptr @CB.cb, ptr addrspace(2) @a1, ptr addrspace(2) @a2, ptr addrspace(2) @a3, ptr addrspace(2) @a4, ptr addrspace(2) @a5, ptr addrspace(2) @a6, ptr addrspace(2) @a7, ptr addrspace(2) @a8} +!1 = !{i32 0, i32 2} +!2 = !{}