respond to pr comments

spall · spall · commit 5e2f7a5be668 · 2025-07-03T09:02:06.000-07:00
diff --git a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp
@@ -42,6 +42,15 @@ class DXILIntrinsicExpansionLegacy : public ModulePass {
   static char ID; // Pass identification.
 };
 
+static bool resourceAccessNeeds64BitExpansion(Module *M, Type *OverloadTy,
+                                              bool IsRaw) {
+  // Raw accesses above DXIL 1.2 never expand; others depend on element type.
+  bool SkipRaw =
+      IsRaw && M->getTargetTriple().getDXILVersion() > VersionTuple(1, 2);
+  Type *ElemTy = OverloadTy->getScalarType();
+  return !SkipRaw && (ElemTy->isDoubleTy() || ElemTy->isIntegerTy(64));
+}
+
 static bool isIntrinsicExpansion(Function &F) {
   switch (F.getIntrinsicID()) {
   case Intrinsic::abs:
@@ -72,27 +81,19 @@ static bool isIntrinsicExpansion(Function &F) {
   case Intrinsic::vector_reduce_fadd:
     return true;
   case Intrinsic::dx_resource_load_rawbuffer:
-    if (F.getParent()->getTargetTriple().getDXILVersion() > VersionTuple(1, 2))
-      return false;
-    // fallthrough to check if double or i64
-    LLVM_FALLTHROUGH;
-  case Intrinsic::dx_resource_load_typedbuffer: {
-    // We need to handle i64, doubles, and vectors of them.
-    Type *ScalarTy =
-        F.getReturnType()->getStructElementType(0)->getScalarType();
-    return ScalarTy->isDoubleTy() || ScalarTy->isIntegerTy(64);
-  }
-  case Intrinsic::dx_resource_store_rawbuffer: {
-    if (F.getParent()->getTargetTriple().getDXILVersion() > VersionTuple(1, 2))
-      return false;
-    Type *ScalarTy = F.getFunctionType()->getParamType(3)->getScalarType();
-    return ScalarTy->isDoubleTy() || ScalarTy->isIntegerTy(64);
-  }
-  case Intrinsic::dx_resource_store_typedbuffer: {
-    // We need to handle i64 and doubles and vectors of i64 and doubles.
-    Type *ScalarTy = F.getFunctionType()->getParamType(2)->getScalarType();
-    return ScalarTy->isDoubleTy() || ScalarTy->isIntegerTy(64);
-  }
+    return resourceAccessNeeds64BitExpansion(
+        F.getParent(), F.getReturnType()->getStructElementType(0),
+        /*IsRaw*/ true);
+  case Intrinsic::dx_resource_load_typedbuffer:
+    return resourceAccessNeeds64BitExpansion(
+        F.getParent(), F.getReturnType()->getStructElementType(0),
+        /*IsRaw*/ false);
+  case Intrinsic::dx_resource_store_rawbuffer:
+    return resourceAccessNeeds64BitExpansion(
+        F.getParent(), F.getFunctionType()->getParamType(3), /*IsRaw*/ true);
+  case Intrinsic::dx_resource_store_typedbuffer:
+    return resourceAccessNeeds64BitExpansion(
+        F.getParent(), F.getFunctionType()->getParamType(2), /*IsRaw*/ false);
   }
   return false;
 }
@@ -563,19 +564,20 @@ static bool expandBufferLoadIntrinsic(CallInst *Orig, bool IsRaw) {
   bool IsDouble = ScalarTy->isDoubleTy();
   assert(IsDouble || ScalarTy->isIntegerTy(64) &&
                          "Only expand double or int64 scalars or vectors");
-  bool IsVector = isa<FixedVectorType>(BufferTy);
-
+  bool IsVector = false;
   unsigned ExtractNum = 2;
   if (auto *VT = dyn_cast<FixedVectorType>(BufferTy)) {
-    if (!IsRaw)
-      assert(VT->getNumElements() == 2 &&
-             "TypedBufferLoad vector must be size 2");
     ExtractNum = 2 * VT->getNumElements();
+    IsVector = true;
+    assert(IsRaw || ExtractNum == 4 && "TypedBufferLoad vector must be size 2");
   }
 
   SmallVector<Value *, 2> Loads;
   Value *Result = PoisonValue::get(BufferTy);
   unsigned Base = 0;
+  // If we need to extract more than 4 i32s, we need to break it up into
+  // more than one load. LoadNum tells us how many i32s we are loading in
+  // each load.
   while (ExtractNum > 0) {
     unsigned LoadNum = std::min(ExtractNum, 4u);
     Type *Ty = VectorType::get(Builder.getInt32Ty(), LoadNum, false);
@@ -649,6 +651,8 @@ static bool expandBufferLoadIntrinsic(CallInst *Orig, bool IsRaw) {
     } else {
       // Use of the check bit
       assert(Indices[0] == 1 && "Unexpected type for typedbufferload");
+      // Note: This does not always match the historical behaviour of DXC.
+      // See https://github.com/microsoft/DirectXShaderCompiler/issues/7622
       if (!CheckBit) {
         SmallVector<Value *, 2> CheckBits;
         for (Value *L : Loads)
@@ -666,22 +670,22 @@ static bool expandBufferLoadIntrinsic(CallInst *Orig, bool IsRaw) {
 static bool expandBufferStoreIntrinsic(CallInst *Orig, bool IsRaw) {
   IRBuilder<> Builder(Orig);
 
-  Type *BufferTy = Orig->getFunctionType()->getParamType(IsRaw ? 3 : 2);
+  unsigned ValIndex = IsRaw ? 3 : 2;
+  Type *BufferTy = Orig->getFunctionType()->getParamType(ValIndex);
   Type *ScalarTy = BufferTy->getScalarType();
   bool IsDouble = ScalarTy->isDoubleTy();
   assert((IsDouble || ScalarTy->isIntegerTy(64)) &&
          "Only expand double or int64 scalars or vectors");
 
   // Determine if we're dealing with a vector or scalar
-  bool IsVector = isa<FixedVectorType>(BufferTy);
+  bool IsVector = false;
   unsigned ExtractNum = 2;
   unsigned VecLen = 0;
   if (auto *VT = dyn_cast<FixedVectorType>(BufferTy)) {
-    if (!IsRaw)
-      assert(VT->getNumElements() == 2 &&
-             "TypedBufferStore vector must be size 2");
     VecLen = VT->getNumElements();
+    assert(IsRaw || VecLen == 2 && "TypedBufferStore vector must be size 2");
     ExtractNum = VecLen * 2;
+    IsVector = true;
   }
 
   // Create the appropriate vector type for the result
@@ -699,12 +703,12 @@ static bool expandBufferStoreIntrinsic(CallInst *Orig, bool IsRaw) {
   if (IsDouble) {
     auto *SplitTy = llvm::StructType::get(SplitElementTy, SplitElementTy);
     Value *Split = Builder.CreateIntrinsic(SplitTy, Intrinsic::dx_splitdouble,
-                                           {Orig->getOperand(IsRaw ? 3 : 2)});
+                                           {Orig->getOperand(ValIndex)});
     LowBits = Builder.CreateExtractValue(Split, 0);
     HighBits = Builder.CreateExtractValue(Split, 1);
   } else {
     // Handle int64 type(s)
-    Value *InputVal = Orig->getOperand(IsRaw ? 3 : 2);
+    Value *InputVal = Orig->getOperand(ValIndex);
     Constant *ShiftAmt = Builder.getInt64(32);
     if (IsVector)
       ShiftAmt =
@@ -728,6 +732,9 @@ static bool expandBufferStoreIntrinsic(CallInst *Orig, bool IsRaw) {
     Val = Builder.CreateInsertElement(Val, HighBits, Builder.getInt32(1));
   }
 
+  // If we need to extract more than 4 i32s, we need to break it up into
+  // more than one store. StoreNum tells us how many i32s we are storing in
+  // each store.
   unsigned Base = 0;
   while (ExtractNum > 0) {
     unsigned StoreNum = std::min(ExtractNum, 4u);
@@ -744,7 +751,10 @@ static bool expandBufferStoreIntrinsic(CallInst *Orig, bool IsRaw) {
     for (unsigned I = 0; I < StoreNum; ++I) {
       Mask.push_back(Base + I);
     }
-    Value *SubVal = Builder.CreateShuffleVector(Val, Mask);
+
+    Value *SubVal = Val;
+    if (VecLen > 2)
+      SubVal = Builder.CreateShuffleVector(Val, Mask);
 
     Args.push_back(SubVal);
     // Create the final intrinsic call
diff --git a/llvm/test/CodeGen/DirectX/BufferStoreDouble.ll b/llvm/test/CodeGen/DirectX/BufferStoreDouble.ll
@@ -16,10 +16,8 @@ define void @storef64(double %0) {
   ; CHECK: [[Hi:%.*]] = extractvalue { i32, i32 } [[SD]], 1
   ; CHECK: [[Vec1:%.*]] = insertelement <2 x i32> poison, i32 [[Lo]], i32 0
   ; CHECK: [[Vec2:%.*]] = insertelement <2 x i32> [[Vec1]], i32 [[Hi]], i32 1
-  ; this shufflevector is unnecessary but generated to avoid specalization
-  ; CHECK: [[Vec3:%.*]] = shufflevector <2 x i32> [[Vec2]], <2 x i32> poison, <2 x i32> <i32 0, i32 1>
   ; CHECK: call void @llvm.dx.resource.store.typedbuffer.tdx.TypedBuffer_f64_1_0_0t.v2i32(
-  ; CHECK-SAME: target("dx.TypedBuffer", double, 1, 0, 0) [[B]], i32 0, <2 x i32> [[Vec3]])
+  ; CHECK-SAME: target("dx.TypedBuffer", double, 1, 0, 0) [[B]], i32 0, <2 x i32> [[Vec2]])
   call void @llvm.dx.resource.store.typedbuffer(
       target("dx.TypedBuffer", double, 1, 0, 0) %buffer, i32 0,
       double %0)
@@ -40,10 +38,8 @@ define void @storev2f64(<2 x double> %0) {
   ; CHECK: [[Lo:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[SD]], 0
   ; CHECK: [[Hi:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[SD]], 1
   ; CHECK: [[Vec:%.*]] = shufflevector <2 x i32> [[Lo]], <2 x i32> [[Hi]], <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-  ; this shufflevector is unnecessary but generated to avoid specalization
-  ; CHECK: [[Vec2:%.*]] = shufflevector <4 x i32> [[Vec]], <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   ; CHECK: call void @llvm.dx.resource.store.typedbuffer.tdx.TypedBuffer_v2f64_1_0_0t.v4i32(
-  ; CHECK-SAME: target("dx.TypedBuffer", <2 x double>, 1, 0, 0) [[B]], i32 0, <4 x i32> [[Vec2]])
+  ; CHECK-SAME: target("dx.TypedBuffer", <2 x double>, 1, 0, 0) [[B]], i32 0, <4 x i32> [[Vec]])
   call void @llvm.dx.resource.store.typedbuffer(
       target("dx.TypedBuffer", <2 x double>, 1, 0, 0) %buffer, i32 0,
       <2 x double> %0)
diff --git a/llvm/test/CodeGen/DirectX/BufferStoreInt64.ll b/llvm/test/CodeGen/DirectX/BufferStoreInt64.ll
@@ -12,9 +12,7 @@ define void @storei64(i64 %0) {
 ; CHECK-NEXT:    [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
 ; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i32 0
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP4]], i32 1
-; the shufflevector is unnecessary but generated to avoid too much specalization
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    call void @llvm.dx.resource.store.typedbuffer.tdx.TypedBuffer_i64_1_0_0t.v2i32(target("dx.TypedBuffer", i64, 1, 0, 0) [[BUFFER]], i32 0, <2 x i32> [[TMP7]])
+; CHECK-NEXT:    call void @llvm.dx.resource.store.typedbuffer.tdx.TypedBuffer_i64_1_0_0t.v2i32(target("dx.TypedBuffer", i64, 1, 0, 0) [[BUFFER]], i32 0, <2 x i32> [[TMP6]])
 ; CHECK-NEXT:    ret void
 ;
   %buffer = tail call target("dx.TypedBuffer", i64, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_i64_1_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr null)
@@ -31,9 +29,7 @@ define void @storev2i64(<2 x i64> %0) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = lshr <2 x i64> [[TMP0]], splat (i64 32)
 ; CHECK-NEXT:    [[TMP4:%.*]] = trunc <2 x i64> [[TMP3]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP4]], <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; the shufflevector is unnecessary but generated to avoid too much specalization
-; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP13]], <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    call void @llvm.dx.resource.store.typedbuffer.tdx.TypedBuffer_v2i64_1_0_0t.v4i32(target("dx.TypedBuffer", <2 x i64>, 1, 0, 0) [[BUFFER]], i32 0, <4 x i32> [[TMP14]])
+; CHECK-NEXT:    call void @llvm.dx.resource.store.typedbuffer.tdx.TypedBuffer_v2i64_1_0_0t.v4i32(target("dx.TypedBuffer", <2 x i64>, 1, 0, 0) [[BUFFER]], i32 0, <4 x i32> [[TMP13]])
 ; CHECK-NEXT:    ret void
 ;
   %buffer = tail call target("dx.TypedBuffer", <2 x i64>, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_v2i64_1_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr null)
diff --git a/llvm/test/CodeGen/DirectX/RawBufferStoreDouble.ll b/llvm/test/CodeGen/DirectX/RawBufferStoreDouble.ll
@@ -16,10 +16,8 @@ define void @storef64(double %0, i32 %index) {
   ; CHECK: [[Hi:%.*]] = extractvalue { i32, i32 } [[SD]], 1
   ; CHECK: [[Vec1:%.*]] = insertelement <2 x i32> poison, i32 [[Lo]], i32 0
   ; CHECK: [[Vec2:%.*]] = insertelement <2 x i32> [[Vec1]], i32 [[Hi]], i32 1
-  ; this shufflevector is unnecessary but generated to avoid specalization
-  ; CHECK: [[Vec3:%.*]] = shufflevector <2 x i32> [[Vec2]], <2 x i32> poison, <2 x i32> <i32 0, i32 1>
   ; CHECK: call void @llvm.dx.resource.store.rawbuffer.tdx.RawBuffer_f64_1_0t.v2i32(
-  ; CHECK-SAME: target("dx.RawBuffer", double, 1, 0) [[B]], i32 %index, i32 0, <2 x i32> [[Vec3]])
+  ; CHECK-SAME: target("dx.RawBuffer", double, 1, 0) [[B]], i32 %index, i32 0, <2 x i32> [[Vec2]])
   call void @llvm.dx.resource.store.rawbuffer(
       target("dx.RawBuffer", double, 1, 0) %buffer, i32 %index, i32 0,
       double %0)
@@ -39,10 +37,8 @@ define void @storev2f64(<2 x double> %0, i32 %index) {
   ; CHECK: [[Lo:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[SD]], 0
   ; CHECK: [[Hi:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[SD]], 1
   ; CHECK: [[Vec:%.*]] = shufflevector <2 x i32> [[Lo]], <2 x i32> [[Hi]], <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-  ; this shufflevector is unnecessary but generated to avoid specalization
-  ; CHECK: [[Vec2:%.*]] = shufflevector <4 x i32> [[Vec]], <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   ; CHECK: call void @llvm.dx.resource.store.rawbuffer.tdx.RawBuffer_v2f64_1_0t.v4i32(
-  ; CHECK-SAME: target("dx.RawBuffer", <2 x double>, 1, 0) [[B]], i32 %index, i32 0, <4 x i32> [[Vec2]])
+  ; CHECK-SAME: target("dx.RawBuffer", <2 x double>, 1, 0) [[B]], i32 %index, i32 0, <4 x i32> [[Vec]])
   call void @llvm.dx.resource.store.rawbuffer(
       target("dx.RawBuffer", <2 x double>, 1, 0) %buffer, i32 %index, i32 0,
       <2 x double> %0)
diff --git a/llvm/test/CodeGen/DirectX/RawBufferStoreInt64.ll b/llvm/test/CodeGen/DirectX/RawBufferStoreInt64.ll
@@ -16,10 +16,8 @@ define void @storei64(i64 %0, i32 %index) {
   ; CHECK: [[C:%.*]] = trunc i64 [[B]] to i32
   ; CHECK: [[Vec1:%.*]] = insertelement <2 x i32> poison, i32 [[A]], i32 0
   ; CHECK: [[Vec2:%.*]] = insertelement <2 x i32> [[Vec1]], i32 [[C]], i32 1
-  ; this shufflevector is unnecessary but generated to avoid specalization
-  ; CHECK: [[Vec3:%.*]] = shufflevector <2 x i32> [[Vec2]], <2 x i32> poison, <2 x i32> <i32 0, i32 1>
   ; CHECK: call void @llvm.dx.resource.store.rawbuffer.tdx.RawBuffer_i64_1_0t.v2i32(
-  ; CHECK-SAME: target("dx.RawBuffer", i64, 1, 0) [[Buf]], i32 %index, i32 0, <2 x i32> [[Vec3]])
+  ; CHECK-SAME: target("dx.RawBuffer", i64, 1, 0) [[Buf]], i32 %index, i32 0, <2 x i32> [[Vec2]])
   call void @llvm.dx.resource.store.rawbuffer(
       target("dx.RawBuffer", i64, 1, 0) %buffer, i32 %index, i32 0,
       i64 %0)
@@ -38,10 +36,8 @@ define void @storev2i64(<2 x i64> %0, i32 %index) {
   ; CHECK: [[B:%.*]] = lshr <2 x i64> %0, splat (i64 32)
   ; CHECK: [[C:%.*]] = trunc <2 x i64> [[B]] to <2 x i32>
   ; CHECK: [[Vec:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[C]], <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-  ; this shufflevector is unnecessary but generated to avoid specalization
-  ; CHECK: [[Vec2:%.*]] = shufflevector <4 x i32> [[Vec]], <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   ; CHECK: call void @llvm.dx.resource.store.rawbuffer.tdx.RawBuffer_v2i64_1_0t.v4i32(
-  ; CHECK-SAME: target("dx.RawBuffer", <2 x i64>, 1, 0) [[Buf]], i32 %index, i32 0, <4 x i32> [[Vec2]])
+  ; CHECK-SAME: target("dx.RawBuffer", <2 x i64>, 1, 0) [[Buf]], i32 %index, i32 0, <4 x i32> [[Vec]])
   call void @llvm.dx.resource.store.rawbuffer(
       target("dx.RawBuffer", <2 x i64>, 1, 0) %buffer, i32 %index, i32 0,
       <2 x i64> %0)