Merge branch 'main' into sg_distr_minor_fixes

charithaintc · charithaintc · commit 78b6393bd292 · 2025-07-17T21:57:02.000Z
diff --git a/.github/new-prs-labeler.yml b/.github/new-prs-labeler.yml
@@ -718,6 +718,7 @@ mlgo:
   - llvm/lib/Analysis/models/**
   - llvm/test/Analysis/IR2Vec/**
   - llvm/tools/llvm-ir2vec/**
+  - llvm/docs/CommandGuide/llvm-ir2vec.rst
 
 tools:llvm-exegesis:
   - llvm/tools/llvm-exegesis/**
diff --git a/lldb/test/Shell/Settings/TestChildDepthTruncation.test b/lldb/test/Shell/Settings/TestChildDepthTruncation.test
@@ -2,7 +2,7 @@
 # when target.max-children-depth wasn't explicitly set.
 
 # RUN: split-file %s %t
-# RUN: %clang_host -g -gdwarf %t/main.cpp -o %t.out
+# RUN: %clang_host -g %t/main.cpp -o %t.out
 # RUN: %lldb -x -b -s %t/dwim-commands.input %t.out -o exit 2>&1 \
 # RUN:       | FileCheck %s --check-prefix=DWIM
 #
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -2947,13 +2947,6 @@ multiclass VFLAT_Real_AllAddr_gfx12<bits<8> op,
   defm _SADDR : VFLAT_Real_gfx12<op, name>;
 }
 
-multiclass VGLOBAL_Real_AllAddr_gfx12<bits<8> op,
-                                      string name = get_FLAT_ps<NAME>.Mnemonic,
-                                      string alias = name> :
-  VFLAT_Real_Base_gfx12<op, name, alias> {
-  defm _SADDR : VFLAT_Real_gfx12<op, name>;
-}
-
 multiclass VGLOBAL_Real_AllAddr_gfx1200<bits<8> op> {
   let AssemblerPredicate = isGFX12Not12_50 in {
     defm "" : VFLAT_Real_gfx12<op>;
diff --git a/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp b/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp
@@ -343,6 +343,16 @@ bool DXILFlattenArraysVisitor::visitGetElementPtrInst(GetElementPtrInst &GEP) {
         Info.RootFlattenedArrayType, Info.RootPointerOperand,
         {ZeroIndex, FlattenedIndex}, GEP.getName(), GEP.getNoWrapFlags());
 
+    // If the pointer operand is a global variable and all indices are 0,
+    // IRBuilder::CreateGEP will return the global variable instead of creating
+    // a GEP instruction or GEP ConstantExpr. In this case we have to create and
+    // insert our own GEP instruction.
+    if (!isa<GEPOperator>(NewGEP))
+      NewGEP = GetElementPtrInst::Create(
+          Info.RootFlattenedArrayType, Info.RootPointerOperand,
+          {ZeroIndex, FlattenedIndex}, GEP.getNoWrapFlags(), GEP.getName(),
+          Builder.GetInsertPoint());
+
     // Replace the current GEP with the new GEP. Store GEPInfo into the map
     // for later use in case this GEP was not the end of the chain
     GEPChainInfoMap.insert({cast<GEPOperator>(NewGEP), std::move(Info)});
diff --git a/llvm/lib/Target/DirectX/DXILLegalizePass.cpp b/llvm/lib/Target/DirectX/DXILLegalizePass.cpp
@@ -563,7 +563,7 @@ legalizeGetHighLowi64Bytes(Instruction &I,
 }
 
 static void
-legalizeLoadStoreOnArrayAllocas(Instruction &I,
+legalizeScalarLoadStoreOnArrays(Instruction &I,
                                 SmallVectorImpl<Instruction *> &ToRemove,
                                 DenseMap<Value *, Value *> &) {
 
@@ -581,23 +581,31 @@ legalizeLoadStoreOnArrayAllocas(Instruction &I,
   } else
     return;
 
-  assert(LoadStoreTy->isSingleValueType() &&
-         "Expected load/store type to be a single-valued type");
+  // If the load/store is not of a single-value type (i.e., scalar or vector)
+  // then we do not modify it. It shouldn't be a vector either because the
+  // dxil-data-scalarization pass is expected to run before this, but it's not
+  // incorrect to apply this transformation to vector load/stores.
+  if (!LoadStoreTy->isSingleValueType())
+    return;
 
-  auto *AllocaPtrOp = dyn_cast<AllocaInst>(PtrOp);
-  if (!AllocaPtrOp)
+  Type *ArrayTy;
+  if (auto *GlobalVarPtrOp = dyn_cast<GlobalVariable>(PtrOp))
+    ArrayTy = GlobalVarPtrOp->getValueType();
+  else if (auto *AllocaPtrOp = dyn_cast<AllocaInst>(PtrOp))
+    ArrayTy = AllocaPtrOp->getAllocatedType();
+  else
     return;
 
-  Type *Ty = AllocaPtrOp->getAllocatedType();
-  if (!isa<ArrayType>(Ty))
+  if (!isa<ArrayType>(ArrayTy))
     return;
-  assert(!isa<ArrayType>(Ty->getArrayElementType()) &&
-         "Expected allocated type of AllocaInst to be a flat ArrayType");
 
-  IRBuilder<> Builder(&I);
-  Value *Zero = Builder.getInt32(0);
-  Value *GEP = Builder.CreateGEP(Ty, AllocaPtrOp, {Zero, Zero}, "",
-                                 GEPNoWrapFlags::all());
+  assert(ArrayTy->getArrayElementType() == LoadStoreTy &&
+         "Expected array element type to be the same as to the scalar load or "
+         "store type");
+
+  Value *Zero = ConstantInt::get(Type::getInt32Ty(I.getContext()), 0);
+  Value *GEP = GetElementPtrInst::Create(
+      ArrayTy, PtrOp, {Zero, Zero}, GEPNoWrapFlags::all(), "", I.getIterator());
   I.setOperand(PtrOpIndex, GEP);
 }
 
@@ -651,7 +659,7 @@ class DXILLegalizationPipeline {
     // downcastI64toI32InsertExtractElements needs to handle.
     LegalizationPipeline[Stage2].push_back(
         downcastI64toI32InsertExtractElements);
-    LegalizationPipeline[Stage2].push_back(legalizeLoadStoreOnArrayAllocas);
+    LegalizationPipeline[Stage2].push_back(legalizeScalarLoadStoreOnArrays);
   }
 };
 
diff --git a/llvm/test/CodeGen/DirectX/flatten-array.ll b/llvm/test/CodeGen/DirectX/flatten-array.ll
@@ -218,6 +218,28 @@ define void @two_index_gep_const() {
   ret void
 }
 
+define void @zero_index_global() {
+  ; CHECK-LABEL: define void @zero_index_global(
+  ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds nuw [4 x float], ptr addrspace(3) @g.1dim, i32 0, i32 0
+  ; CHECK-NEXT: load float, ptr addrspace(3) [[GEP]], align 4
+  ; CHECK-NEXT: ret void
+  %1 = getelementptr inbounds nuw [2 x [2 x float]], ptr addrspace(3) @g, i32 0, i32 0, i32 0
+  %2 = load float, ptr addrspace(3) %1, align 4
+  ret void
+}
+
+; Note: A ConstantExpr GEP with all 0 indices is equivalent to the pointer
+; operand of the GEP. Therefore the visitLoadInst will not see the pointer operand
+; as a ConstantExpr GEP and will not create a GEP instruction to be visited.
+; The later dxil-legalize pass will insert a GEP in this instance.
+define void @zero_index_global_const() {
+  ; CHECK-LABEL: define void @zero_index_global_const(
+  ; CHECK-NEXT: load float, ptr addrspace(3) @g.1dim, align 4
+  ; CHECK-NEXT: ret void
+  %1 = load float, ptr addrspace(3) getelementptr inbounds nuw ([2 x [2 x float]], ptr addrspace(3) @g, i32 0, i32 0, i32 0), align 4
+  ret void
+}
+
 define void @gep_4d_index_test()  {
     ; CHECK-LABEL: gep_4d_index_test
     ; CHECK: [[a:%.*]] = alloca [16 x i32], align 4
diff --git a/llvm/test/CodeGen/DirectX/legalize-load-store-array-alloca.ll b/llvm/test/CodeGen/DirectX/legalize-load-store-array-alloca.ll
@@ -21,3 +21,21 @@ define void @store() {
   store i32 0, ptr %a, align 4
   ret void
 }
+
+@g = local_unnamed_addr addrspace(3) global [4 x i32] zeroinitializer, align 4
+define void @load_whole_global () {
+; CHECK-LABEL: define void @load_whole_global
+; CHECK-NEXT:    load [4 x i32], ptr addrspace(3) @g, align 4
+; CHECK-NEXT:    ret void
+  %l = load [4 x i32], ptr addrspace(3) @g, align 4
+  ret void
+}
+
+define void @load_global_index0 () {
+; CHECK-LABEL: define void @load_global_index0
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds nuw [4 x i32], ptr addrspace(3) @g, i32 0, i32 0
+; CHECK-NEXT:    load i32, ptr addrspace(3) [[GEP]], align 4
+; CHECK-NEXT:    ret void
+  %l = load i32, ptr addrspace(3) @g, align 4
+  ret void
+}
diff --git a/llvm/test/CodeGen/DirectX/llc-vector-load-scalarize.ll b/llvm/test/CodeGen/DirectX/llc-vector-load-scalarize.ll
@@ -24,7 +24,8 @@
 define <4 x i32> @load_array_vec_test() #0 {
 ; CHECK-LABEL: define <4 x i32> @load_array_vec_test(
 ; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(3) @arrayofVecData.scalarized.1dim, align 4
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds nuw [8 x i32], ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 0, i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(3) [[GEP]], align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(3) getelementptr ([8 x i32], ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 0, i32 1), align 4
 ; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr addrspace(3) getelementptr ([8 x i32], ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 0, i32 2), align 4
 ; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr addrspace(3) getelementptr ([8 x i32], ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 0, i32 3), align 4
@@ -52,7 +53,8 @@ define <4 x i32> @load_array_vec_test() #0 {
 define <4 x i32> @load_vec_test() #0 {
 ; CHECK-LABEL: define <4 x i32> @load_vec_test(
 ; CHECK-SAME: ) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(3) @vecData.scalarized, align 4
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds nuw [4 x i32], ptr addrspace(3) @vecData.scalarized, i32 0, i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(3) [[GEP]], align 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(3) getelementptr ([4 x i32], ptr addrspace(3) @vecData.scalarized, i32 0, i32 1), align 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(3) getelementptr ([4 x i32], ptr addrspace(3) @vecData.scalarized, i32 0, i32 2), align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(3) getelementptr ([4 x i32], ptr addrspace(3) @vecData.scalarized, i32 0, i32 3), align 4
@@ -203,7 +205,8 @@ define <4 x i32> @load_static_array_of_vec_from_i8_gep_test(i32 %index) #0 {
 define <4 x i32> @multid_load_test() #0 {
 ; CHECK-LABEL: define <4 x i32> @multid_load_test(
 ; CHECK-SAME: ) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(3) @groupshared2dArrayofVectors.scalarized.1dim, align 4
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds nuw [36 x i32], ptr addrspace(3) @groupshared2dArrayofVectors.scalarized.1dim, i32 0, i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(3) [[GEP]], align 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(3) getelementptr ([36 x i32], ptr addrspace(3) @groupshared2dArrayofVectors.scalarized.1dim, i32 0, i32 1), align 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(3) getelementptr ([36 x i32], ptr addrspace(3) @groupshared2dArrayofVectors.scalarized.1dim, i32 0, i32 2), align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(3) getelementptr ([36 x i32], ptr addrspace(3) @groupshared2dArrayofVectors.scalarized.1dim, i32 0, i32 3), align 4
diff --git a/llvm/test/CodeGen/DirectX/scalar-store.ll b/llvm/test/CodeGen/DirectX/scalar-store.ll
@@ -14,7 +14,8 @@
 
 ; CHECK-LABEL: store_array_vec_test
 define void @store_array_vec_test () local_unnamed_addr #0 {
-; CHECK-NEXT:    store float 1.000000e+00, ptr addrspace(3) @arrayofVecData.scalarized.1dim, align 16
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds nuw [6 x float], ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 0, i32 0
+; CHECK-NEXT:    store float 1.000000e+00, ptr addrspace(3) [[GEP]], align 16
 ; CHECK-NEXT:    store float 2.000000e+00, ptr addrspace(3) getelementptr ([6 x float], ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 0, i32 1), align 4
 ; CHECK-NEXT:    store float 3.000000e+00, ptr addrspace(3) getelementptr ([6 x float], ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 0, i32 2), align 8
 ; CHECK-NEXT:    store float 2.000000e+00, ptr addrspace(3) getelementptr inbounds ([6 x float], ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 0, i32 3), align 16
@@ -30,7 +31,8 @@ define void @store_array_vec_test () local_unnamed_addr #0 {
 ; CHECK-LABEL: store_vec_test
 define void @store_vec_test(<4 x i32> %inputVec) #0 {
 ; CHECK-NEXT:    [[INPUTVEC_I01:%.*]] = extractelement <4 x i32> %inputVec, i32 0
-; CHECK-NEXT:    store i32 [[INPUTVEC_I01]], ptr addrspace(3) @vecData.scalarized, align 4
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds nuw [4 x i32], ptr addrspace(3) @vecData.scalarized, i32 0, i32 0
+; CHECK-NEXT:    store i32 [[INPUTVEC_I01]], ptr addrspace(3) [[GEP]], align 4
 ; CHECK-NEXT:    [[INPUTVEC_I12:%.*]] = extractelement <4 x i32> %inputVec, i32 1
 ; CHECK-NEXT:    store i32 [[INPUTVEC_I12]], ptr addrspace(3) getelementptr ([4 x i32], ptr addrspace(3) @vecData.scalarized, i32 0, i32 1), align 4
 ; CHECK-NEXT:    [[INPUTVEC_I23:%.*]] = extractelement <4 x i32> %inputVec, i32 2
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -127,7 +127,7 @@ def AMDGPU_ScaledExtPackedOp
   let summary = "Extend a vector of packed floating point values";
 
   let description = [{
-    Extend and scale two packed floats in `source[index]` to two floats and 
+    Extend and scale two packed floats in `source[index]` to two floats and
     return them.
 
     This rather unusual signature arises from the fact that AMD GPUs cannot
@@ -861,7 +861,7 @@ def AMDGPU_WMMAOp :
 }
 
 def AMDGPU_GatherToLDSOp :
-    AMDGPU_Op<"gather_to_lds", [SameVariadicOperandSize]>,
+    AMDGPU_Op<"gather_to_lds", [AttrSizedOperandSegments]>,
     Arguments<(ins
                    Arg<AnyMemRef, "buffer to gather from", [MemRead]>:$src,
                    Variadic<Index>:$srcIndices,
@@ -966,13 +966,13 @@ def AMDGPU_ScaledMFMAOp :
     order (that is, v[0] will go to arg[7:0], v[1] to arg[15:8] and so on).
 
     This wrapper takes inspiration from `amdgpu.mfma`, but has some key differences:
-    - `amdgpu.scaled_mfma` operates on fp4 (f4E2M1FN), fp6 (f6E2M3FN and f6E3M2FN) and 
-    fp8 (f8E4M3FN and f8E5M2) types using either M=N=16, K=128 or M=N=32, K=64 as their tile 
-    size. 
-    - `amdgpu.scaled_mfma` does not support broadcasting. So, `cbsz`, `abid`, and `blgp` 
+    - `amdgpu.scaled_mfma` operates on fp4 (f4E2M1FN), fp6 (f6E2M3FN and f6E3M2FN) and
+    fp8 (f8E4M3FN and f8E5M2) types using either M=N=16, K=128 or M=N=32, K=64 as their tile
+    size.
+    - `amdgpu.scaled_mfma` does not support broadcasting. So, `cbsz`, `abid`, and `blgp`
     are omitted from this wrapper.
-    - The `negateA`, `negateB`, and `negateC` flags in `amdgpu.mfma` are only supported for 
-    double-precision operations on gfx94x and so are not included here. 
+    - The `negateA`, `negateB`, and `negateC` flags in `amdgpu.mfma` are only supported for
+    double-precision operations on gfx94x and so are not included here.
   }];
   let assemblyFormat = [{
     `(` $scalesA `[` $scalesIdxA `]` `*` $sourceA `)` `*` `(` $scalesB `[` $scalesIdxB `]` `*` $sourceB `)` `+` $destC
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
@@ -134,6 +134,8 @@ static bool hasGlobalMemorySpace(Attribute memorySpace) {
 }
 
 static bool hasWorkgroupMemorySpace(Attribute memorySpace) {
+  if (!memorySpace)
+    return false;
   if (auto intMemorySpace = dyn_cast<IntegerAttr>(memorySpace))
     return intMemorySpace.getInt() == 3;
   if (auto gpuMemorySpace = dyn_cast<gpu::AddressSpaceAttr>(memorySpace))
@@ -142,6 +144,8 @@ static bool hasWorkgroupMemorySpace(Attribute memorySpace) {
 }
 
 static bool hasFatRawBufferMemorySpace(Attribute memorySpace) {
+  if (!memorySpace)
+    return false;
   if (auto intMemorySpace = dyn_cast<IntegerAttr>(memorySpace))
     return intMemorySpace.getInt() == 7;
   if (auto gpuMemorySpace = dyn_cast<amdgpu::AddressSpaceAttr>(memorySpace))
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/load_lds.mlir b/mlir/test/Conversion/AMDGPUToROCDL/load_lds.mlir
@@ -127,12 +127,15 @@ func.func @global_load_to_rocdl_dynamic_indices(%global : memref<512xi32, #gpu_g
   // CHECK: %[[GLOBAL_DESC:.*]] = builtin.unrealized_conversion_cast %[[ARG0]]
   // CHECK: %[[ALLOC:.*]] = memref.alloc()
   // CHECK: %[[LDS_DESC:.*]] = builtin.unrealized_conversion_cast %[[ALLOC]]
+  // CHECK: %[[C0:.*]] = arith.constant 0 : index
+  // CHECK: %[[C0_I64:.*]] = builtin.unrealized_conversion_cast %[[C0]] : index to i64
   // CHECK: %[[GLOBAL_BASE:.*]] = llvm.extractvalue %[[GLOBAL_DESC]][1]
   // CHECK: %[[GLOBAL_PTR:.*]] = llvm.getelementptr %[[GLOBAL_BASE]][%[[SRCIDX_CAST]]]
   // CHECK: %[[LDS_BASE:.*]] = llvm.extractvalue %[[LDS_DESC]][1]
   // CHECK: %[[C64:.*]] = llvm.mlir.constant(64 : index) : i64
   // CHECK: %[[DSTIDX:.*]] = llvm.mul %[[DSTIDX_CAST]], %[[C64]] : i64
-  // CHECK: %[[LDS_PTR:.*]] = llvm.getelementptr %[[LDS_BASE]][%[[DSTIDX]]]
+  // CHECK: %[[DSTIDX1:.*]] = llvm.add %[[DSTIDX]], %[[C0_I64]] : i64
+  // CHECK: %[[LDS_PTR:.*]] = llvm.getelementptr %[[LDS_BASE]][%[[DSTIDX1]]]
   // CHECK: rocdl.load.to.lds %[[GLOBAL_PTR]], %[[LDS_PTR]], 4
   %alloc = memref.alloc() : memref<4x64xi32, #gpu_lds_addrspace>
   %c0 = arith.constant 0 : index
@@ -151,7 +154,7 @@ func.func @fat_buffer_load_to_rocdl_f32(%global : memref<128x72xf32, #amdgpu_fat
   // CHECK: %[[BUFFER_DESC:.*]] = builtin.unrealized_conversion_cast %[[ARG0]]
 
   // CHECK: %[[C0:.*]] = arith.constant 0 : index
-  // CHECK: %[[IC0:.*]] = builtin.unrealized_conversion_cast %c0 : index to i64
+  // CHECK: %[[IC0:.*]] = builtin.unrealized_conversion_cast %[[C0]] : index to i64
   // CHECK: %[[C12:.*]] = arith.constant 12 : index
   // CHECK: %[[IC12:.*]] = builtin.unrealized_conversion_cast %[[C12]]
   // CHECK: %[[C32:.*]] = arith.constant 32 : index
diff --git a/mlir/test/Dialect/AMDGPU/invalid.mlir b/mlir/test/Dialect/AMDGPU/invalid.mlir
@@ -222,3 +222,11 @@ func.func @transpose_load_vector_size_i8(%idx1 : index, %idx2 : index, %mem : me
   %0 = amdgpu.transpose_load %mem[%idx1, %idx2] : memref<128x32xi6, 3> -> vector<8xi6>
   func.return %0 : vector<8xi6>
 }
+
+// -----
+
+func.func @gather_to_lds_non_lds(%idx1 : index, %mem1 : memref<32xf16>, %mem2 : memref<32xf16>) {
+  // expected-error@+1 {{'amdgpu.gather_to_lds' op destination memory address space must be Workgroup}}
+  amdgpu.gather_to_lds %mem1[%idx1], %mem2[%idx1] : vector<2xf16>, memref<32xf16>, memref<32xf16>
+  func.return
+}
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -493,3 +493,14 @@ func.func @transpose_load(%idx1 : index, %idx2 : index, %mem : memref<128x32xf16
   %0 = amdgpu.transpose_load %mem[%idx1, %idx2] : memref<128x32xf16, 3> -> vector<4xf16>
   func.return %0 : vector<4xf16>
 }
+
+// CHECK-LABEL: func @gather_to_lds
+func.func @gather_to_lds(%idx1 : index, %idx2 : index, %mem1 : memref<32xf16>, %mem2 : memref<32x32xf16>, %smem1 : memref<32xf16, #gpu.address_space<workgroup>>, %smem2 : memref<32x32xf16, #gpu.address_space<workgroup>>) {
+  // CHECK: amdgpu.gather_to_lds %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}}[%{{.*}}, %{{.*}}]
+  // CHECK: amdgpu.gather_to_lds %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}}[%{{.*}}]
+  // CHECK: amdgpu.gather_to_lds %{{.*}}[%{{.*}}],          %{{.*}}[%{{.*}}, %{{.*}}]
+  amdgpu.gather_to_lds %mem2[%idx1, %idx2], %smem2[%idx1, %idx2] : vector<2xf16>, memref<32x32xf16>, memref<32x32xf16, #gpu.address_space<workgroup>>
+  amdgpu.gather_to_lds %mem2[%idx1, %idx2], %smem1[%idx1]        : vector<2xf16>, memref<32x32xf16>, memref<32xf16,    #gpu.address_space<workgroup>>
+  amdgpu.gather_to_lds %mem1[%idx1],        %smem2[%idx1, %idx2] : vector<2xf16>, memref<32xf16>,    memref<32x32xf16, #gpu.address_space<workgroup>>
+  func.return
+}

Original file line number	Diff line number	Diff line change
`@@ -2,7 +2,7 @@`
`2`	`2`	`# when target.max-children-depth wasn't explicitly set.`
`3`	`3`
`4`	`4`	`# RUN: split-file %s %t`
`5`		`-# RUN: %clang_host -g -gdwarf %t/main.cpp -o %t.out`
	`5`	`+# RUN: %clang_host -g %t/main.cpp -o %t.out`
`6`	`6`	`# RUN: %lldb -x -b -s %t/dwim-commands.input %t.out -o exit 2>&1 \`
`7`	`7`	`# RUN: \| FileCheck %s --check-prefix=DWIM`
`8`	`8`	`#`