GenerateBlockMemOpsPass: Fix handling of GEPs whose result element type doesn't match the load/store data type

karolzwolak · igcbot · commit 93f6411c243c · 2025-11-25T10:03:20.000+01:00
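Prior to this change, the two GEPs below were handled differently even though they compute the same address.
With this change, a GEP whose result element type differs from the accessed data type is treated as if it had implicit trailing zero indices, so both forms are analyzed the same way.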
```llvm
%struct.foo = type { i32, i32, i32 }

; simdBlockWrite was incorrectly generated for this gep
; ptr to the whole struct
%mismatch = getelementptr %struct.foo, ptr addrspace(1) %data, i64 %idx
; but we store to the first field
store i32 0, ptr addrspace(1) %mismatch, align 4

; but here there was no simdBlockWrite generated
; ptr to the first field
%field = getelementptr %struct.foo, ptr addrspace(1) %data, i64 %idx, i32 0
store i32 0, ptr addrspace(1) %field, align 4
```
diff --git a/IGC/Compiler/CISACodeGen/GenerateBlockMemOpsPass.cpp b/IGC/Compiler/CISACodeGen/GenerateBlockMemOpsPass.cpp
@@ -647,7 +647,7 @@ bool GenerateBlockMemOpsPass::canOptLoadStore(Instruction *I) {
 
   // Get the last index from the getelementptr instruction if it is not uniform in the subgroup.
   Instruction *PtrInstr = dyn_cast<Instruction>(Ptr);
-  Value *Idx = checkGep(PtrInstr);
+  Value *Idx = checkGep(PtrInstr, DataType);
 
   if (!Idx)
     return false;
@@ -716,7 +716,7 @@ void GenerateBlockMemOpsPass::setAlignmentAttr(CallInst *CI, const unsigned &Ali
   CI->addFnAttr(CustomAttr);
 }
 
-Value *GenerateBlockMemOpsPass::checkGep(Instruction *PtrInstr) {
+Value *GenerateBlockMemOpsPass::checkGep(Instruction *PtrInstr, Type *DataType) {
   if (!PtrInstr)
     return nullptr;
 
@@ -755,19 +755,26 @@ Value *GenerateBlockMemOpsPass::checkGep(Instruction *PtrInstr) {
   if (WI->isUniform(Ptr))
     IsPtrUniform = true;
 
+  bool TypesMatch = DataType == Gep->getResultElementType();
+  Type *Int32Ty = Type::getInt32Ty(*CGCtx->getLLVMContext());
+  Value *Zero = Constant::getNullValue(Int32Ty);
+
+  // If `DataType` doesn't match the GEP result element type, the access logically goes through implicit trailing
+  // zero indices. How many there are doesn't matter: with at least one implicit zero, every explicit index must be
+  // checked for uniformity and the last (implicit) index is the constant zero.
+  auto E = TypesMatch ? Gep->idx_end() - 1 : Gep->idx_end();
+  Value *LInst = TypesMatch ? *E : Zero;
   // Make sure that all indexes, not including the last one, are uniform.
   // This is important because the address must be continuous in the subgroup.
-  for (auto Idx = Gep->idx_begin(), E = Gep->idx_end() - 1; Idx != E; Idx++)
+  for (auto Idx = Gep->idx_begin(); Idx != E; Idx++)
     if (!WI->isUniform(*Idx))
       return nullptr;
 
-  auto LIndx = Gep->idx_end() - 1;
-
-  if (WI->isUniform(*LIndx))
+  if (WI->isUniform(LInst))
     IsLastIndUniform = true;
 
   if (!IsLastIndUniform && IsPtrUniform) {
-    return *LIndx;
+    return LInst;
   } else if (IsLastIndUniform && !IsPtrUniform) {
     if (!isa<PHINode>(Ptr) && !isa<GetElementPtrInst>(Ptr))
       return nullptr;
@@ -803,8 +810,8 @@ Value *GenerateBlockMemOpsPass::checkGep(Instruction *PtrInstr) {
       }
     }
 
-    return checkGep(dyn_cast<GetElementPtrInst>(Ptr));
+    return checkGep(dyn_cast<GetElementPtrInst>(Ptr), DataType);
   }
 
   return nullptr;
-}
+}
diff --git a/IGC/Compiler/CISACodeGen/GenerateBlockMemOpsPass.hpp b/IGC/Compiler/CISACodeGen/GenerateBlockMemOpsPass.hpp
@@ -38,7 +38,7 @@ class GenerateBlockMemOpsPass : public llvm::FunctionPass {
   virtual bool runOnFunction(llvm::Function &F) override;
 
 private:
-  llvm::Value *checkGep(llvm::Instruction *Gep);
+  llvm::Value *checkGep(llvm::Instruction *Gep, llvm::Type *DataType);
   bool isLocalIdX(const llvm::Value *InputVal);
   bool isR0(const llvm::Value *InputVal);
   bool isDataTypeSupported(llvm::Value *Ptr, llvm::Type *DataType);
@@ -60,4 +60,4 @@ class GenerateBlockMemOpsPass : public llvm::FunctionPass {
   llvm::ScalarEvolution *SE;
   size_t SimdSize = 0;
 };
-} // namespace IGC
+} // namespace IGC
diff --git a/IGC/Compiler/tests/GenerateBlockMemOpsPass/gep_store_type_mismatch.ll b/IGC/Compiler/tests/GenerateBlockMemOpsPass/gep_store_type_mismatch.ll
@@ -0,0 +1,49 @@
+;=========================== begin_copyright_notice ============================
+;
+; Copyright (C) 2025 Intel Corporation
+;
+; SPDX-License-Identifier: MIT
+;
+;============================ end_copyright_notice =============================
+
+; REQUIRES: llvm-16-plus
+; RUN: igc_opt %s --opaque-pointers --platformpvc --generate-block-mem-ops -S --regkey EnableOpaquePointersBackend=1 | FileCheck %s
+; CHECK-NOT: call void @llvm.genx.GenISA.simdBlockWrite
+
+; Make sure that the gep (arrayidx) whose result type (%struct.work_size_data) doesn't match the store type (i32)
+; behaves like the gep (arrayidx2) whose result type matches it, and that neither generates a simdBlockWrite instruction.
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-n8:16:32"
+target triple = "spir64-unknown-unknown"
+
+%struct.work_size_data = type { i32, i32, i32 }
+
+define spir_kernel void @foo(ptr addrspace(1) %data, <8 x i32> %r0, <3 x i32> %globalOffset, <3 x i32> %globalSize, <3 x i32> %localSize, i16 %localIdX, i16 %localIdY, i16 %localIdZ, ptr %privateBase, i32 %bufferOffset) {
+entry:
+  %idxprom = zext i16 %localIdX to i64
+  %arrayidx = getelementptr %struct.work_size_data, ptr addrspace(1) %data, i64 %idxprom
+  store i32 0, ptr addrspace(1) %arrayidx, align 4
+  %arrayidx2 = getelementptr %struct.work_size_data, ptr addrspace(1) %data, i64 %idxprom, i32 0
+  store i32 0, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+}
+
+!igc.functions = !{!0}
+
+!0 = !{ptr @foo, !1}
+!1 = !{!2, !3, !15}
+!2 = !{!"function_type", i32 0}
+!3 = !{!"implicit_arg_desc", !4, !5, !6, !7, !8, !9, !10, !11, !12, !13}
+!4 = !{i32 0}
+!5 = !{i32 2}
+!6 = !{i32 5}
+!7 = !{i32 6}
+!8 = !{i32 7}
+!9 = !{i32 8}
+!10 = !{i32 9}
+!11 = !{i32 10}
+!12 = !{i32 13}
+!13 = !{i32 15, !14}
+!14 = !{!"explicit_arg_num", i32 0}
+!15 = !{!"thread_group_size", i32 64, i32 1, i32 1}
+