Skip to content

Commit 39c1593

Browse files
ppogotovigcbot
authored and committed
Support get_global_linear_id in continuous memory access analysis.
Allow use of the sub instruction when analyzing continuous memory accesses.
1 parent c9572b5 commit 39c1593

File tree

5 files changed

+107
-29
lines changed

5 files changed

+107
-29
lines changed

IGC/Compiler/CISACodeGen/EmitVISAPass.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6788,9 +6788,11 @@ uint32_t EmitPass::getReqBlkBitsForBlockStLd(CallInst *call) {
67886788
if (set.hasAttribute("alignmentrequirements")) {
67896789
llvm::Attribute attr = set.getAttribute("alignmentrequirements");
67906790
llvm::StringRef attrValue = attr.getValueAsString();
6791-
// 4-byte alignment is requied
6791+
// Get required alignment from metadata.
67926792
if (attrValue == "4") {
67936793
return 32;
6794+
} else if (attrValue == "8") {
6795+
return 64;
67946796
}
67956797
}
67966798

IGC/Compiler/CISACodeGen/GenerateBlockMemOpsPass.cpp

Lines changed: 7 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,6 @@ bool GenerateBlockMemOpsPass::runOnFunction(Function &F) {
4949
return false;
5050

5151
bool Changed = false;
52-
5352
// Load / store instructions which are not in code divergence and can be optimized.
5453
SmallVector<Instruction*, 32> LoadStoreToProcess;
5554
// Load / store instructions which are inside the loop and can be optimized.
@@ -94,7 +93,6 @@ bool GenerateBlockMemOpsPass::runOnFunction(Function &F) {
9493
} else if (Loop *L = LI->getLoopFor(I.getParent())) {
9594
// In some cases IGC can't prove that there is no code divergence in the loop.
9695
// Handle these cases here.
97-
9896
// Check that the loop has been already analyzed.
9997
if (LoadStoreInLoop.find(L) == LoadStoreInLoop.end()) {
10098
if (!isLoopPattern(L))
@@ -515,12 +513,12 @@ bool GenerateBlockMemOpsPass::doesLoopHaveExternUse(Loop *L) {
515513
return false;
516514
}
517515

518-
bool GenerateBlockMemOpsPass::isAddressAligned(Value *Ptr, const alignment_t &CurrentAlignment, Type *DataType) {
516+
bool GenerateBlockMemOpsPass::isDataTypeSupported(Value *Ptr, Type *DataType) {
519517
unsigned ScalarSize = DataType->getScalarSizeInBits();
520518

521519
// The list of possible alignments should be expanded.
522520
if (CGCtx->platform.isProductChildOf(IGFX_PVC))
523-
if ((ScalarSize == 32) && (CurrentAlignment == 4))
521+
if (ScalarSize == 32 || ScalarSize == 64)
524522
return true;
525523

526524
return false;
@@ -569,17 +567,19 @@ bool GenerateBlockMemOpsPass::isIndexContinuous(Value *Indx) {
569567
}
570568
VisitedPhi = Phi;
571569
} else if (Instruction *Inst = dyn_cast<Instruction>(NonUnifOp)) {
572-
if (Inst->getOpcode() != Instruction::Add)
570+
if (Inst->getOpcode() != Instruction::Add && Inst->getOpcode() != Instruction::Sub)
573571
return false;
574572

575573
Value *Op0 = Inst->getOperand(0);
576574
Value *Op1 = Inst->getOperand(1);
577575

578-
579576
if (!WI->isUniform(Op1) && !WI->isUniform(Op0))
580577
return false;
581578

582579
if (WI->isUniform(Op0)) {
580+
if (Inst->getOpcode() == Instruction::Sub)
581+
return false;
582+
583583
NonUniformInstVector.push_back(Op1);
584584
} else {
585585
NonUniformInstVector.push_back(Op0);
@@ -629,25 +629,22 @@ bool GenerateBlockMemOpsPass::canOptLoadStore(Instruction *I) {
629629
Value *Ptr = nullptr;
630630
Value *ValOp = nullptr;
631631
Type *DataType = nullptr;
632-
alignment_t CurrentAlignment = 0;
633632

634633
if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
635634
Ptr = LI->getPointerOperand();
636-
CurrentAlignment = IGCLLVM::getAlignmentValue(LI);
637635
DataType = cast<Value>(LI)->getType();
638636
} else {
639637
StoreInst* SI = cast<StoreInst>(I);
640638
Ptr = SI->getPointerOperand();
641639
ValOp = SI->getValueOperand();
642-
CurrentAlignment = IGCLLVM::getAlignmentValue(SI);
643640
DataType = ValOp->getType();
644641
}
645642

646643
if (DataType->isVectorTy())
647644
return false;
648645

649646
// Need to check what alignment block load/store requires for the specific architecture.
650-
if (!isAddressAligned(Ptr, CurrentAlignment, DataType))
647+
if (!isDataTypeSupported(Ptr, DataType))
651648
return false;
652649

653650
// Get the last index from the getelementptr instruction if it is not uniform in the subgroup.

IGC/Compiler/CISACodeGen/GenerateBlockMemOpsPass.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ class GenerateBlockMemOpsPass : public llvm::FunctionPass
4444
llvm::Value *checkGep(llvm::Instruction *Gep);
4545
bool isLocalIdX(const llvm::Value *InputVal);
4646
bool isR0(const llvm::Value *InputVal);
47-
bool isAddressAligned(llvm::Value *Ptr, const alignment_t &CurrentAlignment, llvm::Type *DataType);
47+
bool isDataTypeSupported(llvm::Value *Ptr, llvm::Type *DataType);
4848
bool isIndexContinuous(llvm::Value *Addr);
4949
bool checkVectorizationAlongX(llvm::Function *F);
5050
bool checkLoopPhiVals(llvm::Loop *L);

IGC/Compiler/tests/GenerateBlockMemOpsPass/block_read_write_check-typed-pointers.ll

Lines changed: 47 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,9 @@
1010
; RUN: igc_opt %s -S -o - -generate-block-mem-ops -platformpvc | FileCheck %s
1111

1212
define spir_kernel void @testYZUnif(float addrspace(1)* %out, float addrspace(1)* %in, <8 x i32> %r0, <8 x i32> %payloadHeader, <3 x i32> %localSize, i16 %localIdX, i16 %localIdY, i16 %localIdZ, i32 %bufferOffset, i32 %bufferOffset1) {
13+
14+
; CHECK-LABEL: @testYZUnif(
15+
1316
entry:
1417
%0 = extractelement <3 x i32> %localSize, i64 0
1518
%1 = extractelement <3 x i32> %localSize, i64 1
@@ -24,12 +27,12 @@ entry:
2427
%arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %conv.i
2528
%2 = load float, float addrspace(1)* %arrayidx, align 4
2629

27-
; CHECK: [[TMP0:%.*]] = call float @llvm.genx.GenISA.simdBlockRead.f32.p1f32(float addrspace(1)* %arrayidx) [[ATTR_NUM:#.*]]
30+
; CHECK: [[TMP0:%.*]] = call float @llvm.genx.GenISA.simdBlockRead.f32.p1f32(float addrspace(1)* %arrayidx) [[ATTR_NUM1:#.*]]
2831

2932
%arrayidx1 = getelementptr inbounds float, float addrspace(1)* %out, i64 %conv.i
3033
store float %2, float addrspace(1)* %arrayidx1, align 4
3134

32-
; CHECK: call void @llvm.genx.GenISA.simdBlockWrite.p1f32.f32(float addrspace(1)* %arrayidx1, float [[TMP0]]) [[ATTR_NUM]]
35+
; CHECK: call void @llvm.genx.GenISA.simdBlockWrite.p1f32.f32(float addrspace(1)* %arrayidx1, float [[TMP0]]) [[ATTR_NUM1]]
3336

3437
ret void
3538
}
@@ -68,11 +71,41 @@ entry:
6871

6972
}
7073

74+
; Check that 8-byte block loads/writes are supported by the optimization.
75+
76+
define spir_kernel void @test8ByteBlockOps(double addrspace(1)* align 8 %0, double addrspace(1)* align 8 %1, <8 x i32> %r0, <8 x i32> %payloadHeader, <3 x i32> %enqueuedLocalSize, i16 %localIdX, i16 %localIdY, i16 %localIdZ, i32 %bufferOffset, i32 %bufferOffset1) {
77+
78+
; CHECK-LABEL: @test8ByteBlockOps(
79+
80+
entry:
81+
%extr1 = extractelement <8 x i32> %payloadHeader, i64 0
82+
%extr2 = extractelement <8 x i32> %r0, i64 1
83+
%shl1 = shl i32 %extr2, 5
84+
%localIdX2 = zext i16 %localIdX to i32
85+
%add1 = add i32 %shl1, %localIdX2
86+
%add2 = add i32 %add1, %extr1
87+
%z1 = zext i32 %add1 to i64
88+
%z2 = zext i32 %extr1 to i64
89+
%sub1 = sub nsw i64 %z1, %z2
90+
%gep1 = getelementptr inbounds double, double addrspace(1)* %0, i64 %sub1
91+
%ld1 = load double, double addrspace(1)* %gep1, align 8
92+
93+
; CHECK: [[TMP1:%.*]] = call double @llvm.genx.GenISA.simdBlockRead.f64.p1f64(double addrspace(1)* %gep1) [[ATTR_NUM2:#.*]]
94+
95+
%gep2 = getelementptr inbounds double, double addrspace(1)* %1, i64 %sub1
96+
store double %ld1, double addrspace(1)* %gep2, align 8
97+
98+
; CHECK: call void @llvm.genx.GenISA.simdBlockWrite.p1f64.f64(double addrspace(1)* %gep2, double [[TMP1]]) [[ATTR_NUM2]]
99+
100+
ret void
101+
}
102+
71103
define spir_kernel void @testYZUnifLoop(float addrspace(1)* %out, float addrspace(1)* %in, <8 x i32> %r0, <8 x i32> %payloadHeader, <3 x i32> %localSize, i16 %localIdX, i16 %localIdY, i16 %localIdZ, i32 %bufferOffset, i64 %limit) {
104+
; CHECK-LABEL: @testYZUnifLoop(
72105
; CHECK: %{{.*}} = load
73106
; CHECK: store
74-
; CHECK: [[TMP0:%.*]] = call float @llvm.genx.GenISA.simdBlockRead.f32.p1f32(float addrspace(1)* %{{.*}}) [[ATTR_NUM]]
75-
; CHECK: call void @llvm.genx.GenISA.simdBlockWrite.p1f32.f32(float addrspace(1)* %{{.*}}, float [[TMP0]]) [[ATTR_NUM]]
107+
; CHECK: [[TMP0:%.*]] = call float @llvm.genx.GenISA.simdBlockRead.f32.p1f32(float addrspace(1)* %{{.*}}) [[ATTR_NUM1]]
108+
; CHECK: call void @llvm.genx.GenISA.simdBlockWrite.p1f32.f32(float addrspace(1)* %{{.*}}, float [[TMP0]]) [[ATTR_NUM1]]
76109
entry:
77110
%offset = extractelement <8 x i32> %payloadHeader, i64 0
78111
%groupNumX = extractelement <8 x i32> %r0, i64 1
@@ -100,17 +133,20 @@ terminator:
100133
ret void
101134
}
102135

103-
; CHECK: attributes #2 = { "alignmentrequirements"="4" }
136+
; CHECK: attributes [[ATTR_NUM1]] = { "alignmentrequirements"="4" }
137+
; CHECK: attributes [[ATTR_NUM2]] = { "alignmentrequirements"="8" }
104138

105-
!igc.functions = !{!1, !2, !3}
139+
!igc.functions = !{!1, !2, !3, !4}
106140
!IGCMetadata = !{!19}
107141

108142
!1 = !{void (float addrspace(1)*, float addrspace(1)*, <8 x i32>, <8 x i32>, <3 x i32>, i16, i16, i16, i32, i32)* @testYZUnif, !41}
109143
!2 = !{void (float addrspace(1)*, float addrspace(1)*, <8 x i32>, <8 x i32>, <3 x i32>, i16, i16, i16, i32, i32)* @testNoUnif, !42}
110144
!3 = !{void (float addrspace(1)*, float addrspace(1)*, <8 x i32>, <8 x i32>, <3 x i32>, i16, i16, i16, i32, i64)* @testYZUnifLoop, !43}
145+
!4 = !{void (double addrspace(1)*, double addrspace(1)*, <8 x i32>, <8 x i32>, <3 x i32>, i16, i16, i16, i32, i32)* @test8ByteBlockOps, !44}
111146
!41 = !{!5, !6, !17}
112147
!42 = !{!5, !6}
113148
!43 = !{!5, !6, !17}
149+
!44 = !{!5, !6, !17}
114150
!5 = !{!"function_type", i32 0}
115151
!6 = !{!"implicit_arg_desc", !7, !8, !9, !10, !11, !12, !13, !15}
116152
!7 = !{i32 0}
@@ -125,22 +161,24 @@ terminator:
125161
!16 = !{!"explicit_arg_num", i32 1}
126162

127163
; This metadata provides information about the size of the work group.
128-
; The IGC can generate block memory instructions only if data access is contiguous across the workgroup.
164+
; IGC can generate block memory instructions only if data access is contiguous across the workgroup.
129165
; This requires that the workgroup be completely vectorized along the x-axis, in other words local_size_x % 32 == 0 (case !17).
130166

131167
!17 = !{!"thread_group_size", i32 32, i32 32, i32 32}
132168

133-
; IGC cannot apply the optimization in the !18 case because local_size_x % 32 != 0.
169+
; IGC cannot apply the optimization in the !18 case because local_size_x % 32 (simd size) != 0.
134170

135171
!18 = !{!"thread_group_size", i32 16, i32 32, i32 32}
136172
!19 = !{!"ModuleMD", !112}
137-
!112 = !{!"FuncMD", !113, !114, !333, !334, !335, !336}
173+
!112 = !{!"FuncMD", !113, !114, !333, !334, !335, !336, !337, !338}
138174
!113 = !{!"FuncMDMap[0]", void (float addrspace(1)*, float addrspace(1)*, <8 x i32>, <8 x i32>, <3 x i32>, i16, i16, i16, i32, i32)* @testYZUnif}
139175
!114 = !{!"FuncMDValue[0]", !116}
140176
!333 = !{!"FuncMDMap[1]", void (float addrspace(1)*, float addrspace(1)*, <8 x i32>, <8 x i32>, <3 x i32>, i16, i16, i16, i32, i32)* @testNoUnif}
141177
!334 = !{!"FuncMDValue[1]", !116}
142178
!335 = !{!"FuncMDMap[2]", void (float addrspace(1)*, float addrspace(1)*, <8 x i32>, <8 x i32>, <3 x i32>, i16, i16, i16, i32, i64)* @testYZUnifLoop}
143179
!336 = !{!"FuncMDValue[2]", !116}
180+
!337 = !{!"FuncMDMap[3]", void (double addrspace(1)*, double addrspace(1)*, <8 x i32>, <8 x i32>, <3 x i32>, i16, i16, i16, i32, i32)* @test8ByteBlockOps}
181+
!338 = !{!"FuncMDValue[3]", !116}
144182
!116 = !{!"workGroupWalkOrder", !117, !118, !119}
145183
!117 = !{!"dim0", i32 0}
146184
!118 = !{!"dim1", i32 1}

IGC/Compiler/tests/GenerateBlockMemOpsPass/block_read_write_check.ll

Lines changed: 49 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,9 @@
1111
; RUN: igc_opt --opaque-pointers %s -S -o - -generate-block-mem-ops -platformpvc | FileCheck %s
1212

1313
define spir_kernel void @testYZUnif(ptr addrspace(1) %out, ptr addrspace(1) %in, <8 x i32> %r0, <8 x i32> %payloadHeader, <3 x i32> %localSize, i16 %localIdX, i16 %localIdY, i16 %localIdZ, i32 %bufferOffset, i32 %bufferOffset1) {
14+
15+
; CHECK-LABEL: @testYZUnif(
16+
1417
entry:
1518
%0 = extractelement <3 x i32> %localSize, i64 0
1619
%1 = extractelement <3 x i32> %localSize, i64 1
@@ -25,12 +28,12 @@ entry:
2528
%arrayidx = getelementptr inbounds float, ptr addrspace(1) %in, i64 %conv.i
2629
%2 = load float, ptr addrspace(1) %arrayidx, align 4
2730

28-
; CHECK: [[TMP0:%.*]] = call float @llvm.genx.GenISA.simdBlockRead.f32.p1(ptr addrspace(1) %arrayidx) [[ATTR_NUM:#.*]]
31+
; CHECK: [[TMP0:%.*]] = call float @llvm.genx.GenISA.simdBlockRead.f32.p1(ptr addrspace(1) %arrayidx) [[ATTR_NUM1:#.*]]
2932

3033
%arrayidx1 = getelementptr inbounds float, ptr addrspace(1) %out, i64 %conv.i
3134
store float %2, ptr addrspace(1) %arrayidx1, align 4
3235

33-
; CHECK: call void @llvm.genx.GenISA.simdBlockWrite.p1.f32(ptr addrspace(1) %arrayidx1, float [[TMP0]]) [[ATTR_NUM]]
36+
; CHECK: call void @llvm.genx.GenISA.simdBlockWrite.p1.f32(ptr addrspace(1) %arrayidx1, float [[TMP0]]) [[ATTR_NUM1]]
3437

3538
ret void
3639
}
@@ -65,15 +68,47 @@ entry:
6568

6669
ret void
6770

68-
; CHECK: ret void
71+
; CHECK: ret void
72+
73+
}
74+
75+
; Check that 8-byte block loads/writes are supported by the optimization.
76+
77+
define spir_kernel void @test8ByteBlockOps(ptr addrspace(1) %0, ptr addrspace(1) %1, <8 x i32> %r0, <8 x i32> %payloadHeader, <3 x i32> %enqueuedLocalSize, i16 %localIdX, i16 %localIdY, i16 %localIdZ, i32 %bufferOffset, i32 %bufferOffset1) {
78+
79+
; CHECK-LABEL: @test8ByteBlockOps(
80+
81+
entry:
82+
%extr1 = extractelement <8 x i32> %payloadHeader, i64 0
83+
%extr2 = extractelement <8 x i32> %r0, i64 1
84+
%shl1 = shl i32 %extr2, 5
85+
%localIdX2 = zext i16 %localIdX to i32
86+
%add1 = add i32 %shl1, %localIdX2
87+
%add2 = add i32 %add1, %extr1
88+
%z1 = zext i32 %add1 to i64
89+
%z2 = zext i32 %extr1 to i64
90+
%sub1 = sub nsw i64 %z1, %z2
91+
%gep1 = getelementptr inbounds double, ptr addrspace(1) %0, i64 %sub1
92+
%ld1 = load double, ptr addrspace(1) %gep1, align 8
93+
94+
; CHECK: [[TMP1:%.*]] = call double @llvm.genx.GenISA.simdBlockRead.f64.p1(ptr addrspace(1) %gep1) [[ATTR_NUM2:#.*]]
95+
96+
%gep2 = getelementptr inbounds double, ptr addrspace(1) %1, i64 %sub1
97+
store double %ld1, ptr addrspace(1) %gep2, align 8
98+
99+
; CHECK: call void @llvm.genx.GenISA.simdBlockWrite.p1.f64(ptr addrspace(1) %gep2, double [[TMP1]]) [[ATTR_NUM2]]
69100

101+
ret void
70102
}
71103

72104
define spir_kernel void @testYZUnifLoop(ptr addrspace(1) %out, ptr addrspace(1) %in, <8 x i32> %r0, <8 x i32> %payloadHeader, <3 x i32> %localSize, i16 %localIdX, i16 %localIdY, i16 %localIdZ, i32 %bufferOffset, i64 %limit) {
105+
106+
; CHECK-LABEL: @testYZUnifLoop(
73107
; CHECK: %{{.*}} = load
74108
; CHECK: store
75-
; CHECK: [[TMP0:%.*]] = call float @llvm.genx.GenISA.simdBlockRead.f32.p1(ptr addrspace(1) %{{.*}}) [[ATTR_NUM]]
76-
; CHECK: call void @llvm.genx.GenISA.simdBlockWrite.p1.f32(ptr addrspace(1) %{{.*}}, float [[TMP0]]) [[ATTR_NUM]]
109+
; CHECK: [[TMP0:%.*]] = call float @llvm.genx.GenISA.simdBlockRead.f32.p1(ptr addrspace(1) %{{.*}}) [[ATTR_NUM1]]
110+
; CHECK: call void @llvm.genx.GenISA.simdBlockWrite.p1.f32(ptr addrspace(1) %{{.*}}, float [[TMP0]]) [[ATTR_NUM1]]
111+
77112
entry:
78113
%offset = extractelement <8 x i32> %payloadHeader, i64 0
79114
%groupNumX = extractelement <8 x i32> %r0, i64 1
@@ -101,17 +136,21 @@ terminator:
101136
ret void
102137
}
103138

104-
; CHECK: attributes #2 = { "alignmentrequirements"="4" }
139+
; CHECK: attributes [[ATTR_NUM1]] = { "alignmentrequirements"="4" }
140+
; CHECK: attributes [[ATTR_NUM2]] = { "alignmentrequirements"="8" }
141+
105142

106-
!igc.functions = !{!1, !2, !3}
143+
!igc.functions = !{!1, !2, !3, !4}
107144
!IGCMetadata = !{!19}
108145

109146
!1 = !{ptr @testYZUnif, !41}
110147
!2 = !{ptr @testNoUnif, !42}
111148
!3 = !{ptr @testYZUnifLoop, !43}
149+
!4 = !{ptr @test8ByteBlockOps, !44}
112150
!41 = !{!5, !6, !17}
113151
!42 = !{!5, !6}
114152
!43 = !{!5, !6, !17}
153+
!44 = !{!5, !6, !17}
115154
!5 = !{!"function_type", i32 0}
116155
!6 = !{!"implicit_arg_desc", !7, !8, !9, !10, !11, !12, !13, !15}
117156
!7 = !{i32 0}
@@ -135,13 +174,15 @@ terminator:
135174

136175
!18 = !{!"thread_group_size", i32 16, i32 32, i32 32}
137176
!19 = !{!"ModuleMD", !112}
138-
!112 = !{!"FuncMD", !113, !114, !333, !334, !335, !336}
177+
!112 = !{!"FuncMD", !113, !114, !333, !334, !335, !336, !337, !338}
139178
!113 = !{!"FuncMDMap[0]", ptr @testYZUnif}
140179
!114 = !{!"FuncMDValue[0]", !116}
141180
!333 = !{!"FuncMDMap[1]", ptr @testNoUnif}
142181
!334 = !{!"FuncMDValue[1]", !116}
143182
!335 = !{!"FuncMDMap[2]", ptr @testYZUnifLoop}
144183
!336 = !{!"FuncMDValue[2]", !116}
184+
!337 = !{!"FuncMDMap[3]", ptr @test8ByteBlockOps}
185+
!338 = !{!"FuncMDValue[3]", !116}
145186
!116 = !{!"workGroupWalkOrder", !117, !118, !119}
146187
!117 = !{!"dim0", i32 0}
147188
!118 = !{!"dim1", i32 1}

0 commit comments

Comments
 (0)