1010; RUN: igc_opt %s -S -o - -generate-block-mem-ops -platformpvc | FileCheck %s
1111
1212define spir_kernel void @testYZUnif (float addrspace (1 )* %out , float addrspace (1 )* %in , <8 x i32 > %r0 , <8 x i32 > %payloadHeader , <3 x i32 > %localSize , i16 %localIdX , i16 %localIdY , i16 %localIdZ , i32 %bufferOffset , i32 %bufferOffset1 ) {
13+
14+ ; CHECK-LABEL: @testYZUnif(
15+
1316entry:
1417 %0 = extractelement <3 x i32 > %localSize , i64 0
1518 %1 = extractelement <3 x i32 > %localSize , i64 1
@@ -24,12 +27,12 @@ entry:
2427 %arrayidx = getelementptr inbounds float , float addrspace (1 )* %in , i64 %conv.i
2528 %2 = load float , float addrspace (1 )* %arrayidx , align 4
2629
27- ; CHECK: [[TMP0:%.*]] = call float @llvm.genx.GenISA.simdBlockRead.f32.p1f32(float addrspace(1)* %arrayidx) [[ATTR_NUM :#.*]]
30+ ; CHECK: [[TMP0:%.*]] = call float @llvm.genx.GenISA.simdBlockRead.f32.p1f32(float addrspace(1)* %arrayidx) [[ATTR_NUM1 :#.*]]
2831
2932 %arrayidx1 = getelementptr inbounds float , float addrspace (1 )* %out , i64 %conv.i
3033 store float %2 , float addrspace (1 )* %arrayidx1 , align 4
3134
32- ; CHECK: call void @llvm.genx.GenISA.simdBlockWrite.p1f32.f32(float addrspace(1)* %arrayidx1, float [[TMP0]]) [[ATTR_NUM ]]
35+ ; CHECK: call void @llvm.genx.GenISA.simdBlockWrite.p1f32.f32(float addrspace(1)* %arrayidx1, float [[TMP0]]) [[ATTR_NUM1 ]]
3336
3437 ret void
3538}
@@ -68,11 +71,41 @@ entry:
6871
6972}
7073
74+ ; Check that 8-byte block loads/writes are supported by the optimization.
75+
; Kernel that copies one double from buffer %0 to buffer %1 at the same element index.
; The index is computed as (r0[1] << 5) + zext(localIdX) - payloadHeader[0], evaluated in i64.
; Both pointer arguments are declared "align 8" and the load/store use "align 8".
76+ define spir_kernel void @test8ByteBlockOps (double addrspace (1 )* align 8 %0 , double addrspace (1 )* align 8 %1 , <8 x i32 > %r0 , <8 x i32 > %payloadHeader , <3 x i32 > %enqueuedLocalSize , i16 %localIdX , i16 %localIdY , i16 %localIdZ , i32 %bufferOffset , i32 %bufferOffset1 ) {
77+
78+ ; CHECK-LABEL: @test8ByteBlockOps(
79+
80+ entry:
; %extr1 is element 0 of payloadHeader; %extr2 is element 1 of r0.
; NOTE(review): presumably these are the global offset and the X group id -- confirm against the implicit-arg layout.
81+ %extr1 = extractelement <8 x i32 > %payloadHeader , i64 0
82+ %extr2 = extractelement <8 x i32 > %r0 , i64 1
; Scale %extr2 by 32 (shift left by 5), then add the zero-extended local X id.
83+ %shl1 = shl i32 %extr2 , 5
84+ %localIdX2 = zext i16 %localIdX to i32
85+ %add1 = add i32 %shl1 , %localIdX2
; NOTE(review): %add2 is not referenced by any later instruction in this function; %z1 below extends %add1, not %add2.
86+ %add2 = add i32 %add1 , %extr1
; Widen both operands to i64 and subtract to form the element index shared by the load and the store.
87+ %z1 = zext i32 %add1 to i64
88+ %z2 = zext i32 %extr1 to i64
89+ %sub1 = sub nsw i64 %z1 , %z2
90+ %gep1 = getelementptr inbounds double , double addrspace (1 )* %0 , i64 %sub1
91+ %ld1 = load double , double addrspace (1 )* %gep1 , align 8
92+
; The directive below expects the load above to be replaced by a simdBlockRead call on %gep1.
; The call's attribute group is captured as ATTR_NUM2 for reuse by later directives.
93+ ; CHECK: [[TMP1:%.*]] = call double @llvm.genx.GenISA.simdBlockRead.f64.p1f64(double addrspace(1)* %gep1) [[ATTR_NUM2:#.*]]
94+
95+ %gep2 = getelementptr inbounds double , double addrspace (1 )* %1 , i64 %sub1
96+ store double %ld1 , double addrspace (1 )* %gep2 , align 8
97+
; The directive below expects the store above to become a simdBlockWrite of the block-read result,
; carrying the same attribute group that was captured for the read.
98+ ; CHECK: call void @llvm.genx.GenISA.simdBlockWrite.p1f64.f64(double addrspace(1)* %gep2, double [[TMP1]]) [[ATTR_NUM2]]
99+
100+ ret void
101+ }
102+
71103define spir_kernel void @testYZUnifLoop (float addrspace (1 )* %out , float addrspace (1 )* %in , <8 x i32 > %r0 , <8 x i32 > %payloadHeader , <3 x i32 > %localSize , i16 %localIdX , i16 %localIdY , i16 %localIdZ , i32 %bufferOffset , i64 %limit ) {
104+ ; CHECK-LABEL: @testYZUnifLoop(
72105; CHECK: %{{.*}} = load
73106; CHECK: store
74- ; CHECK: [[TMP0:%.*]] = call float @llvm.genx.GenISA.simdBlockRead.f32.p1f32(float addrspace(1)* %{{.*}}) [[ATTR_NUM ]]
75- ; CHECK: call void @llvm.genx.GenISA.simdBlockWrite.p1f32.f32(float addrspace(1)* %{{.*}}, float [[TMP0]]) [[ATTR_NUM ]]
107+ ; CHECK: [[TMP0:%.*]] = call float @llvm.genx.GenISA.simdBlockRead.f32.p1f32(float addrspace(1)* %{{.*}}) [[ATTR_NUM1 ]]
108+ ; CHECK: call void @llvm.genx.GenISA.simdBlockWrite.p1f32.f32(float addrspace(1)* %{{.*}}, float [[TMP0]]) [[ATTR_NUM1 ]]
76109entry:
77110 %offset = extractelement <8 x i32 > %payloadHeader , i64 0
78111 %groupNumX = extractelement <8 x i32 > %r0 , i64 1
@@ -100,17 +133,20 @@ terminator:
100133 ret void
101134}
102135
103- ; CHECK: attributes #2 = { "alignmentrequirements"="4" }
136+ ; CHECK: attributes [[ATTR_NUM1]] = { "alignmentrequirements"="4" }
137+ ; CHECK: attributes [[ATTR_NUM2]] = { "alignmentrequirements"="8" }
104138
105- !igc.functions = !{!1 , !2 , !3 }
139+ !igc.functions = !{!1 , !2 , !3 , !4 }
106140!IGCMetadata = !{!19 }
107141
108142!1 = !{void (float addrspace (1 )*, float addrspace (1 )*, <8 x i32 >, <8 x i32 >, <3 x i32 >, i16 , i16 , i16 , i32 , i32 )* @testYZUnif , !41 }
109143!2 = !{void (float addrspace (1 )*, float addrspace (1 )*, <8 x i32 >, <8 x i32 >, <3 x i32 >, i16 , i16 , i16 , i32 , i32 )* @testNoUnif , !42 }
110144!3 = !{void (float addrspace (1 )*, float addrspace (1 )*, <8 x i32 >, <8 x i32 >, <3 x i32 >, i16 , i16 , i16 , i32 , i64 )* @testYZUnifLoop , !43 }
145+ !4 = !{void (double addrspace (1 )*, double addrspace (1 )*, <8 x i32 >, <8 x i32 >, <3 x i32 >, i16 , i16 , i16 , i32 , i32 )* @test8ByteBlockOps , !44 }
111146!41 = !{!5 , !6 , !17 }
112147!42 = !{!5 , !6 }
113148!43 = !{!5 , !6 , !17 }
149+ !44 = !{!5 , !6 , !17 }
114150!5 = !{!"function_type" , i32 0 }
115151!6 = !{!"implicit_arg_desc" , !7 , !8 , !9 , !10 , !11 , !12 , !13 , !15 }
116152!7 = !{i32 0 }
@@ -125,22 +161,24 @@ terminator:
125161!16 = !{!"explicit_arg_num" , i32 1 }
126162
127163; This metadata provides information about the size of the work group.
128- ; The IGC can generate block memory instructions only if data access is contiguous across the workgroup.
164+ ; IGC can generate block memory instructions only if data access is contiguous across the workgroup.
129165; This requires that the workgroup be completely vectorized along the x-axis, in other words local_size_x % 32 == 0 (case !17).
130166
131167!17 = !{!"thread_group_size" , i32 32 , i32 32 , i32 32 }
132168
133- ; IGC cannot apply the optimization in the !18 case because local_size_x % 32 != 0.
169+ ; IGC cannot apply the optimization in the !18 case because local_size_x % 32 (simd size) != 0.
134170
135171!18 = !{!"thread_group_size" , i32 16 , i32 32 , i32 32 }
136172!19 = !{!"ModuleMD" , !112 }
137- !112 = !{!"FuncMD" , !113 , !114 , !333 , !334 , !335 , !336 }
173+ !112 = !{!"FuncMD" , !113 , !114 , !333 , !334 , !335 , !336 , !337 , !338 }
138174!113 = !{!"FuncMDMap[0]" , void (float addrspace (1 )*, float addrspace (1 )*, <8 x i32 >, <8 x i32 >, <3 x i32 >, i16 , i16 , i16 , i32 , i32 )* @testYZUnif }
139175!114 = !{!"FuncMDValue[0]" , !116 }
140176!333 = !{!"FuncMDMap[1]" , void (float addrspace (1 )*, float addrspace (1 )*, <8 x i32 >, <8 x i32 >, <3 x i32 >, i16 , i16 , i16 , i32 , i32 )* @testNoUnif }
141177!334 = !{!"FuncMDValue[1]" , !116 }
142178!335 = !{!"FuncMDMap[2]" , void (float addrspace (1 )*, float addrspace (1 )*, <8 x i32 >, <8 x i32 >, <3 x i32 >, i16 , i16 , i16 , i32 , i64 )* @testYZUnifLoop }
143179!336 = !{!"FuncMDValue[2]" , !116 }
180+ !337 = !{!"FuncMDMap[3]" , void (double addrspace (1 )*, double addrspace (1 )*, <8 x i32 >, <8 x i32 >, <3 x i32 >, i16 , i16 , i16 , i32 , i32 )* @test8ByteBlockOps }
181+ !338 = !{!"FuncMDValue[3]" , !116 }
144182!116 = !{!"workGroupWalkOrder" , !117 , !118 , !119 }
145183!117 = !{!"dim0" , i32 0 }
146184!118 = !{!"dim1" , i32 1 }
0 commit comments