Skip to content

Commit ce7f572

Browse files
committed
[AMDGPU] Simplify dpp builtin handling
DPP intrinsics can now handle any type, so there is no need to cast to an integer. The caveat is that the intrinsics only handle backend-legal types, so this does not work with i8, for example.
1 parent dc5e6fe commit ce7f572

File tree

3 files changed

+38
-75
lines changed

3 files changed

+38
-75
lines changed

clang/lib/CodeGen/CGBuiltin.cpp

Lines changed: 5 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -19193,37 +19193,24 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1919319193
getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
1919419194
assert(Error == ASTContext::GE_None && "Should not codegen an error");
1919519195
llvm::Type *DataTy = ConvertType(E->getArg(0)->getType());
19196-
unsigned Size = DataTy->getPrimitiveSizeInBits();
19197-
llvm::Type *IntTy =
19198-
llvm::IntegerType::get(Builder.getContext(), std::max(Size, 32u));
1919919196
Function *F =
1920019197
CGM.getIntrinsic(BuiltinID == AMDGPU::BI__builtin_amdgcn_mov_dpp8
1920119198
? Intrinsic::amdgcn_mov_dpp8
1920219199
: Intrinsic::amdgcn_update_dpp,
19203-
IntTy);
19200+
DataTy);
1920419201
assert(E->getNumArgs() == 5 || E->getNumArgs() == 6 ||
1920519202
E->getNumArgs() == 2);
1920619203
bool InsertOld = BuiltinID == AMDGPU::BI__builtin_amdgcn_mov_dpp;
1920719204
if (InsertOld)
19208-
Args.push_back(llvm::PoisonValue::get(IntTy));
19209-
for (unsigned I = 0; I != E->getNumArgs(); ++I) {
19205+
Args.push_back(llvm::PoisonValue::get(DataTy));
19206+
Args.push_back(EmitScalarOrConstFoldImmArg(ICEArguments, 0, E));
19207+
for (unsigned I = 1; I != E->getNumArgs(); ++I) {
1921019208
llvm::Value *V = EmitScalarOrConstFoldImmArg(ICEArguments, I, E);
19211-
if (I < (BuiltinID == AMDGPU::BI__builtin_amdgcn_update_dpp ? 2u : 1u) &&
19212-
Size < 32) {
19213-
if (!DataTy->isIntegerTy())
19214-
V = Builder.CreateBitCast(
19215-
V, llvm::IntegerType::get(Builder.getContext(), Size));
19216-
V = Builder.CreateZExtOrBitCast(V, IntTy);
19217-
}
1921819209
llvm::Type *ExpTy =
1921919210
F->getFunctionType()->getFunctionParamType(I + InsertOld);
1922019211
Args.push_back(Builder.CreateTruncOrBitCast(V, ExpTy));
1922119212
}
19222-
Value *V = Builder.CreateCall(F, Args);
19223-
if (Size < 32 && !DataTy->isIntegerTy())
19224-
V = Builder.CreateTrunc(
19225-
V, llvm::IntegerType::get(Builder.getContext(), Size));
19226-
return Builder.CreateTruncOrBitCast(V, DataTy);
19213+
return Builder.CreateCall(F, Args);
1922719214
}
1922819215
case AMDGPU::BI__builtin_amdgcn_permlane16:
1922919216
case AMDGPU::BI__builtin_amdgcn_permlanex16:

clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl

Lines changed: 11 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -36,45 +36,37 @@ void test_mov_dpp8_long(global long* out, long a) {
3636
}
3737

3838
// CHECK-LABEL: @test_mov_dpp8_float(
39-
// CHECK: %0 = bitcast float %a to i32
40-
// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.mov.dpp8.i32(i32 %0, i32 1)
41-
// CHECK-NEXT: store i32 %1,
39+
// CHECK: %0 = tail call{{.*}} float @llvm.amdgcn.mov.dpp8.f32(float %a, i32 1)
40+
// CHECK-NEXT: store float %0,
4241
void test_mov_dpp8_float(global float* out, float a) {
4342
*out = __builtin_amdgcn_mov_dpp8(a, 1);
4443
}
4544

4645
// CHECK-LABEL: @test_mov_dpp8_double
47-
// CHECK: %0 = bitcast double %x to i64
48-
// CHECK-NEXT: %1 = tail call{{.*}} i64 @llvm.amdgcn.mov.dpp8.i64(i64 %0, i32 1)
49-
// CHECK-NEXT: store i64 %1,
46+
// CHECK: %0 = tail call{{.*}} double @llvm.amdgcn.mov.dpp8.f64(double %x, i32 1)
47+
// CHECK-NEXT: store double %0,
5048
void test_mov_dpp8_double(double x, global double *p) {
5149
*p = __builtin_amdgcn_mov_dpp8(x, 1);
5250
}
5351

5452
// CHECK-LABEL: @test_mov_dpp8_short
55-
// CHECK: %0 = zext i16 %x to i32
56-
// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.mov.dpp8.i32(i32 %0, i32 1)
57-
// CHECK-NEXT: %2 = trunc i32 %1 to i16
58-
// CHECK-NEXT: store i16 %2,
53+
// CHECK: %0 = tail call{{.*}} i16 @llvm.amdgcn.mov.dpp8.i16(i16 %x, i32 1)
54+
// CHECK-NEXT: store i16 %0,
5955
void test_mov_dpp8_short(short x, global short *p) {
6056
*p = __builtin_amdgcn_mov_dpp8(x, 1);
6157
}
6258

6359
// CHECK-LABEL: @test_mov_dpp8_char
64-
// CHECK: %0 = zext i8 %x to i32
65-
// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.mov.dpp8.i32(i32 %0, i32 1)
66-
// CHECK-NEXT: %2 = trunc i32 %1 to i8
67-
// CHECK-NEXT: store i8 %2,
60+
// CHECK: %0 = tail call{{.*}} i8 @llvm.amdgcn.mov.dpp8.i8(i8 %x, i32 1)
61+
// CHECK-NEXT: store i8 %0,
6862
void test_mov_dpp8_char(char x, global char *p) {
6963
*p = __builtin_amdgcn_mov_dpp8(x, 1);
7064
}
7165

7266
// CHECK-LABEL: @test_mov_dpp8_half
73-
// CHECK: %0 = load i16,
74-
// CHECK: %1 = zext i16 %0 to i32
75-
// CHECK-NEXT: %2 = tail call{{.*}} i32 @llvm.amdgcn.mov.dpp8.i32(i32 %1, i32 1)
76-
// CHECK-NEXT: %3 = trunc i32 %2 to i16
77-
// CHECK-NEXT: store i16 %3,
67+
// CHECK: %0 = load half,
68+
// CHECK-NEXT: %1 = tail call{{.*}} half @llvm.amdgcn.mov.dpp8.f16(half %0, i32 1)
69+
// CHECK-NEXT: store half %1,
7870
void test_mov_dpp8_half(half *x, global half *p) {
7971
*p = __builtin_amdgcn_mov_dpp8(*x, 1);
8072
}

clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl

Lines changed: 22 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -117,45 +117,37 @@ void test_mov_dpp_long(long x, global long *p) {
117117
}
118118

119119
// CHECK-LABEL: @test_mov_dpp_float
120-
// CHECK: %0 = bitcast float %x to i32
121-
// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 %0, i32 257, i32 15, i32 15, i1 false)
122-
// CHECK-NEXT: store i32 %1,
120+
// CHECK: %0 = tail call{{.*}} float @llvm.amdgcn.update.dpp.f32(float poison, float %x, i32 257, i32 15, i32 15, i1 false)
121+
// CHECK-NEXT: store float %0,
123122
void test_mov_dpp_float(float x, global float *p) {
124123
*p = __builtin_amdgcn_mov_dpp(x, 0x101, 0xf, 0xf, 0);
125124
}
126125

127126
// CHECK-LABEL: @test_mov_dpp_double
128-
// CHECK: %0 = bitcast double %x to i64
129-
// CHECK-NEXT: %1 = tail call{{.*}} i64 @llvm.amdgcn.update.dpp.i64(i64 poison, i64 %0, i32 257, i32 15, i32 15, i1 false)
130-
// CHECK-NEXT: store i64 %1,
127+
// CHECK: %0 = tail call{{.*}} double @llvm.amdgcn.update.dpp.f64(double poison, double %x, i32 257, i32 15, i32 15, i1 false)
128+
// CHECK-NEXT: store double %0,
131129
void test_mov_dpp_double(double x, global double *p) {
132130
*p = __builtin_amdgcn_mov_dpp(x, 0x101, 0xf, 0xf, 0);
133131
}
134132

135133
// CHECK-LABEL: @test_mov_dpp_short
136-
// CHECK: %0 = zext i16 %x to i32
137-
// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 %0, i32 257, i32 15, i32 15, i1 false)
138-
// CHECK-NEXT: %2 = trunc i32 %1 to i16
139-
// CHECK-NEXT: store i16 %2,
134+
// CHECK: %0 = tail call{{.*}} i16 @llvm.amdgcn.update.dpp.i16(i16 poison, i16 %x, i32 257, i32 15, i32 15, i1 false)
135+
// CHECK-NEXT: store i16 %0,
140136
void test_mov_dpp_short(short x, global short *p) {
141137
*p = __builtin_amdgcn_mov_dpp(x, 0x101, 0xf, 0xf, 0);
142138
}
143139

144140
// CHECK-LABEL: @test_mov_dpp_char
145-
// CHECK: %0 = zext i8 %x to i32
146-
// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 %0, i32 257, i32 15, i32 15, i1 false)
147-
// CHECK-NEXT: %2 = trunc i32 %1 to i8
148-
// CHECK-NEXT: store i8 %2,
141+
// CHECK: %0 = tail call{{.*}} i8 @llvm.amdgcn.update.dpp.i8(i8 poison, i8 %x, i32 257, i32 15, i32 15, i1 false)
142+
// CHECK-NEXT: store i8 %0,
149143
void test_mov_dpp_char(char x, global char *p) {
150144
*p = __builtin_amdgcn_mov_dpp(x, 0x101, 0xf, 0xf, 0);
151145
}
152146

153147
// CHECK-LABEL: @test_mov_dpp_half
154-
// CHECK: %0 = load i16,
155-
// CHECK: %1 = zext i16 %0 to i32
156-
// CHECK-NEXT: %2 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 %1, i32 257, i32 15, i32 15, i1 false)
157-
// CHECK-NEXT: %3 = trunc i32 %2 to i16
158-
// CHECK-NEXT: store i16 %3,
148+
// CHECK: %0 = load half,
149+
// CHECK-NEXT: %1 = tail call{{.*}} half @llvm.amdgcn.update.dpp.f16(half poison, half %0, i32 257, i32 15, i32 15, i1 false)
150+
// CHECK-NEXT: store half %1,
159151
void test_mov_dpp_half(half *x, global half *p) {
160152
*p = __builtin_amdgcn_mov_dpp(*x, 0x101, 0xf, 0xf, 0);
161153
}
@@ -175,45 +167,37 @@ void test_update_dpp_long(long x, global long *p) {
175167
}
176168

177169
// CHECK-LABEL: @test_update_dpp_float
178-
// CHECK: %0 = bitcast float %x to i32
179-
// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %0, i32 %0, i32 257, i32 15, i32 15, i1 false)
180-
// CHECK-NEXT: store i32 %1,
170+
// CHECK: %0 = tail call{{.*}} float @llvm.amdgcn.update.dpp.f32(float %x, float %x, i32 257, i32 15, i32 15, i1 false)
171+
// CHECK-NEXT: store float %0,
181172
void test_update_dpp_float(float x, global float *p) {
182173
*p = __builtin_amdgcn_update_dpp(x, x, 0x101, 0xf, 0xf, 0);
183174
}
184175

185176
// CHECK-LABEL: @test_update_dpp_double
186-
// CHECK: %0 = bitcast double %x to i64
187-
// CHECK-NEXT: %1 = tail call{{.*}} i64 @llvm.amdgcn.update.dpp.i64(i64 %0, i64 %0, i32 257, i32 15, i32 15, i1 false)
188-
// CHECK-NEXT: store i64 %1,
177+
// CHECK: %0 = tail call{{.*}} double @llvm.amdgcn.update.dpp.f64(double %x, double %x, i32 257, i32 15, i32 15, i1 false)
178+
// CHECK-NEXT: store double %0,
189179
void test_update_dpp_double(double x, global double *p) {
190180
*p = __builtin_amdgcn_update_dpp(x, x, 0x101, 0xf, 0xf, 0);
191181
}
192182

193183
// CHECK-LABEL: @test_update_dpp_short
194-
// CHECK: %0 = zext i16 %x to i32
195-
// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %0, i32 %0, i32 257, i32 15, i32 15, i1 false)
196-
// CHECK-NEXT: %2 = trunc i32 %1 to i16
197-
// CHECK-NEXT: store i16 %2,
184+
// CHECK: %0 = tail call{{.*}} i16 @llvm.amdgcn.update.dpp.i16(i16 %x, i16 %x, i32 257, i32 15, i32 15, i1 false)
185+
// CHECK-NEXT: store i16 %0,
198186
void test_update_dpp_short(short x, global short *p) {
199187
*p = __builtin_amdgcn_update_dpp(x, x, 0x101, 0xf, 0xf, 0);
200188
}
201189

202190
// CHECK-LABEL: @test_update_dpp_char
203-
// CHECK: %0 = zext i8 %x to i32
204-
// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %0, i32 %0, i32 257, i32 15, i32 15, i1 false)
205-
// CHECK-NEXT: %2 = trunc i32 %1 to i8
206-
// CHECK-NEXT: store i8 %2,
191+
// CHECK: %0 = tail call{{.*}} i8 @llvm.amdgcn.update.dpp.i8(i8 %x, i8 %x, i32 257, i32 15, i32 15, i1 false)
192+
// CHECK-NEXT: store i8 %0,
207193
void test_update_dpp_char(char x, global char *p) {
208194
*p = __builtin_amdgcn_update_dpp(x, x, 0x101, 0xf, 0xf, 0);
209195
}
210196

211197
// CHECK-LABEL: @test_update_dpp_half
212-
// CHECK: %0 = load i16,
213-
// CHECK: %1 = zext i16 %0 to i32
214-
// CHECK-NEXT: %2 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %1, i32 %1, i32 257, i32 15, i32 15, i1 false)
215-
// CHECK-NEXT: %3 = trunc i32 %2 to i16
216-
// CHECK-NEXT: store i16 %3,
198+
// CHECK: %0 = load half,
199+
// CHECK-NEXT: %1 = tail call{{.*}} half @llvm.amdgcn.update.dpp.f16(half %0, half %0, i32 257, i32 15, i32 15, i1 false)
200+
// CHECK-NEXT: store half %1,
217201
void test_update_dpp_half(half *x, global half *p) {
218202
*p = __builtin_amdgcn_update_dpp(*x, *x, 0x101, 0xf, 0xf, 0);
219203
}

0 commit comments

Comments
 (0)