[AMDGPU] Simplify dpp builtin handling #115090
Base branch: users/rampitec/11-04-_amdgpu_allow_lane-op_lowering_for_illegal_types

Conversation
Warning: This pull request is not mergeable via GitHub because a downstack PR is open. Once all requirements are satisfied, merge this PR as a stack on Graphite. This stack of pull requests is managed by Graphite.
@llvm/pr-subscribers-libcxx @llvm/pr-subscribers-clang @llvm/pr-subscribers-clang-codegen

Author: Stanislav Mekhanoshin (rampitec)

Changes

DPP intrinsics can handle any type now, so there is no need to cast to integer. The caveat is that the intrinsics only handle backend-legal types; i8, for example, does not work yet. A distilled before/after IR example follows the diff below.

Full diff: https://github.com/llvm/llvm-project/pull/115090.diff. 3 files affected:
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 82770a75af23e4..7e3e6463799fb6 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -19193,37 +19193,24 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
assert(Error == ASTContext::GE_None && "Should not codegen an error");
llvm::Type *DataTy = ConvertType(E->getArg(0)->getType());
- unsigned Size = DataTy->getPrimitiveSizeInBits();
- llvm::Type *IntTy =
- llvm::IntegerType::get(Builder.getContext(), std::max(Size, 32u));
Function *F =
CGM.getIntrinsic(BuiltinID == AMDGPU::BI__builtin_amdgcn_mov_dpp8
? Intrinsic::amdgcn_mov_dpp8
: Intrinsic::amdgcn_update_dpp,
- IntTy);
+ DataTy);
assert(E->getNumArgs() == 5 || E->getNumArgs() == 6 ||
E->getNumArgs() == 2);
bool InsertOld = BuiltinID == AMDGPU::BI__builtin_amdgcn_mov_dpp;
if (InsertOld)
- Args.push_back(llvm::PoisonValue::get(IntTy));
- for (unsigned I = 0; I != E->getNumArgs(); ++I) {
+ Args.push_back(llvm::PoisonValue::get(DataTy));
+ Args.push_back(EmitScalarOrConstFoldImmArg(ICEArguments, 0, E));
+ for (unsigned I = 1; I != E->getNumArgs(); ++I) {
llvm::Value *V = EmitScalarOrConstFoldImmArg(ICEArguments, I, E);
- if (I < (BuiltinID == AMDGPU::BI__builtin_amdgcn_update_dpp ? 2u : 1u) &&
- Size < 32) {
- if (!DataTy->isIntegerTy())
- V = Builder.CreateBitCast(
- V, llvm::IntegerType::get(Builder.getContext(), Size));
- V = Builder.CreateZExtOrBitCast(V, IntTy);
- }
llvm::Type *ExpTy =
F->getFunctionType()->getFunctionParamType(I + InsertOld);
Args.push_back(Builder.CreateTruncOrBitCast(V, ExpTy));
}
- Value *V = Builder.CreateCall(F, Args);
- if (Size < 32 && !DataTy->isIntegerTy())
- V = Builder.CreateTrunc(
- V, llvm::IntegerType::get(Builder.getContext(), Size));
- return Builder.CreateTruncOrBitCast(V, DataTy);
+ return Builder.CreateCall(F, Args);
}
case AMDGPU::BI__builtin_amdgcn_permlane16:
case AMDGPU::BI__builtin_amdgcn_permlanex16:
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl
index a4054cba236dd2..7e4ee6f4a942db 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl
@@ -36,45 +36,37 @@ void test_mov_dpp8_long(global long* out, long a) {
}
// CHECK-LABEL: @test_mov_dpp8_float(
-// CHECK: %0 = bitcast float %a to i32
-// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.mov.dpp8.i32(i32 %0, i32 1)
-// CHECK-NEXT: store i32 %1,
+// CHECK: %0 = tail call{{.*}} float @llvm.amdgcn.mov.dpp8.f32(float %a, i32 1)
+// CHECK-NEXT: store float %0,
void test_mov_dpp8_float(global float* out, float a) {
*out = __builtin_amdgcn_mov_dpp8(a, 1);
}
// CHECK-LABEL: @test_mov_dpp8_double
-// CHECK: %0 = bitcast double %x to i64
-// CHECK-NEXT: %1 = tail call{{.*}} i64 @llvm.amdgcn.mov.dpp8.i64(i64 %0, i32 1)
-// CHECK-NEXT: store i64 %1,
+// CHECK: %0 = tail call{{.*}} double @llvm.amdgcn.mov.dpp8.f64(double %x, i32 1)
+// CHECK-NEXT: store double %0,
void test_mov_dpp8_double(double x, global double *p) {
*p = __builtin_amdgcn_mov_dpp8(x, 1);
}
// CHECK-LABEL: @test_mov_dpp8_short
-// CHECK: %0 = zext i16 %x to i32
-// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.mov.dpp8.i32(i32 %0, i32 1)
-// CHECK-NEXT: %2 = trunc i32 %1 to i16
-// CHECK-NEXT: store i16 %2,
+// CHECK: %0 = tail call{{.*}} i16 @llvm.amdgcn.mov.dpp8.i16(i16 %x, i32 1)
+// CHECK-NEXT: store i16 %0,
void test_mov_dpp8_short(short x, global short *p) {
*p = __builtin_amdgcn_mov_dpp8(x, 1);
}
// CHECK-LABEL: @test_mov_dpp8_char
-// CHECK: %0 = zext i8 %x to i32
-// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.mov.dpp8.i32(i32 %0, i32 1)
-// CHECK-NEXT: %2 = trunc i32 %1 to i8
-// CHECK-NEXT: store i8 %2,
+// CHECK: %0 = tail call{{.*}} i8 @llvm.amdgcn.mov.dpp8.i8(i8 %x, i32 1)
+// CHECK-NEXT: store i8 %0,
void test_mov_dpp8_char(char x, global char *p) {
*p = __builtin_amdgcn_mov_dpp8(x, 1);
}
// CHECK-LABEL: @test_mov_dpp8_half
-// CHECK: %0 = load i16,
-// CHECK: %1 = zext i16 %0 to i32
-// CHECK-NEXT: %2 = tail call{{.*}} i32 @llvm.amdgcn.mov.dpp8.i32(i32 %1, i32 1)
-// CHECK-NEXT: %3 = trunc i32 %2 to i16
-// CHECK-NEXT: store i16 %3,
+// CHECK: %0 = load half,
+// CHECK-NEXT: %1 = tail call{{.*}} half @llvm.amdgcn.mov.dpp8.f16(half %0, i32 1)
+// CHECK-NEXT: store half %1,
void test_mov_dpp8_half(half *x, global half *p) {
*p = __builtin_amdgcn_mov_dpp8(*x, 1);
}
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
index 269f20e2f53fe1..0c5995be5e098a 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
@@ -117,45 +117,37 @@ void test_mov_dpp_long(long x, global long *p) {
}
// CHECK-LABEL: @test_mov_dpp_float
-// CHECK: %0 = bitcast float %x to i32
-// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 %0, i32 257, i32 15, i32 15, i1 false)
-// CHECK-NEXT: store i32 %1,
+// CHECK: %0 = tail call{{.*}} float @llvm.amdgcn.update.dpp.f32(float poison, float %x, i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: store float %0,
void test_mov_dpp_float(float x, global float *p) {
*p = __builtin_amdgcn_mov_dpp(x, 0x101, 0xf, 0xf, 0);
}
// CHECK-LABEL: @test_mov_dpp_double
-// CHECK: %0 = bitcast double %x to i64
-// CHECK-NEXT: %1 = tail call{{.*}} i64 @llvm.amdgcn.update.dpp.i64(i64 poison, i64 %0, i32 257, i32 15, i32 15, i1 false)
-// CHECK-NEXT: store i64 %1,
+// CHECK: %0 = tail call{{.*}} double @llvm.amdgcn.update.dpp.f64(double poison, double %x, i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: store double %0,
void test_mov_dpp_double(double x, global double *p) {
*p = __builtin_amdgcn_mov_dpp(x, 0x101, 0xf, 0xf, 0);
}
// CHECK-LABEL: @test_mov_dpp_short
-// CHECK: %0 = zext i16 %x to i32
-// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 %0, i32 257, i32 15, i32 15, i1 false)
-// CHECK-NEXT: %2 = trunc i32 %1 to i16
-// CHECK-NEXT: store i16 %2,
+// CHECK: %0 = tail call{{.*}} i16 @llvm.amdgcn.update.dpp.i16(i16 poison, i16 %x, i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: store i16 %0,
void test_mov_dpp_short(short x, global short *p) {
*p = __builtin_amdgcn_mov_dpp(x, 0x101, 0xf, 0xf, 0);
}
// CHECK-LABEL: @test_mov_dpp_char
-// CHECK: %0 = zext i8 %x to i32
-// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 %0, i32 257, i32 15, i32 15, i1 false)
-// CHECK-NEXT: %2 = trunc i32 %1 to i8
-// CHECK-NEXT: store i8 %2,
+// CHECK: %0 = tail call{{.*}} i8 @llvm.amdgcn.update.dpp.i8(i8 poison, i8 %x, i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: store i8 %0,
void test_mov_dpp_char(char x, global char *p) {
*p = __builtin_amdgcn_mov_dpp(x, 0x101, 0xf, 0xf, 0);
}
// CHECK-LABEL: @test_mov_dpp_half
-// CHECK: %0 = load i16,
-// CHECK: %1 = zext i16 %0 to i32
-// CHECK-NEXT: %2 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 %1, i32 257, i32 15, i32 15, i1 false)
-// CHECK-NEXT: %3 = trunc i32 %2 to i16
-// CHECK-NEXT: store i16 %3,
+// CHECK: %0 = load half,
+// CHECK-NEXT: %1 = tail call{{.*}} half @llvm.amdgcn.update.dpp.f16(half poison, half %0, i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: store half %1,
void test_mov_dpp_half(half *x, global half *p) {
*p = __builtin_amdgcn_mov_dpp(*x, 0x101, 0xf, 0xf, 0);
}
@@ -175,45 +167,37 @@ void test_update_dpp_long(long x, global long *p) {
}
// CHECK-LABEL: @test_update_dpp_float
-// CHECK: %0 = bitcast float %x to i32
-// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %0, i32 %0, i32 257, i32 15, i32 15, i1 false)
-// CHECK-NEXT: store i32 %1,
+// CHECK: %0 = tail call{{.*}} float @llvm.amdgcn.update.dpp.f32(float %x, float %x, i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: store float %0,
void test_update_dpp_float(float x, global float *p) {
*p = __builtin_amdgcn_update_dpp(x, x, 0x101, 0xf, 0xf, 0);
}
// CHECK-LABEL: @test_update_dpp_double
-// CHECK: %0 = bitcast double %x to i64
-// CHECK-NEXT: %1 = tail call{{.*}} i64 @llvm.amdgcn.update.dpp.i64(i64 %0, i64 %0, i32 257, i32 15, i32 15, i1 false)
-// CHECK-NEXT: store i64 %1,
+// CHECK: %0 = tail call{{.*}} double @llvm.amdgcn.update.dpp.f64(double %x, double %x, i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: store double %0,
void test_update_dpp_double(double x, global double *p) {
*p = __builtin_amdgcn_update_dpp(x, x, 0x101, 0xf, 0xf, 0);
}
// CHECK-LABEL: @test_update_dpp_short
-// CHECK: %0 = zext i16 %x to i32
-// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %0, i32 %0, i32 257, i32 15, i32 15, i1 false)
-// CHECK-NEXT: %2 = trunc i32 %1 to i16
-// CHECK-NEXT: store i16 %2,
+// CHECK: %0 = tail call{{.*}} i16 @llvm.amdgcn.update.dpp.i16(i16 %x, i16 %x, i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: store i16 %0,
void test_update_dpp_short(short x, global short *p) {
*p = __builtin_amdgcn_update_dpp(x, x, 0x101, 0xf, 0xf, 0);
}
// CHECK-LABEL: @test_update_dpp_char
-// CHECK: %0 = zext i8 %x to i32
-// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %0, i32 %0, i32 257, i32 15, i32 15, i1 false)
-// CHECK-NEXT: %2 = trunc i32 %1 to i8
-// CHECK-NEXT: store i8 %2,
+// CHECK: %0 = tail call{{.*}} i8 @llvm.amdgcn.update.dpp.i8(i8 %x, i8 %x, i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: store i8 %0,
void test_update_dpp_char(char x, global char *p) {
*p = __builtin_amdgcn_update_dpp(x, x, 0x101, 0xf, 0xf, 0);
}
// CHECK-LABEL: @test_update_dpp_half
-// CHECK: %0 = load i16,
-// CHECK: %1 = zext i16 %0 to i32
-// CHECK-NEXT: %2 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %1, i32 %1, i32 257, i32 15, i32 15, i1 false)
-// CHECK-NEXT: %3 = trunc i32 %2 to i16
-// CHECK-NEXT: store i16 %3,
+// CHECK: %0 = load half,
+// CHECK-NEXT: %1 = tail call{{.*}} half @llvm.amdgcn.update.dpp.f16(half %0, half %0, i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: store half %1,
void test_update_dpp_half(half *x, global half *p) {
*p = __builtin_amdgcn_update_dpp(*x, *x, 0x101, 0xf, 0xf, 0);
}
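In short, for a float argument the emitted IR changes as follows (distilled from the updated gfx10 test above; store operands are elided):

; before this patch: round-trip through i32
%0 = bitcast float %a to i32
%1 = tail call i32 @llvm.amdgcn.mov.dpp8.i32(i32 %0, i32 1)
store i32 %1, ...
; after this patch: the intrinsic is instantiated directly at the source type
%0 = tail call float @llvm.amdgcn.mov.dpp8.f32(float %a, i32 1)
store float %0, ...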
arsenm left a comment:

Should also teach instcombine to fold bitcast + dpp.
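A sketch of the kind of fold being suggested, for IR that still does the integer round-trip by hand (illustrative only; such a fold is not part of this PR):

; before the fold: bitcast to i32, dpp on the integer type, bitcast back
%b = bitcast float %x to i32
%d = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 %b, i32 257, i32 15, i32 15, i1 false)
%r = bitcast i32 %d to float
; after the fold: dpp directly on the original type
%r = tail call float @llvm.amdgcn.update.dpp.f32(float poison, float %x, i32 257, i32 15, i32 15, i1 false)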
It still needs the downstack change #114887 to handle i8.
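Concretely, with this patch alone a char argument now produces an i8 intrinsic call (per the updated test above), which the backend cannot select until #114887 lands:

%0 = tail call i8 @llvm.amdgcn.mov.dpp8.i8(i8 %x, i32 1)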