[AMDGPU] Simplify dpp builtin handling

rampitec · rampitec · commit ce7f572206eb · 2024-11-05T15:38:35.000-08:00
DPP intrinsics can handle any type now, so no need to cast to
integer.

The caveat is that intrinsics only handle backend legal types,
but it does not work with i8 for example.
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -19193,37 +19193,24 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
     getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
     assert(Error == ASTContext::GE_None && "Should not codegen an error");
     llvm::Type *DataTy = ConvertType(E->getArg(0)->getType());
-    unsigned Size = DataTy->getPrimitiveSizeInBits();
-    llvm::Type *IntTy =
-        llvm::IntegerType::get(Builder.getContext(), std::max(Size, 32u));
     Function *F =
         CGM.getIntrinsic(BuiltinID == AMDGPU::BI__builtin_amdgcn_mov_dpp8
                              ? Intrinsic::amdgcn_mov_dpp8
                              : Intrinsic::amdgcn_update_dpp,
-                         IntTy);
+                         DataTy);
     assert(E->getNumArgs() == 5 || E->getNumArgs() == 6 ||
            E->getNumArgs() == 2);
     bool InsertOld = BuiltinID == AMDGPU::BI__builtin_amdgcn_mov_dpp;
     if (InsertOld)
-      Args.push_back(llvm::PoisonValue::get(IntTy));
-    for (unsigned I = 0; I != E->getNumArgs(); ++I) {
+      Args.push_back(llvm::PoisonValue::get(DataTy));
+    Args.push_back(EmitScalarOrConstFoldImmArg(ICEArguments, 0, E));
+    for (unsigned I = 1; I != E->getNumArgs(); ++I) {
       llvm::Value *V = EmitScalarOrConstFoldImmArg(ICEArguments, I, E);
-      if (I < (BuiltinID == AMDGPU::BI__builtin_amdgcn_update_dpp ? 2u : 1u) &&
-          Size < 32) {
-        if (!DataTy->isIntegerTy())
-          V = Builder.CreateBitCast(
-              V, llvm::IntegerType::get(Builder.getContext(), Size));
-        V = Builder.CreateZExtOrBitCast(V, IntTy);
-      }
       llvm::Type *ExpTy =
           F->getFunctionType()->getFunctionParamType(I + InsertOld);
       Args.push_back(Builder.CreateTruncOrBitCast(V, ExpTy));
     }
-    Value *V = Builder.CreateCall(F, Args);
-    if (Size < 32 && !DataTy->isIntegerTy())
-      V = Builder.CreateTrunc(
-          V, llvm::IntegerType::get(Builder.getContext(), Size));
-    return Builder.CreateTruncOrBitCast(V, DataTy);
+    return Builder.CreateCall(F, Args);
   }
   case AMDGPU::BI__builtin_amdgcn_permlane16:
   case AMDGPU::BI__builtin_amdgcn_permlanex16:
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl
@@ -36,45 +36,37 @@ void test_mov_dpp8_long(global long* out, long a) {
 }
 
 // CHECK-LABEL: @test_mov_dpp8_float(
-// CHECK:      %0 = bitcast float %a to i32
-// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.mov.dpp8.i32(i32 %0, i32 1)
-// CHECK-NEXT: store i32 %1,
+// CHECK:      %0 = tail call{{.*}} float @llvm.amdgcn.mov.dpp8.f32(float %a, i32 1)
+// CHECK-NEXT: store float %0,
 void test_mov_dpp8_float(global float* out, float a) {
   *out = __builtin_amdgcn_mov_dpp8(a, 1);
 }
 
 // CHECK-LABEL: @test_mov_dpp8_double
-// CHECK:      %0 = bitcast double %x to i64
-// CHECK-NEXT: %1 = tail call{{.*}} i64 @llvm.amdgcn.mov.dpp8.i64(i64 %0, i32 1)
-// CHECK-NEXT: store i64 %1,
+// CHECK:      %0 = tail call{{.*}} double @llvm.amdgcn.mov.dpp8.f64(double %x, i32 1)
+// CHECK-NEXT: store double %0,
 void test_mov_dpp8_double(double x, global double *p) {
   *p = __builtin_amdgcn_mov_dpp8(x, 1);
 }
 
 // CHECK-LABEL: @test_mov_dpp8_short
-// CHECK:      %0 = zext i16 %x to i32
-// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.mov.dpp8.i32(i32 %0, i32 1)
-// CHECK-NEXT: %2 = trunc i32 %1 to i16
-// CHECK-NEXT: store i16 %2,
+// CHECK:      %0 = tail call{{.*}} i16 @llvm.amdgcn.mov.dpp8.i16(i16 %x, i32 1)
+// CHECK-NEXT: store i16 %0,
 void test_mov_dpp8_short(short x, global short *p) {
   *p = __builtin_amdgcn_mov_dpp8(x, 1);
 }
 
 // CHECK-LABEL: @test_mov_dpp8_char
-// CHECK:      %0 = zext i8 %x to i32
-// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.mov.dpp8.i32(i32 %0, i32 1)
-// CHECK-NEXT: %2 = trunc i32 %1 to i8
-// CHECK-NEXT: store i8 %2,
+// CHECK:      %0 = tail call{{.*}} i8 @llvm.amdgcn.mov.dpp8.i8(i8 %x, i32 1)
+// CHECK-NEXT: store i8 %0,
 void test_mov_dpp8_char(char x, global char *p) {
   *p = __builtin_amdgcn_mov_dpp8(x, 1);
 }
 
 // CHECK-LABEL: @test_mov_dpp8_half
-// CHECK:      %0 = load i16,
-// CHECK:      %1 = zext i16 %0 to i32
-// CHECK-NEXT: %2 = tail call{{.*}} i32 @llvm.amdgcn.mov.dpp8.i32(i32 %1, i32 1)
-// CHECK-NEXT: %3 = trunc i32 %2 to i16
-// CHECK-NEXT: store i16 %3,
+// CHECK:      %0 = load half,
+// CHECK-NEXT: %1 = tail call{{.*}} half @llvm.amdgcn.mov.dpp8.f16(half %0, i32 1)
+// CHECK-NEXT: store half %1,
 void test_mov_dpp8_half(half *x, global half *p) {
   *p = __builtin_amdgcn_mov_dpp8(*x, 1);
 }
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
@@ -117,45 +117,37 @@ void test_mov_dpp_long(long x, global long *p) {
 }
 
 // CHECK-LABEL: @test_mov_dpp_float
-// CHECK:      %0 = bitcast float %x to i32
-// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 %0, i32 257, i32 15, i32 15, i1 false)
-// CHECK-NEXT: store i32 %1,
+// CHECK:      %0 = tail call{{.*}} float @llvm.amdgcn.update.dpp.f32(float poison, float %x, i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: store float %0,
 void test_mov_dpp_float(float x, global float *p) {
   *p = __builtin_amdgcn_mov_dpp(x, 0x101, 0xf, 0xf, 0);
 }
 
 // CHECK-LABEL: @test_mov_dpp_double
-// CHECK:      %0 = bitcast double %x to i64
-// CHECK-NEXT: %1 = tail call{{.*}} i64 @llvm.amdgcn.update.dpp.i64(i64 poison, i64 %0, i32 257, i32 15, i32 15, i1 false)
-// CHECK-NEXT: store i64 %1,
+// CHECK:      %0 = tail call{{.*}} double @llvm.amdgcn.update.dpp.f64(double poison, double %x, i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: store double %0,
 void test_mov_dpp_double(double x, global double *p) {
   *p = __builtin_amdgcn_mov_dpp(x, 0x101, 0xf, 0xf, 0);
 }
 
 // CHECK-LABEL: @test_mov_dpp_short
-// CHECK:      %0 = zext i16 %x to i32
-// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 %0, i32 257, i32 15, i32 15, i1 false)
-// CHECK-NEXT: %2 = trunc i32 %1 to i16
-// CHECK-NEXT: store i16 %2,
+// CHECK:      %0 = tail call{{.*}} i16 @llvm.amdgcn.update.dpp.i16(i16 poison, i16 %x, i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: store i16 %0,
 void test_mov_dpp_short(short x, global short *p) {
   *p = __builtin_amdgcn_mov_dpp(x, 0x101, 0xf, 0xf, 0);
 }
 
 // CHECK-LABEL: @test_mov_dpp_char
-// CHECK:      %0 = zext i8 %x to i32
-// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 %0, i32 257, i32 15, i32 15, i1 false)
-// CHECK-NEXT: %2 = trunc i32 %1 to i8
-// CHECK-NEXT: store i8 %2,
+// CHECK:      %0 = tail call{{.*}} i8 @llvm.amdgcn.update.dpp.i8(i8 poison, i8 %x, i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: store i8 %0,
 void test_mov_dpp_char(char x, global char *p) {
   *p = __builtin_amdgcn_mov_dpp(x, 0x101, 0xf, 0xf, 0);
 }
 
 // CHECK-LABEL: @test_mov_dpp_half
-// CHECK:      %0 = load i16,
-// CHECK:      %1 = zext i16 %0 to i32
-// CHECK-NEXT: %2 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 %1, i32 257, i32 15, i32 15, i1 false)
-// CHECK-NEXT: %3 = trunc i32 %2 to i16
-// CHECK-NEXT: store i16 %3,
+// CHECK:      %0 = load half,
+// CHECK-NEXT: %1 = tail call{{.*}} half @llvm.amdgcn.update.dpp.f16(half poison, half %0, i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: store half %1,
 void test_mov_dpp_half(half *x, global half *p) {
   *p = __builtin_amdgcn_mov_dpp(*x, 0x101, 0xf, 0xf, 0);
 }
@@ -175,45 +167,37 @@ void test_update_dpp_long(long x, global long *p) {
 }
 
 // CHECK-LABEL: @test_update_dpp_float
-// CHECK:      %0 = bitcast float %x to i32
-// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %0, i32 %0, i32 257, i32 15, i32 15, i1 false)
-// CHECK-NEXT: store i32 %1,
+// CHECK:      %0 = tail call{{.*}} float @llvm.amdgcn.update.dpp.f32(float %x, float %x, i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: store float %0,
 void test_update_dpp_float(float x, global float *p) {
   *p = __builtin_amdgcn_update_dpp(x, x, 0x101, 0xf, 0xf, 0);
 }
 
 // CHECK-LABEL: @test_update_dpp_double
-// CHECK:      %0 = bitcast double %x to i64
-// CHECK-NEXT: %1 = tail call{{.*}} i64 @llvm.amdgcn.update.dpp.i64(i64 %0, i64 %0, i32 257, i32 15, i32 15, i1 false)
-// CHECK-NEXT: store i64 %1,
+// CHECK:      %0 = tail call{{.*}} double @llvm.amdgcn.update.dpp.f64(double %x, double %x, i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: store double %0,
 void test_update_dpp_double(double x, global double *p) {
   *p = __builtin_amdgcn_update_dpp(x, x, 0x101, 0xf, 0xf, 0);
 }
 
 // CHECK-LABEL: @test_update_dpp_short
-// CHECK:      %0 = zext i16 %x to i32
-// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %0, i32 %0, i32 257, i32 15, i32 15, i1 false)
-// CHECK-NEXT: %2 = trunc i32 %1 to i16
-// CHECK-NEXT: store i16 %2,
+// CHECK:      %0 = tail call{{.*}} i16 @llvm.amdgcn.update.dpp.i16(i16 %x, i16 %x, i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: store i16 %0,
 void test_update_dpp_short(short x, global short *p) {
   *p = __builtin_amdgcn_update_dpp(x, x, 0x101, 0xf, 0xf, 0);
 }
 
 // CHECK-LABEL: @test_update_dpp_char
-// CHECK:      %0 = zext i8 %x to i32
-// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %0, i32 %0, i32 257, i32 15, i32 15, i1 false)
-// CHECK-NEXT: %2 = trunc i32 %1 to i8
-// CHECK-NEXT: store i8 %2,
+// CHECK:      %0 = tail call{{.*}} i8 @llvm.amdgcn.update.dpp.i8(i8 %x, i8 %x, i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: store i8 %0,
 void test_update_dpp_char(char x, global char *p) {
   *p = __builtin_amdgcn_update_dpp(x, x, 0x101, 0xf, 0xf, 0);
 }
 
 // CHECK-LABEL: @test_update_dpp_half
-// CHECK:      %0 = load i16,
-// CHECK:      %1 = zext i16 %0 to i32
-// CHECK-NEXT: %2 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %1, i32 %1, i32 257, i32 15, i32 15, i1 false)
-// CHECK-NEXT: %3 = trunc i32 %2 to i16
-// CHECK-NEXT: store i16 %3,
+// CHECK:      %0 = load half,
+// CHECK-NEXT: %1 = tail call{{.*}} half @llvm.amdgcn.update.dpp.f16(half %0, half %0, i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: store half %1,
 void test_update_dpp_half(half *x, global half *p) {
   *p = __builtin_amdgcn_update_dpp(*x, *x, 0x101, 0xf, 0xf, 0);
 }