[clang] Add elementwise fshl/fshr builtins (#153113)

ckoparkar · web-flow · commit c3bf73bc4ade · 2025-08-12T20:57:55.000+09:00
This patch implements the `__builtin_elementwise_fshl` and `__builtin_elementwise_fshr` builtins. They map to the LLVM funnel-shift intrinsics `llvm.fshl` and `llvm.fshr`, described in the LangRef at <https://llvm.org/docs/LangRef.html#llvm-fshl-intrinsic> and <https://llvm.org/docs/LangRef.html#llvm-fshr-intrinsic>. Fixes #152555.
diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
@@ -860,6 +860,15 @@ of different sizes and signs is forbidden in binary and ternary builtins.
                                                 semantics, see `LangRef
                                                 <http://llvm.org/docs/LangRef.html#llvm-min-intrinsics-comparation>`_
                                                 for the comparison.
+T __builtin_elementwise_fshl(T x, T y, T z)     perform a funnel shift left. Concatenate x and y (x is the most        integer types
+                                                significant bits of the wide value), the combined value is shifted
+                                                left by z (modulo the bit width), and the most significant bits
+                                                are extracted to produce a result the same size as the arguments.
+
+T __builtin_elementwise_fshr(T x, T y, T z)     perform a funnel shift right. Concatenate x and y (x is the most       integer types
+                                                significant bits of the wide value), the combined value is shifted
+                                                right by z (modulo the bit width), and the least significant bits
+                                                are extracted to produce a result the same size as the arguments.
 ============================================== ====================================================================== =========================================
 
 
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
@@ -113,6 +113,8 @@ C23 Feature Support
 
 Non-comprehensive list of changes in this release
 -------------------------------------------------
+- Added ``__builtin_elementwise_fshl`` and ``__builtin_elementwise_fshr``.
+
 - Added ``__builtin_elementwise_minnumnum`` and ``__builtin_elementwise_maxnumnum``.
 
 - Trapping UBSan (e.g. ``-fsanitize-trap=undefined``) now emits a string describing the reason for
diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td
@@ -1514,6 +1514,18 @@ def ElementwiseSubSat : Builtin {
   let Prototype = "void(...)";
 }
 
+def ElementwiseFshl : Builtin { // Elementwise funnel shift left.
+  let Spellings = ["__builtin_elementwise_fshl"];
+  let Attributes = [NoThrow, Const, CustomTypeChecking];
+  let Prototype = "void(...)"; // Operand types are checked in Sema.
+}
+
+def ElementwiseFshr : Builtin { // Elementwise funnel shift right.
+  let Spellings = ["__builtin_elementwise_fshr"];
+  let Attributes = [NoThrow, Const, CustomTypeChecking];
+  let Prototype = "void(...)"; // Operand types are checked in Sema.
+}
+
 def ReduceMax : Builtin {
   let Spellings = ["__builtin_reduce_max"];
   let Attributes = [NoThrow, Const, CustomTypeChecking, Constexpr];
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -4030,6 +4030,13 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
   case Builtin::BI__builtin_elementwise_fma:
     return RValue::get(
         emitBuiltinWithOneOverloadedType<3>(*this, E, Intrinsic::fma));
+  case Builtin::BI__builtin_elementwise_fshl:
+    return RValue::get(
+        emitBuiltinWithOneOverloadedType<3>(*this, E, Intrinsic::fshl));
+  case Builtin::BI__builtin_elementwise_fshr:
+    return RValue::get(
+        emitBuiltinWithOneOverloadedType<3>(*this, E, Intrinsic::fshr));
+
   case Builtin::BI__builtin_elementwise_add_sat:
   case Builtin::BI__builtin_elementwise_sub_sat: {
     Value *Op0 = EmitScalarExpr(E->getArg(0));
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
@@ -3031,6 +3031,12 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
                                EltwiseBuiltinArgTyRestriction::IntegerTy))
       return ExprError();
     break;
+  case Builtin::BI__builtin_elementwise_fshl:
+  case Builtin::BI__builtin_elementwise_fshr:
+    if (BuiltinElementwiseTernaryMath(
+            TheCall, EltwiseBuiltinArgTyRestriction::IntegerTy))
+      return ExprError();
+    break;
   case Builtin::BI__builtin_elementwise_min:
   case Builtin::BI__builtin_elementwise_max:
     if (BuiltinElementwiseMath(TheCall))
diff --git a/clang/test/CodeGen/builtins-elementwise-math.c b/clang/test/CodeGen/builtins-elementwise-math.c
@@ -1179,3 +1179,89 @@ void test_builtin_elementwise_fma(float f32, double f64,
   half2 tmp2_v2f16 = __builtin_elementwise_fma(v2f16, v2f16, (half2)4.0);
 
 }
+
+void test_builtin_elementwise_fshl(long long int i1, long long int i2,
+                                   long long int i3, unsigned short us1,
+                                   unsigned short us2, unsigned short us3,
+                                   char c1, char c2, char c3,
+                                   unsigned char uc1, unsigned char uc2,
+                                   unsigned char uc3, si8 vi1, si8 vi2,
+                                   si8 vi3, u4 vu1, u4 vu2, u4 vu3) {
+  // CHECK:      [[I1:%.+]] = load i64, ptr %i1.addr
+  // CHECK-NEXT: [[I2:%.+]] = load i64, ptr %i2.addr
+  // CHECK-NEXT: [[I3:%.+]] = load i64, ptr %i3.addr
+  // CHECK-NEXT: [[I4:%.+]] = call i64 @llvm.fshl.i64(i64 [[I1]], i64 [[I2]], i64 [[I3]])
+  // CHECK-NEXT:              store i64 [[I4]], ptr %tmp_lli_l
+  // CHECK-NEXT: [[I5:%.+]] = load i64, ptr %i1.addr
+  // CHECK-NEXT: [[I6:%.+]] = load i64, ptr %i2.addr
+  // CHECK-NEXT: [[I7:%.+]] = load i64, ptr %i3.addr
+  // CHECK-NEXT: [[I8:%.+]] = call i64 @llvm.fshr.i64(i64 [[I5]], i64 [[I6]], i64 [[I7]])
+  // CHECK-NEXT:              store i64 [[I8]], ptr %tmp_lli_r
+  long long int tmp_lli_l = __builtin_elementwise_fshl(i1, i2, i3);
+  long long int tmp_lli_r = __builtin_elementwise_fshr(i1, i2, i3);
+
+  // CHECK:      [[US1:%.+]] = load i16, ptr %us1.addr
+  // CHECK-NEXT: [[US2:%.+]] = load i16, ptr %us2.addr
+  // CHECK-NEXT: [[US3:%.+]] = load i16, ptr %us3.addr
+  // CHECK-NEXT: [[US4:%.+]] = call i16 @llvm.fshl.i16(i16 [[US1]], i16 [[US2]], i16 [[US3]])
+  // CHECK-NEXT:               store i16 [[US4]], ptr %tmp_usi_l
+  // CHECK-NEXT: [[US5:%.+]] = load i16, ptr %us1.addr
+  // CHECK-NEXT: [[US6:%.+]] = load i16, ptr %us2.addr
+  // CHECK-NEXT: [[US7:%.+]] = load i16, ptr %us3.addr
+  // CHECK-NEXT: [[US8:%.+]] = call i16 @llvm.fshr.i16(i16 [[US5]], i16 [[US6]], i16 [[US7]])
+  // CHECK-NEXT:               store i16 [[US8]], ptr %tmp_usi_r
+  unsigned short tmp_usi_l = __builtin_elementwise_fshl(us1, us2, us3);
+  unsigned short tmp_usi_r = __builtin_elementwise_fshr(us1, us2, us3);
+
+  // CHECK:      [[C1:%.+]] = load i8, ptr %c1.addr
+  // CHECK-NEXT: [[C2:%.+]] = load i8, ptr %c2.addr
+  // CHECK-NEXT: [[C3:%.+]] = load i8, ptr %c3.addr
+  // CHECK-NEXT: [[C4:%.+]] = call i8 @llvm.fshl.i8(i8 [[C1]], i8 [[C2]], i8 [[C3]])
+  // CHECK-NEXT:              store i8 [[C4]], ptr %tmp_c_l
+  // CHECK-NEXT: [[C5:%.+]] = load i8, ptr %c1.addr
+  // CHECK-NEXT: [[C6:%.+]] = load i8, ptr %c2.addr
+  // CHECK-NEXT: [[C7:%.+]] = load i8, ptr %c3.addr
+  // CHECK-NEXT: [[C8:%.+]] = call i8 @llvm.fshr.i8(i8 [[C5]], i8 [[C6]], i8 [[C7]])
+  // CHECK-NEXT:              store i8 [[C8]], ptr %tmp_c_r
+  char tmp_c_l = __builtin_elementwise_fshl(c1, c2, c3);
+  char tmp_c_r = __builtin_elementwise_fshr(c1, c2, c3);
+
+  // CHECK:      [[UC1:%.+]] = load i8, ptr %uc1.addr
+  // CHECK-NEXT: [[UC2:%.+]] = load i8, ptr %uc2.addr
+  // CHECK-NEXT: [[UC3:%.+]] = load i8, ptr %uc3.addr
+  // CHECK-NEXT: [[UC4:%.+]] = call i8 @llvm.fshl.i8(i8 [[UC1]], i8 [[UC2]], i8 [[UC3]])
+  // CHECK-NEXT:               store i8 [[UC4]], ptr %tmp_uc_l
+  // CHECK-NEXT: [[UC5:%.+]] = load i8, ptr %uc1.addr
+  // CHECK-NEXT: [[UC6:%.+]] = load i8, ptr %uc2.addr
+  // CHECK-NEXT: [[UC7:%.+]] = load i8, ptr %uc3.addr
+  // CHECK-NEXT: [[UC8:%.+]] = call i8 @llvm.fshr.i8(i8 [[UC5]], i8 [[UC6]], i8 [[UC7]])
+  // CHECK-NEXT:               store i8 [[UC8]], ptr %tmp_uc_r
+  unsigned char tmp_uc_l = __builtin_elementwise_fshl(uc1, uc2, uc3);
+  unsigned char tmp_uc_r = __builtin_elementwise_fshr(uc1, uc2, uc3);
+
+  // CHECK:      [[VI1:%.+]] = load <8 x i16>, ptr %vi1.addr
+  // CHECK-NEXT: [[VI2:%.+]] = load <8 x i16>, ptr %vi2.addr
+  // CHECK-NEXT: [[VI3:%.+]] = load <8 x i16>, ptr %vi3.addr
+  // CHECK-NEXT: [[VI4:%.+]] = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> [[VI1]], <8 x i16> [[VI2]], <8 x i16> [[VI3]])
+  // CHECK-NEXT:               store <8 x i16> [[VI4]], ptr %tmp_vi_l
+  // CHECK-NEXT: [[VI5:%.+]] = load <8 x i16>, ptr %vi1.addr
+  // CHECK-NEXT: [[VI6:%.+]] = load <8 x i16>, ptr %vi2.addr
+  // CHECK-NEXT: [[VI7:%.+]] = load <8 x i16>, ptr %vi3.addr
+  // CHECK-NEXT: [[VI8:%.+]] = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> [[VI5]], <8 x i16> [[VI6]], <8 x i16> [[VI7]])
+  // CHECK-NEXT:               store <8 x i16> [[VI8]], ptr %tmp_vi_r
+  si8 tmp_vi_l = __builtin_elementwise_fshl(vi1, vi2, vi3);
+  si8 tmp_vi_r = __builtin_elementwise_fshr(vi1, vi2, vi3);
+
+  // CHECK:      [[VU1:%.+]] = load <4 x i32>, ptr %vu1.addr
+  // CHECK-NEXT: [[VU2:%.+]] = load <4 x i32>, ptr %vu2.addr
+  // CHECK-NEXT: [[VU3:%.+]] = load <4 x i32>, ptr %vu3.addr
+  // CHECK-NEXT: [[VU4:%.+]] = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> [[VU1]], <4 x i32> [[VU2]], <4 x i32> [[VU3]])
+  // CHECK-NEXT:               store <4 x i32> [[VU4]], ptr %tmp_vu_l
+  // CHECK-NEXT: [[VU5:%.+]] = load <4 x i32>, ptr %vu1.addr
+  // CHECK-NEXT: [[VU6:%.+]] = load <4 x i32>, ptr %vu2.addr
+  // CHECK-NEXT: [[VU7:%.+]] = load <4 x i32>, ptr %vu3.addr
+  // CHECK-NEXT: [[VU8:%.+]] = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> [[VU5]], <4 x i32> [[VU6]], <4 x i32> [[VU7]])
+  // CHECK-NEXT:               store <4 x i32> [[VU8]], ptr %tmp_vu_r
+  u4 tmp_vu_l = __builtin_elementwise_fshl(vu1, vu2, vu3);
+  u4 tmp_vu_r = __builtin_elementwise_fshr(vu1, vu2, vu3);
+}
diff --git a/clang/test/Sema/builtins-elementwise-math.c b/clang/test/Sema/builtins-elementwise-math.c
@@ -1294,6 +1294,42 @@ void test_builtin_elementwise_fma(int i32, int2 v2i32, short i16,
   // expected-error@-1 {{3rd argument must be a scalar or vector of floating-point types (was '_Complex float')}}
 }
 
+void test_builtin_elementwise_fsh(int i32, int2 v2i32, short i16, int3 v3i32,
+                                  double f64, float f32, float2 v2f32) {
+    i32 = __builtin_elementwise_fshl();
+    // expected-error@-1 {{too few arguments to function call, expected 3, have 0}}
+
+    i32 = __builtin_elementwise_fshr();
+    // expected-error@-1 {{too few arguments to function call, expected 3, have 0}}
+
+    i32 = __builtin_elementwise_fshl(i32, i32);
+    // expected-error@-1 {{too few arguments to function call, expected 3, have 2}}
+
+    i32 = __builtin_elementwise_fshr(i32, i32);
+    // expected-error@-1 {{too few arguments to function call, expected 3, have 2}}
+
+    i32 = __builtin_elementwise_fshl(i32, i32, i16);
+    // expected-error@-1 {{arguments are of different types ('int' vs 'short')}}
+
+    i16 = __builtin_elementwise_fshr(i16, i32, i16);
+    // expected-error@-1 {{arguments are of different types ('short' vs 'int')}}
+
+    f32 = __builtin_elementwise_fshl(f32, f32, f32);
+    // expected-error@-1 {{argument must be a scalar or vector of integer types (was 'float')}}
+
+    f64 = __builtin_elementwise_fshr(f64, f64, f64);
+    // expected-error@-1 {{argument must be a scalar or vector of integer types (was 'double')}}
+
+    v2i32 = __builtin_elementwise_fshl(v2i32, v2i32, v2f32);
+    // expected-error@-1 {{argument must be a scalar or vector of integer types (was 'float2' (vector of 2 'float' values))}}
+
+    v2i32 = __builtin_elementwise_fshr(v2i32, v2i32, v3i32);
+    // expected-error@-1 {{arguments are of different types ('int2' (vector of 2 'int' values) vs 'int3' (vector of 3 'int' values))}}
+
+    v3i32 = __builtin_elementwise_fshl(v3i32, v3i32, v2i32);
+    // expected-error@-1 {{arguments are of different types ('int3' (vector of 3 'int' values) vs 'int2' (vector of 2 'int' values))}}
+}
+
 typedef struct {
   float3 b;
 } struct_float3;