llvm · akadutta · Oct 27, 2025 · Oct 28, 2025 · Oct 29, 2025 · Nov 1, 2025
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2694,12 +2694,13 @@ def : GCNPat<pat,
                                   $src1, /* clamp */ 0, /* op_sel */ 0)
 >;
 
-def : GCNPat<(fshr i32:$src0, i32:$src1, i32:$src2),
+// Divergent fshr only; uniform fshr is matched by the S_LSHR_B64 patterns.
+def : GCNPat<(DivergentTernaryFrag<fshr> i32:$src0, i32:$src1, i32:$src2),
         (V_ALIGNBIT_B32_opsel_e64 /* src0_modifiers */ 0, $src0,
                                   /* src1_modifiers */ 0, $src1,
                                   /* src2_modifiers */ 0,
                                   $src2, /* clamp */ 0, /* op_sel */ 0)
 >;
 } // isGFX9GFX10
 } // end True16Predicate = NotHasTrue16BitInsts
 
@@ -2722,12 +2723,13 @@ def : GCNPat<(i32 (DivergentUnaryFrag<trunc> (srl i64:$src0, (i32 ShiftAmt32Imm:
                           (i16 (EXTRACT_SUBREG VGPR_32:$src1, lo16)),
                           /* clamp */ 0, /* op_sel */ 0)>;
 
-def : GCNPat<(fshr i32:$src0, i32:$src1, i32:$src2),
+// Divergent fshr only; uniform fshr is matched by the S_LSHR_B64 patterns.
+def : GCNPat<(DivergentTernaryFrag<fshr> i32:$src0, i32:$src1, i32:$src2),
           (V_ALIGNBIT_B32_t16_e64 /* src0_modifiers */ 0, $src0,
                           /* src1_modifiers */ 0, $src1,
                           /* src2_modifiers */ 0,
                           (EXTRACT_SUBREG VGPR_32:$src2, lo16),
                           /* clamp */ 0, /* op_sel */ 0)>;
 } // end True16Predicate = UseRealTrue16Insts
 
 let True16Predicate = UseFakeTrue16Insts in {
@@ -2757,12 +2759,13 @@ def : GCNPat<(i32 (DivergentUnaryFrag<trunc> (srl i64:$src0, (i32 ShiftAmt32Imm:
                                 $src1, /* clamp */ 0, /* op_sel */ 0)
 >;
 
-def : GCNPat<(fshr i32:$src0, i32:$src1, i32:$src2),
+// Divergent fshr only; uniform fshr is matched by the S_LSHR_B64 patterns.
+def : GCNPat<(DivergentTernaryFrag<fshr> i32:$src0, i32:$src1, i32:$src2),
      (V_ALIGNBIT_B32_fake16_e64 /* src0_modifiers */ 0, $src0,
                                 /* src1_modifiers */ 0, $src1,
                                 /* src2_modifiers */ 0,
                                 $src2, /* clamp */ 0, /* op_sel */ 0)
 >;
 } // end True16Predicate = UseFakeTrue16Insts
 
 /********** ====================== **********/
@@ -3852,6 +3855,26 @@ class PackB32Pat<Instruction inst> : GCNPat <
 >;
 }
 let SubtargetPredicate = isGFX9Plus in {
+  // Uniform fshr: build a 64-bit scalar pair with $src1 in sub0 and $src0 in
+  // sub1, shift it right with S_LSHR_B64 and extract sub0 as the i32 result.
+  // The variable shift amount is first ANDed with 31 by S_AND_B32.
+  def : GCNPat<(UniformTernaryFrag<fshr> i32:$src0, i32:$src1, i32:$src2),
+      (i32 (EXTRACT_SUBREG
+             (S_LSHR_B64 (REG_SEQUENCE SReg_64, $src1, sub0, $src0, sub1),
+                         (S_AND_B32 $src2, (i32 31))),
+             sub0))
+  >;
+
+  // Same lowering for a shift amount matched by ShiftAmt32Imm; the immediate
+  // is passed to S_LSHR_B64 directly, without the S_AND_B32 mask.
+  def : GCNPat<(UniformTernaryFrag<fshr> i32:$src0, i32:$src1,
+                                         (i32 ShiftAmt32Imm:$src2)),
+      (i32 (EXTRACT_SUBREG
+             (S_LSHR_B64 (REG_SEQUENCE SReg_64, $src1, sub0, $src0, sub1),
+                         $src2),
+             sub0))
+  >;
+
 let True16Predicate = NotHasTrue16BitInsts in
   def : PackB32Pat<V_PACK_B32_F16_e64>;
 

diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -207,6 +207,17 @@ class DivergentBinFrag<SDPatternOperator Op> : PatFrag <
   let GISelPredicateCode = [{return true;}];
 }
 
+// Matches the ternary operator Op only when the node is divergent.
+class DivergentTernaryFrag<SDPatternOperator Op> : PatFrag <
+  (ops node:$src0, node:$src1, node:$src2),
+  (Op $src0, $src1, $src2),
+  [{ return N->isDivergent(); }]> {
+  // For GlobalISel this check is unnecessary as it's captured by the
+  // result register bank constraint.
+  // FIXME: Should add a way for the emitter to recognize this is a
+  // trivially true predicate to eliminate the check.
+  let GISelPredicateCode = [{return true;}];
+}
 
 let isMoveImm = 1 in {
   let isReMaterializable = 1, isAsCheapAsAMove = 1 in {