update the operand check & update machine inst uniformity

PankajDwivedi-25 · PankajDwivedi-25 · commit 5647603bfcda · 2025-11-19T18:18:06.000+05:30
diff --git a/llvm/lib/Analysis/UniformityAnalysis.cpp b/llvm/lib/Analysis/UniformityAnalysis.cpp
@@ -118,8 +118,11 @@ bool GenericUniformityAnalysisImpl<SSAContext>::isOperandUniform(
     const Instruction &I, InstructionUniformity IU) const {
   switch (IU) {
   case InstructionUniformity::AnyOfFirstTwoUseOp:
-    return !isDivergentUse(I.getOperandUse(0)) ||
-           !isDivergentUse(I.getOperandUse(1));
+    // For permlane16/permlanex16: <old> <src0> <src1> <src2> <fi>
+    // <bound_control> Check if either src0 (operand 1) or src1 (operand 2 -
+    // lane select) is uniform
+    return !isDivergentUse(I.getOperandUse(1)) ||
+           !isDivergentUse(I.getOperandUse(2));
   default:
     return false;
   }
diff --git a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
@@ -53,13 +53,18 @@ void llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::initialize() {
   for (const MachineBasicBlock &block : F) {
     for (const MachineInstr &instr : block) {
       auto uniformity = InstrInfo.getInstructionUniformity(instr);
-      if (uniformity == InstructionUniformity::AlwaysUniform) {
-        addUniformOverride(instr);
-        continue;
-      }
-
-      if (uniformity == InstructionUniformity::NeverUniform) {
+      switch (uniformity) {
+      case InstructionUniformity::NeverUniform:
         markDivergent(instr);
+        break;
+      case InstructionUniformity::AlwaysUniform:
+        addUniformOverride(instr);
+        break;
+      case InstructionUniformity::Default:
+        break;
+      default:
+        addUniformInstruction(&instr, uniformity);
+        break;
       }
     }
   }
@@ -148,11 +153,42 @@ bool llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::isDivergentUse(
   return isTemporalDivergent(*UseInstr->getParent(), *DefInstr);
 }
 
-// This can be defined later depending on use of the MachineUniformityAnalysis.
 template <>
 bool GenericUniformityAnalysisImpl<MachineSSAContext>::isOperandUniform(
     const MachineInstr &MI, InstructionUniformity IU) const {
-  return false;
+  switch (IU) {
+  // For permlane16/permlanex16, check if either src or lane select is uniform
+  // These instructions have mixed immediate and register operands:
+  // Operand 1 is src0 (the source value to permute)
+  // Operand 3 is src1 (lane select - which lane within the 16 to read from)
+  // Result is uniform if EITHER the source OR lane select is uniform
+  case InstructionUniformity::AnyOfFirstTwoUseOp: {
+    // Check if any of the first two register use operands is uniform
+    // Result is uniform if ANY of these operands is uniform
+    const MachineOperand *FirstRegOp = nullptr;
+    const MachineOperand *SecondRegOp = nullptr;
+
+    // Find the first two register use operands
+    for (const MachineOperand &MO : MI.uses()) {
+      if (MO.isReg() && MO.getReg().isVirtual()) {
+        if (!FirstRegOp)
+          FirstRegOp = &MO;
+        else if (!SecondRegOp) {
+          SecondRegOp = &MO;
+          break;
+        }
+      }
+    }
+
+    if (!FirstRegOp || !SecondRegOp)
+      return false;
+
+    // Return true if either operand is uniform
+    return !isDivergentUse(*FirstRegOp) || !isDivergentUse(*SecondRegOp);
+  }
+  default:
+    return false;
+  }
 }
 
 // This ensures explicit instantiation of
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -10574,6 +10574,13 @@ SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const {
     return InstructionUniformity::NeverUniform;
 
   unsigned opcode = MI.getOpcode();
+
+  // Special handling for permlane16/permlanex16 - uniformity depends on
+  // operands
+  if (opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
+      opcode == AMDGPU::V_PERMLANEX16_B32_e64)
+    return InstructionUniformity::AnyOfFirstTwoUseOp;
+
   if (opcode == AMDGPU::V_READLANE_B32 ||
       opcode == AMDGPU::V_READFIRSTLANE_B32 ||
       opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/uniform-permlane.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/uniform-permlane.mir
@@ -0,0 +1,86 @@
+# RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -run-pass=print-machine-uniformity -o - %s 2>&1 | FileCheck %s
+
+# Test the machine-level uniformity analysis for permlane16/permlanex16 instructions.
+#
+# NOTE: Permlane instructions have a hardware constraint that src1 (lane select) and src2
+# must be SGPR (scalar) registers. Since SGPRs are always uniform at machine level, 
+# permlane results are always uniform according to the AnyOfFirstTwoUseOp logic
+# (either src0 OR src1 being uniform makes the result uniform, and src1 is always uniform).
+#
+# These tests verify that the uniformity analysis correctly handles permlane instructions
+# and that uniform results propagate through chains of operations.
+
+---
+# Test: permlane16 with divergent VGPR src and uniform SGPR lane select
+# Result is UNIFORM because lane select (SGPR) is always uniform
+name: permlane16_basic
+machineFunctionInfo:
+  isEntryFunction: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: MachineUniformityInfo for function: permlane16_basic
+    ; CHECK: ALL VALUES UNIFORM
+    %0:vgpr_32 = IMPLICIT_DEF
+    %1:sreg_32 = S_MOV_B32 5
+    %2:sreg_32 = IMPLICIT_DEF
+    %3:vgpr_32 = V_PERMLANE16_B32_e64 0, %0, 0, %1, 0, %2, %0, 0, implicit $exec
+    S_ENDPGM 0
+
+...
+---
+# Test: permlanex16 with divergent VGPR src and uniform SGPR lane select
+# Result is UNIFORM because lane select (SGPR) is always uniform
+name: permlanex16_basic
+machineFunctionInfo:
+  isEntryFunction: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: MachineUniformityInfo for function: permlanex16_basic
+    ; CHECK: ALL VALUES UNIFORM
+    %0:vgpr_32 = IMPLICIT_DEF
+    %1:sreg_32 = S_MOV_B32 7
+    %2:sreg_32 = IMPLICIT_DEF
+    %3:vgpr_32 = V_PERMLANEX16_B32_e64 0, %0, 0, %1, 0, %2, %0, 0, implicit $exec
+    S_ENDPGM 0
+
+...
+---
+# Test: Chain of permlane operations - uniformity propagates
+# Both permlanes are uniform, second uses result of first as source
+name: permlane16_chain_uniform
+machineFunctionInfo:
+  isEntryFunction: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: MachineUniformityInfo for function: permlane16_chain_uniform
+    ; CHECK: ALL VALUES UNIFORM
+    %0:vgpr_32 = IMPLICIT_DEF
+    %1:sreg_32 = S_MOV_B32 3
+    %2:sreg_32 = IMPLICIT_DEF
+    ; First permlane - uniform because lane select is SGPR
+    %3:vgpr_32 = V_PERMLANE16_B32_e64 0, %0, 0, %1, 0, %2, %0, 0, implicit $exec
+    ; Second permlane uses uniform result - also uniform
+    %4:vgpr_32 = V_PERMLANEX16_B32_e64 0, %3, 0, %1, 0, %2, %3, 0, implicit $exec
+    S_ENDPGM 0
+
+...
+---
+# Test: Multiple permlane operations in sequence
+# Verifies that uniformity is correctly tracked through complex chains
+name: permlane_multiple
+machineFunctionInfo:
+  isEntryFunction: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: MachineUniformityInfo for function: permlane_multiple
+    ; CHECK: ALL VALUES UNIFORM
+    %0:vgpr_32 = IMPLICIT_DEF
+    %1:sreg_32 = S_MOV_B32 1
+    %2:sreg_32 = S_MOV_B32 2  
+    %3:vgpr_32 = V_PERMLANE16_B32_e64 0, %0, 0, %1, 0, %2, %0, 0, implicit $exec
+    %4:vgpr_32 = V_PERMLANEX16_B32_e64 0, %3, 0, %1, 0, %2, %3, 0, implicit $exec
+    %5:vgpr_32 = V_PERMLANE16_B32_e64 0, %4, 0, %2, 0, %1, %4, 0, implicit $exec
+    S_ENDPGM 0
+
+...
+
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/uniform_intrinsic.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/uniform_intrinsic.ll
@@ -16,11 +16,11 @@ define amdgpu_kernel void @v_permlanex16_b32(ptr addrspace(1) %out, i32 %src0, i
 }
 
 ; CHECK:  DIVERGENT:   %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
-; CHECK:               %v1 = call i32 @llvm.amdgcn.permlanex16.i32(i32 %v, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+; CHECK:               %v1 = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %v, i32 %src1, i32 %src2, i1 false, i1 false)
 ; CHECK:               store i32 %v1, ptr addrspace(1) %out, align 4
 define amdgpu_kernel void @div_permlane16_var_uni_usr_x16(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 {
   %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
-  %v1 = call i32 @llvm.amdgcn.permlanex16.i32(i32 %v, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
+  %v1 = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %v, i32 %src1, i32 %src2, i1 false, i1 false) #0
   store i32 %v1, ptr addrspace(1) %out
   ret void
 }
@@ -36,11 +36,11 @@ define amdgpu_kernel void @div_permlane16_var_uni_x16(ptr addrspace(1) %out, i32
 }
 
 ; CHECK:  DIVERGENT:   %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
-; CHECK:               %v1 = call i32 @llvm.amdgcn.permlane16.i32(i32 %v, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+; CHECK:               %v1 = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %v, i32 %src1, i32 %src2, i1 false, i1 false)
 ; CHECK:               store i32 %v1, ptr addrspace(1) %out, align 4
 define amdgpu_kernel void @div_permlane16_var_uni_usr_16(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 {
   %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
-  %v1 = call i32 @llvm.amdgcn.permlane16.i32(i32 %v, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
+  %v1 = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %v, i32 %src1, i32 %src2, i1 false, i1 false) #0
   store i32 %v1, ptr addrspace(1) %out
   ret void
 }