Skip to content

Commit 5647603

Browse files
update the operand check & update machine inst uniformity
1 parent 4f71ec7 commit 5647603

File tree

5 files changed

+146
-14
lines changed

5 files changed

+146
-14
lines changed

llvm/lib/Analysis/UniformityAnalysis.cpp

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -118,8 +118,11 @@ bool GenericUniformityAnalysisImpl<SSAContext>::isOperandUniform(
118118
const Instruction &I, InstructionUniformity IU) const {
119119
switch (IU) {
120120
case InstructionUniformity::AnyOfFirstTwoUseOp:
121-
return !isDivergentUse(I.getOperandUse(0)) ||
122-
!isDivergentUse(I.getOperandUse(1));
121+
// For permlane16/permlanex16 the operands are: <old> <src0> <src1> <src2>
122+
// <fi> <bound_control>. Check if either src0 (operand 1) or src1
123+
// (operand 2, the lane select) is uniform.
124+
return !isDivergentUse(I.getOperandUse(1)) ||
125+
!isDivergentUse(I.getOperandUse(2));
123126
default:
124127
return false;
125128
}

llvm/lib/CodeGen/MachineUniformityAnalysis.cpp

Lines changed: 44 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -53,13 +53,18 @@ void llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::initialize() {
5353
for (const MachineBasicBlock &block : F) {
5454
for (const MachineInstr &instr : block) {
5555
auto uniformity = InstrInfo.getInstructionUniformity(instr);
56-
if (uniformity == InstructionUniformity::AlwaysUniform) {
57-
addUniformOverride(instr);
58-
continue;
59-
}
60-
61-
if (uniformity == InstructionUniformity::NeverUniform) {
56+
switch (uniformity) {
57+
case InstructionUniformity::NeverUniform:
6258
markDivergent(instr);
59+
break;
60+
case InstructionUniformity::AlwaysUniform:
61+
addUniformOverride(instr);
62+
break;
63+
case InstructionUniformity::Default:
64+
break;
65+
default:
66+
addUniformInstruction(&instr, uniformity);
67+
break;
6368
}
6469
}
6570
}
@@ -148,11 +153,42 @@ bool llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::isDivergentUse(
148153
return isTemporalDivergent(*UseInstr->getParent(), *DefInstr);
149154
}
150155

151-
// This can be defined later depending on use of the MachineUniformityAnalysis.
152156
template <>
153157
bool GenericUniformityAnalysisImpl<MachineSSAContext>::isOperandUniform(
154158
const MachineInstr &MI, InstructionUniformity IU) const {
155-
return false;
159+
switch (IU) {
160+
// For permlane16/permlanex16, check if either src or lane select is uniform
161+
// These instructions have mixed immediate and register operands:
162+
// Use operand 1 (0-based, not counting the def) is src0 (the source value to permute)
163+
// Use operand 3 (0-based, not counting the def) is src1 (lane select - which lane within the 16 to read from)
164+
// Result is uniform if EITHER the source OR lane select is uniform
165+
case InstructionUniformity::AnyOfFirstTwoUseOp: {
166+
// Check if any of the first two register use operands is uniform
167+
// Result is uniform if ANY of these operands is uniform
168+
const MachineOperand *FirstRegOp = nullptr;
169+
const MachineOperand *SecondRegOp = nullptr;
170+
171+
// Find the first two register use operands
172+
for (const MachineOperand &MO : MI.uses()) {
173+
if (MO.isReg() && MO.getReg().isVirtual()) {
174+
if (!FirstRegOp)
175+
FirstRegOp = &MO;
176+
else if (!SecondRegOp) {
177+
SecondRegOp = &MO;
178+
break;
179+
}
180+
}
181+
}
182+
183+
if (!FirstRegOp || !SecondRegOp)
184+
return false;
185+
186+
// Return true if either operand is uniform
187+
return !isDivergentUse(*FirstRegOp) || !isDivergentUse(*SecondRegOp);
188+
}
189+
default:
190+
return false;
191+
}
156192
}
157193

158194
// This ensures explicit instantiation of

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10574,6 +10574,13 @@ SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const {
1057410574
return InstructionUniformity::NeverUniform;
1057510575

1057610576
unsigned opcode = MI.getOpcode();
10577+
10578+
// Special handling for permlane16/permlanex16 - uniformity depends on
10579+
// operands
10580+
if (opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
10581+
opcode == AMDGPU::V_PERMLANEX16_B32_e64)
10582+
return InstructionUniformity::AnyOfFirstTwoUseOp;
10583+
1057710584
if (opcode == AMDGPU::V_READLANE_B32 ||
1057810585
opcode == AMDGPU::V_READFIRSTLANE_B32 ||
1057910586
opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
# RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -run-pass=print-machine-uniformity -o - %s 2>&1 | FileCheck %s
2+
3+
# Test the machine-level uniformity analysis for permlane16/permlanex16 instructions.
4+
#
5+
# NOTE: Permlane instructions have a hardware constraint that src1 (lane select) and src2
6+
# must be SGPR (scalar) registers. Since SGPRs are always uniform at machine level,
7+
# permlane results are always uniform according to the AnyOfFirstTwoUseOp logic
8+
# (either src0 OR src1 being uniform makes the result uniform, and src1 is always uniform).
9+
#
10+
# These tests verify that the uniformity analysis correctly handles permlane instructions
11+
# and that uniform results propagate through chains of operations.
12+
13+
---
14+
# Test: permlane16 with divergent VGPR src and uniform SGPR lane select
15+
# Result is UNIFORM because lane select (SGPR) is always uniform
16+
name: permlane16_basic
17+
machineFunctionInfo:
18+
isEntryFunction: true
19+
body: |
20+
bb.0:
21+
; CHECK-LABEL: MachineUniformityInfo for function: permlane16_basic
22+
; CHECK: ALL VALUES UNIFORM
23+
%0:vgpr_32 = IMPLICIT_DEF
24+
%1:sreg_32 = S_MOV_B32 5
25+
%2:sreg_32 = IMPLICIT_DEF
26+
%3:vgpr_32 = V_PERMLANE16_B32_e64 0, %0, 0, %1, 0, %2, %0, 0, implicit $exec
27+
S_ENDPGM 0
28+
29+
...
30+
---
31+
# Test: permlanex16 with divergent VGPR src and uniform SGPR lane select
32+
# Result is UNIFORM because lane select (SGPR) is always uniform
33+
name: permlanex16_basic
34+
machineFunctionInfo:
35+
isEntryFunction: true
36+
body: |
37+
bb.0:
38+
; CHECK-LABEL: MachineUniformityInfo for function: permlanex16_basic
39+
; CHECK: ALL VALUES UNIFORM
40+
%0:vgpr_32 = IMPLICIT_DEF
41+
%1:sreg_32 = S_MOV_B32 7
42+
%2:sreg_32 = IMPLICIT_DEF
43+
%3:vgpr_32 = V_PERMLANEX16_B32_e64 0, %0, 0, %1, 0, %2, %0, 0, implicit $exec
44+
S_ENDPGM 0
45+
46+
...
47+
---
48+
# Test: Chain of permlane operations - uniformity propagates
49+
# Both permlanes are uniform, second uses result of first as source
50+
name: permlane16_chain_uniform
51+
machineFunctionInfo:
52+
isEntryFunction: true
53+
body: |
54+
bb.0:
55+
; CHECK-LABEL: MachineUniformityInfo for function: permlane16_chain_uniform
56+
; CHECK: ALL VALUES UNIFORM
57+
%0:vgpr_32 = IMPLICIT_DEF
58+
%1:sreg_32 = S_MOV_B32 3
59+
%2:sreg_32 = IMPLICIT_DEF
60+
; First permlane - uniform because lane select is SGPR
61+
%3:vgpr_32 = V_PERMLANE16_B32_e64 0, %0, 0, %1, 0, %2, %0, 0, implicit $exec
62+
; Second permlane uses uniform result - also uniform
63+
%4:vgpr_32 = V_PERMLANEX16_B32_e64 0, %3, 0, %1, 0, %2, %3, 0, implicit $exec
64+
S_ENDPGM 0
65+
66+
...
67+
---
68+
# Test: Multiple permlane operations in sequence
69+
# Verifies that uniformity is correctly tracked through complex chains
70+
name: permlane_multiple
71+
machineFunctionInfo:
72+
isEntryFunction: true
73+
body: |
74+
bb.0:
75+
; CHECK-LABEL: MachineUniformityInfo for function: permlane_multiple
76+
; CHECK: ALL VALUES UNIFORM
77+
%0:vgpr_32 = IMPLICIT_DEF
78+
%1:sreg_32 = S_MOV_B32 1
79+
%2:sreg_32 = S_MOV_B32 2
80+
%3:vgpr_32 = V_PERMLANE16_B32_e64 0, %0, 0, %1, 0, %2, %0, 0, implicit $exec
81+
%4:vgpr_32 = V_PERMLANEX16_B32_e64 0, %3, 0, %1, 0, %2, %3, 0, implicit $exec
82+
%5:vgpr_32 = V_PERMLANE16_B32_e64 0, %4, 0, %2, 0, %1, %4, 0, implicit $exec
83+
S_ENDPGM 0
84+
85+
...
86+

llvm/test/Analysis/UniformityAnalysis/AMDGPU/uniform_intrinsic.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,11 @@ define amdgpu_kernel void @v_permlanex16_b32(ptr addrspace(1) %out, i32 %src0, i
1616
}
1717

1818
; CHECK: DIVERGENT: %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
19-
; CHECK: %v1 = call i32 @llvm.amdgcn.permlanex16.i32(i32 %v, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
19+
; CHECK: %v1 = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %v, i32 %src1, i32 %src2, i1 false, i1 false)
2020
; CHECK: store i32 %v1, ptr addrspace(1) %out, align 4
2121
define amdgpu_kernel void @div_permlane16_var_uni_usr_x16(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 {
2222
%v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
23-
%v1 = call i32 @llvm.amdgcn.permlanex16.i32(i32 %v, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
23+
%v1 = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %v, i32 %src1, i32 %src2, i1 false, i1 false) #0
2424
store i32 %v1, ptr addrspace(1) %out
2525
ret void
2626
}
@@ -36,11 +36,11 @@ define amdgpu_kernel void @div_permlane16_var_uni_x16(ptr addrspace(1) %out, i32
3636
}
3737

3838
; CHECK: DIVERGENT: %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
39-
; CHECK: %v1 = call i32 @llvm.amdgcn.permlane16.i32(i32 %v, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
39+
; CHECK: %v1 = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %v, i32 %src1, i32 %src2, i1 false, i1 false)
4040
; CHECK: store i32 %v1, ptr addrspace(1) %out, align 4
4141
define amdgpu_kernel void @div_permlane16_var_uni_usr_16(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 {
4242
%v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
43-
%v1 = call i32 @llvm.amdgcn.permlane16.i32(i32 %v, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
43+
%v1 = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %v, i32 %src1, i32 %src2, i1 false, i1 false) #0
4444
store i32 %v1, ptr addrspace(1) %out
4545
ret void
4646
}

0 commit comments

Comments
 (0)