Skip to content

Commit 9f102a9

Browse files
authored
[AMDGPU] Recognise bitmask operations as srcmods on select (#152119)
Add to the VOP patterns to recognise when or/xor/and are masking only the most significant bit of i32/v2i32/i64 and replace with the corresponding FP source modifier.
1 parent 4e11f89 commit 9f102a9

File tree

6 files changed

+1331
-282
lines changed

6 files changed

+1331
-282
lines changed

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3212,6 +3212,44 @@ bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
32123212
Src = Src.getOperand(0);
32133213
}
32143214

3215+
if (Mods != SISrcMods::NONE)
3216+
return true;
3217+
3218+
// Convert various sign-bit masks on integers to src mods. Currently disabled
3219+
// for 16-bit types as the codegen replaces the operand without adding a
3220+
// srcmod. This is intentionally finding the cases where we are performing
3221+
// float neg and abs on int types, the goal is not to obtain two's complement
3222+
// neg or abs. Limit conversion to select operands via the nonCanonicalizing
3223+
// pattern.
3224+
// TODO: Add 16-bit support.
3225+
if (IsCanonicalizing)
3226+
return true;
3227+
3228+
unsigned Opc = Src->getOpcode();
3229+
EVT VT = Src.getValueType();
3230+
if ((Opc != ISD::AND && Opc != ISD::OR && Opc != ISD::XOR) ||
3231+
(VT != MVT::i32 && VT != MVT::i64))
3232+
return true;
3233+
3234+
ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Src->getOperand(1));
3235+
if (!CRHS)
3236+
return true;
3237+
3238+
// Recognise (xor a, 0x80000000) as NEG SrcMod.
3239+
// Recognise (and a, 0x7fffffff) as ABS SrcMod.
3240+
// Recognise (or a, 0x80000000) as NEG+ABS SrcModifiers.
3241+
if (Opc == ISD::XOR && CRHS->getAPIntValue().isSignMask()) {
3242+
Mods |= SISrcMods::NEG;
3243+
Src = Src.getOperand(0);
3244+
} else if (Opc == ISD::AND && AllowAbs &&
3245+
CRHS->getAPIntValue().isMaxSignedValue()) {
3246+
Mods |= SISrcMods::ABS;
3247+
Src = Src.getOperand(0);
3248+
} else if (Opc == ISD::OR && AllowAbs && CRHS->getAPIntValue().isSignMask()) {
3249+
Mods |= SISrcMods::ABS | SISrcMods::NEG;
3250+
Src = Src.getOperand(0);
3251+
}
3252+
32153253
return true;
32163254
}
32173255

llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll

Lines changed: 15 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -349,29 +349,24 @@ define i32 @select_fneg_xor_select_i32(i1 %cond0, i1 %cond1, i32 %arg0, i32 %arg
349349
; GCN: ; %bb.0:
350350
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
351351
; GCN-NEXT: v_and_b32_e32 v0, 1, v0
352-
; GCN-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
353-
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
354352
; GCN-NEXT: v_and_b32_e32 v1, 1, v1
355-
; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
356-
; GCN-NEXT: v_xor_b32_e32 v2, 0x80000000, v0
353+
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
354+
; GCN-NEXT: v_cndmask_b32_e64 v0, -v2, v3, vcc
357355
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
358-
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
356+
; GCN-NEXT: v_cndmask_b32_e64 v0, v0, -v0, vcc
359357
; GCN-NEXT: s_setpc_b64 s[30:31]
360358
;
361359
; GFX11-LABEL: select_fneg_xor_select_i32:
362360
; GFX11: ; %bb.0:
363361
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
364362
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
365-
; GFX11-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
366363
; GFX11-NEXT: v_and_b32_e32 v1, 1, v1
367-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
364+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
368365
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
369-
; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
370-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
366+
; GFX11-NEXT: v_cndmask_b32_e64 v0, -v2, v3, vcc_lo
371367
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
372-
; GFX11-NEXT: v_xor_b32_e32 v2, 0x80000000, v0
373-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
374-
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
368+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
369+
; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, -v0, vcc_lo
375370
; GFX11-NEXT: s_setpc_b64 s[30:31]
376371
%fneg0 = xor i32 %arg0, -2147483648
377372
%select0 = select i1 %cond0, i32 %arg1, i32 %fneg0
@@ -550,31 +545,25 @@ define i64 @select_fneg_xor_select_i64(i1 %cond0, i1 %cond1, i64 %arg0, i64 %arg
550545
; GCN: ; %bb.0:
551546
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
552547
; GCN-NEXT: v_and_b32_e32 v0, 1, v0
553-
; GCN-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
554-
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
555548
; GCN-NEXT: v_and_b32_e32 v1, 1, v1
549+
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
556550
; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
557-
; GCN-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc
558-
; GCN-NEXT: v_xor_b32_e32 v3, 0x80000000, v2
551+
; GCN-NEXT: v_cndmask_b32_e64 v2, -v3, v5, vcc
559552
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
560-
; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
553+
; GCN-NEXT: v_cndmask_b32_e64 v1, v2, -v2, vcc
561554
; GCN-NEXT: s_setpc_b64 s[30:31]
562555
;
563556
; GFX11-LABEL: select_fneg_xor_select_i64:
564557
; GFX11: ; %bb.0:
565558
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
566559
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
567-
; GFX11-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
568-
; GFX11-NEXT: v_and_b32_e32 v1, 1, v1
569-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
560+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
570561
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
571-
; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
572-
; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo
573-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
562+
; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_and_b32 v1, 1, v1
563+
; GFX11-NEXT: v_cndmask_b32_e64 v2, -v3, v5, vcc_lo
574564
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
575-
; GFX11-NEXT: v_xor_b32_e32 v3, 0x80000000, v2
576-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
577-
; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
565+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
566+
; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, -v2, vcc_lo
578567
; GFX11-NEXT: s_setpc_b64 s[30:31]
579568
%fneg0 = xor i64 %arg0, 9223372036854775808
580569
%select0 = select i1 %cond0, i64 %arg1, i64 %fneg0
Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefixes=GCN,GFX7 %s
3+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
4+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
5+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
6+
7+
; Demonstrate that the conversion of bitmasks affecting the sign bit on integers to srcmods
8+
; does not apply to canonicalizing instructions.
9+
10+
define double @v_uitofp_i32_to_f64_abs(i32 %arg0) nounwind {
11+
; GCN-LABEL: v_uitofp_i32_to_f64_abs:
12+
; GCN: ; %bb.0:
13+
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14+
; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
15+
; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
16+
; GCN-NEXT: s_setpc_b64 s[30:31]
17+
;
18+
; GFX11-LABEL: v_uitofp_i32_to_f64_abs:
19+
; GFX11: ; %bb.0:
20+
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21+
; GFX11-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
22+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
23+
; GFX11-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
24+
; GFX11-NEXT: s_setpc_b64 s[30:31]
25+
%arg0.abs = and i32 %arg0, u0x7fffffff
26+
%cvt = uitofp i32 %arg0.abs to double
27+
ret double %cvt
28+
}
29+
30+
define double @v_uitofp_i32_to_f64_neg(i32 %arg0) nounwind {
31+
; GCN-LABEL: v_uitofp_i32_to_f64_neg:
32+
; GCN: ; %bb.0:
33+
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34+
; GCN-NEXT: v_and_b32_e32 v0, 0x80000000, v0
35+
; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
36+
; GCN-NEXT: s_setpc_b64 s[30:31]
37+
;
38+
; GFX11-LABEL: v_uitofp_i32_to_f64_neg:
39+
; GFX11: ; %bb.0:
40+
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41+
; GFX11-NEXT: v_and_b32_e32 v0, 0x80000000, v0
42+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
43+
; GFX11-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
44+
; GFX11-NEXT: s_setpc_b64 s[30:31]
45+
%arg0.neg = and i32 %arg0, u0x80000000
46+
%cvt = uitofp i32 %arg0.neg to double
47+
ret double %cvt
48+
}
49+
50+
define double @s_uitofp_i32_to_f64_abs(i32 inreg %arg0) nounwind {
51+
; GCN-LABEL: s_uitofp_i32_to_f64_abs:
52+
; GCN: ; %bb.0:
53+
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
54+
; GCN-NEXT: s_bitset0_b32 s16, 31
55+
; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], s16
56+
; GCN-NEXT: s_setpc_b64 s[30:31]
57+
;
58+
; GFX11-LABEL: s_uitofp_i32_to_f64_abs:
59+
; GFX11: ; %bb.0:
60+
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
61+
; GFX11-NEXT: s_bitset0_b32 s0, 31
62+
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
63+
; GFX11-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
64+
; GFX11-NEXT: s_setpc_b64 s[30:31]
65+
%arg0.abs = and i32 %arg0, u0x7fffffff
66+
%cvt = uitofp i32 %arg0.abs to double
67+
ret double %cvt
68+
}
69+
70+
define double @s_uitofp_i32_to_f64_neg(i32 inreg %arg0) nounwind {
71+
; GCN-LABEL: s_uitofp_i32_to_f64_neg:
72+
; GCN: ; %bb.0:
73+
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
74+
; GCN-NEXT: s_and_b32 s4, s16, 0x80000000
75+
; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], s4
76+
; GCN-NEXT: s_setpc_b64 s[30:31]
77+
;
78+
; GFX11-LABEL: s_uitofp_i32_to_f64_neg:
79+
; GFX11: ; %bb.0:
80+
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
81+
; GFX11-NEXT: s_and_b32 s0, s0, 0x80000000
82+
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
83+
; GFX11-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
84+
; GFX11-NEXT: s_setpc_b64 s[30:31]
85+
%arg0.neg = and i32 %arg0, u0x80000000
86+
%cvt = uitofp i32 %arg0.neg to double
87+
ret double %cvt
88+
}
89+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
90+
; GFX11-FAKE16: {{.*}}
91+
; GFX11-TRUE16: {{.*}}
92+
; GFX7: {{.*}}
93+
; GFX9: {{.*}}

0 commit comments

Comments
 (0)