Skip to content

Commit b73942d

Browse files
jvesely authored and zmodem committed
AMDGPU/EG,CM: Implement fsqrt using recip(rsqrt(x)) instead of x * rsqrt(x)
The old version might be faster on EG (RECIP_IEEE is Trans only), but it'd need extra corner case checks. This gives correct corner case behaviour and saves a register.
Fixes OCL CTS sqrt test (1-thread, scalar) on Turks.
Reviewer: arsenm
Differential Revision: https://reviews.llvm.org/D74017
(cherry picked from commit e6686ad)
1 parent 84cda4c commit b73942d

File tree

4 files changed

+34
-18
lines changed

4 files changed

+34
-18
lines changed

llvm/lib/Target/AMDGPU/CaymanInstructions.td

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,8 @@ def COS_cm : COS_Common<0x8E>;
5050

5151
def : RsqPat<RECIPSQRT_IEEE_cm, f32>;
5252

53+
def : SqrtPat<RECIPSQRT_IEEE_cm, RECIP_IEEE_cm>;
54+
5355
def : POW_Common <LOG_IEEE_cm, EXP_IEEE_cm, MUL>;
5456

5557
defm DIV_cm : DIV_Common<RECIP_IEEE_cm>;
@@ -70,8 +72,6 @@ def CF_END_CM : CF_CLAUSE_EG<32, (ins), "CF_END"> {
7072

7173

7274

73-
def : R600Pat<(fsqrt f32:$src), (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_cm $src))>;
74-
7575
class RAT_STORE_DWORD <RegisterClass rc, ValueType vt, bits<4> mask> :
7676
CF_MEM_RAT_CACHELESS <0x14, 0, mask,
7777
(ins rc:$rw_gpr, R600_TReg32_X:$index_gpr),

llvm/lib/Target/AMDGPU/EvergreenInstructions.td

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -118,11 +118,12 @@ def LOG_IEEE_eg : LOG_IEEE_Common<0x83>;
118118
def RECIP_CLAMPED_eg : RECIP_CLAMPED_Common<0x84>;
119119
def RECIPSQRT_IEEE_eg : RECIPSQRT_IEEE_Common<0x89>;
120120
def : RsqPat<RECIPSQRT_IEEE_eg, f32>;
121+
def : SqrtPat<RECIPSQRT_IEEE_eg, RECIP_IEEE_eg>;
122+
121123
def SIN_eg : SIN_Common<0x8D>;
122124
def COS_eg : COS_Common<0x8E>;
123125

124126
def : POW_Common <LOG_IEEE_eg, EXP_IEEE_eg, MUL>;
125-
def : EGPat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_eg $src))>;
126127
} // End SubtargetPredicate = isEG
127128

128129
//===----------------------------------------------------------------------===//

llvm/lib/Target/AMDGPU/R600Instructions.td

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1233,6 +1233,11 @@ def : R600Pat<
12331233
def : RcpPat<recip_ieee, f32>;
12341234
}
12351235

1236+
class SqrtPat<Instruction RsqInst, Instruction RecipInst> : R600Pat <
1237+
(fsqrt f32:$src),
1238+
(RecipInst (RsqInst $src))
1239+
>;
1240+
12361241
//===----------------------------------------------------------------------===//
12371242
// R600 / R700 Instructions
12381243
//===----------------------------------------------------------------------===//
@@ -1272,8 +1277,8 @@ let Predicates = [isR600] in {
12721277
defm DIV_r600 : DIV_Common<RECIP_IEEE_r600>;
12731278
def : POW_Common <LOG_IEEE_r600, EXP_IEEE_r600, MUL>;
12741279

1275-
def : R600Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_r600 $src))>;
12761280
def : RsqPat<RECIPSQRT_IEEE_r600, f32>;
1281+
def : SqrtPat<RECIPSQRT_IEEE_r600, RECIP_IEEE_r600>;
12771282

12781283
def R600_ExportSwz : ExportSwzInst {
12791284
let Word1{20-17} = 0; // BURST_COUNT

llvm/test/CodeGen/AMDGPU/fsqrt.ll

Lines changed: 24 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,8 @@ define amdgpu_kernel void @v_unsafe_fsqrt_f32(float addrspace(1)* %out, float ad
2727
; FUNC-LABEL: {{^}}s_sqrt_f32:
2828
; GCN: v_sqrt_f32_e32
2929

30-
; R600: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[2].Z
31-
; R600: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[2].Z, PS
30+
; R600: RECIPSQRT_IEEE * T{{[0-9]\.[XYZW]}}, KC0[2].Z
31+
; R600: RECIP_IEEE * T{{[0-9]\.[XYZW]}}, PS
3232
define amdgpu_kernel void @s_sqrt_f32(float addrspace(1)* %out, float %in) #1 {
3333
entry:
3434
%fdiv = call float @llvm.sqrt.f32(float %in)
@@ -40,10 +40,10 @@ entry:
4040
; GCN: v_sqrt_f32_e32
4141
; GCN: v_sqrt_f32_e32
4242

43-
; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[2].W
44-
; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[2].W, PS
45-
; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].X
46-
; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].X, PS
43+
; R600-DAG: RECIPSQRT_IEEE * T{{[0-9]\.[XYZW]}}, KC0[2].W
44+
; R600-DAG: RECIP_IEEE * T{{[0-9]\.[XYZW]}}, PS
45+
; R600-DAG: RECIPSQRT_IEEE * T{{[0-9]\.[XYZW]}}, KC0[3].X
46+
; R600-DAG: RECIP_IEEE * T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}
4747
define amdgpu_kernel void @s_sqrt_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) #1 {
4848
entry:
4949
%fdiv = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in)
@@ -57,14 +57,14 @@ entry:
5757
; GCN: v_sqrt_f32_e32
5858
; GCN: v_sqrt_f32_e32
5959

60-
; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].Y
61-
; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].Y, PS
62-
; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].Z
63-
; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].Z, PS
64-
; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].W
65-
; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].W, PS
66-
; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[4].X
67-
; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[4].X, PS
60+
; R600-DAG: RECIPSQRT_IEEE * T{{[0-9]\.[XYZW]}}, KC0[3].Y
61+
; R600-DAG: RECIP_IEEE * T{{[0-9]\.[XYZW]}}, PS
62+
; R600-DAG: RECIPSQRT_IEEE * T{{[0-9]\.[XYZW]}}, KC0[3].Z
63+
; R600-DAG: RECIP_IEEE * T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}
64+
; R600-DAG: RECIPSQRT_IEEE * T{{[0-9]\.[XYZW]}}, KC0[3].W
65+
; R600-DAG: RECIP_IEEE * T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}
66+
; R600-DAG: RECIPSQRT_IEEE * T{{[0-9]\.[XYZW]}}, KC0[4].X
67+
; R600-DAG: RECIP_IEEE * T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}
6868
define amdgpu_kernel void @s_sqrt_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) #1 {
6969
entry:
7070
%fdiv = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %in)
@@ -134,6 +134,16 @@ entry:
134134
ret void
135135
}
136136

137+
; FUNC-LABEL: {{^}}recip_sqrt:
138+
; R600: RECIPSQRT_IEEE
139+
; R600-NOT: RECIP_IEEE
140+
define amdgpu_kernel void @recip_sqrt(float addrspace(1)* %out, float %src) nounwind {
141+
%sqrt = call float @llvm.sqrt.f32(float %src)
142+
%recipsqrt = fdiv fast float 1.0, %sqrt
143+
store float %recipsqrt, float addrspace(1)* %out, align 4
144+
ret void
145+
}
146+
137147
declare float @llvm.sqrt.f32(float %in) #0
138148
declare <2 x float> @llvm.sqrt.v2f32(<2 x float> %in) #0
139149
declare <4 x float> @llvm.sqrt.v4f32(<4 x float> %in) #0

0 commit comments

Comments
 (0)