@@ -27,8 +27,8 @@ define amdgpu_kernel void @v_unsafe_fsqrt_f32(float addrspace(1)* %out, float ad
27
27
; FUNC-LABEL: {{^}}s_sqrt_f32:
28
28
; GCN: v_sqrt_f32_e32
29
29
30
- ; R600: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[2].Z
31
- ; R600: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[2].Z , PS
30
+ ; R600: RECIPSQRT_IEEE * T{{[0-9]\.[XYZW]}}, KC0[2].Z
31
+ ; R600: RECIP_IEEE * T{{[0-9]\.[XYZW]}}, PS
32
32
define amdgpu_kernel void @s_sqrt_f32 (float addrspace (1 )* %out , float %in ) #1 {
33
33
entry:
34
34
%fdiv = call float @llvm.sqrt.f32 (float %in )
@@ -40,10 +40,10 @@ entry:
40
40
; GCN: v_sqrt_f32_e32
41
41
; GCN: v_sqrt_f32_e32
42
42
43
- ; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[2].W
44
- ; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[2].W , PS
45
- ; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].X
46
- ; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].X, PS
43
+ ; R600-DAG: RECIPSQRT_IEEE * T{{[0-9]\.[XYZW]}}, KC0[2].W
44
+ ; R600-DAG: RECIP_IEEE * T{{[0-9]\.[XYZW]}}, PS
45
+ ; R600-DAG: RECIPSQRT_IEEE * T{{[0-9]\.[XYZW]}}, KC0[3].X
46
+ ; R600-DAG: RECIP_IEEE * T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}
47
47
define amdgpu_kernel void @s_sqrt_v2f32 (<2 x float > addrspace (1 )* %out , <2 x float > %in ) #1 {
48
48
entry:
49
49
%fdiv = call <2 x float > @llvm.sqrt.v2f32 (<2 x float > %in )
@@ -57,14 +57,14 @@ entry:
57
57
; GCN: v_sqrt_f32_e32
58
58
; GCN: v_sqrt_f32_e32
59
59
60
- ; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].Y
61
- ; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].Y , PS
62
- ; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].Z
63
- ; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].Z, PS
64
- ; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].W
65
- ; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].W, PS
66
- ; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[4].X
67
- ; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[4].X, PS
60
+ ; R600-DAG: RECIPSQRT_IEEE * T{{[0-9]\.[XYZW]}}, KC0[3].Y
61
+ ; R600-DAG: RECIP_IEEE * T{{[0-9]\.[XYZW]}}, PS
62
+ ; R600-DAG: RECIPSQRT_IEEE * T{{[0-9]\.[XYZW]}}, KC0[3].Z
63
+ ; R600-DAG: RECIP_IEEE * T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}
64
+ ; R600-DAG: RECIPSQRT_IEEE * T{{[0-9]\.[XYZW]}}, KC0[3].W
65
+ ; R600-DAG: RECIP_IEEE * T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}
66
+ ; R600-DAG: RECIPSQRT_IEEE * T{{[0-9]\.[XYZW]}}, KC0[4].X
67
+ ; R600-DAG: RECIP_IEEE * T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}
68
68
define amdgpu_kernel void @s_sqrt_v4f32 (<4 x float > addrspace (1 )* %out , <4 x float > %in ) #1 {
69
69
entry:
70
70
%fdiv = call <4 x float > @llvm.sqrt.v4f32 (<4 x float > %in )
@@ -134,6 +134,16 @@ entry:
134
134
ret void
135
135
}
136
136
137
+ ; FUNC-LABEL: {{^}}recip_sqrt:
138
+ ; R600: RECIPSQRT_IEEE
139
+ ; R600-NOT: RECIP_IEEE
140
+ define amdgpu_kernel void @recip_sqrt (float addrspace (1 )* %out , float %src ) nounwind { ; kernel storing 1.0 / sqrt(%src) to %out; the checks above expect a RECIPSQRT_IEEE with no RECIP_IEEE after it
141
+ %sqrt = call float @llvm.sqrt.f32 (float %src ) ; square root of the kernel argument via the llvm.sqrt.f32 intrinsic
142
+ %recipsqrt = fdiv fast float 1 .0 , %sqrt ; reciprocal of the sqrt result; NOTE(review): the 'fast' flags are presumably what allows the sqrt + divide pair to fold into one reciprocal-sqrt -- confirm against the backend combine
143
+ store float %recipsqrt , float addrspace (1 )* %out , align 4 ; write the result through the addrspace(1) pointer with 4-byte alignment
144
+ ret void
145
+ }
146
+
137
147
declare float @llvm.sqrt.f32 (float %in ) #0
138
148
declare <2 x float > @llvm.sqrt.v2f32 (<2 x float > %in ) #0
139
149
declare <4 x float > @llvm.sqrt.v4f32 (<4 x float > %in ) #0
0 commit comments