Skip to content

Commit 98773a5

Browse files
committed
Add new optimized intrinsic atan functions for x86
Add optimized atan and atan2 implementations for x86, covering single and double precision in scalar, vec128, vec256, and vec512 variants.
1 parent 3d6bee9 commit 98773a5

23 files changed

+1607
-46
lines changed

runtime/libpgmath/lib/common/CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,10 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
2929
add_subdirectory("log10")
3030
add_subdirectory("log10f")
3131
add_subdirectory("logf")
32+
add_subdirectory("atan")
33+
add_subdirectory("atanf")
34+
add_subdirectory("atan2")
35+
add_subdirectory("atan2f")
3236
endif()
3337
add_subdirectory("powi")
3438
add_subdirectory("sincos")
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
2+
#
3+
# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
#
17+
18+
include_directories(${CMAKE_CURRENT_SOURCE_DIR})

# Baseline x86-64 flags/definitions shared by all libpgmath math kernels.
get_property(FLAGS GLOBAL PROPERTY "FLAGS_X8664_L1")
get_property(DEFINITIONS GLOBAL PROPERTY "DEFINITIONS_X8664_L1")

set(SRCS_SCALAR fd_atan_scalar.cpp)
set(SRCS_VECTOR fd_atan_vector.cpp)

list(APPEND DEFINITIONS NDEBUG)
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
  list(APPEND DEFINITIONS _GNU_SOURCE)
endif()

# Scalar kernel: one AVX2 object.
libmath_add_object_library("${SRCS_SCALAR}"
  "${FLAGS} -mtune=core-avx2 -march=core-avx2 -D_CPU=avx2"
  "${DEFINITIONS}"
  "atan-avx2_1")

# Vector kernel, AVX2: one object per vector length (2 and 4 doubles).
foreach(vl IN ITEMS 2 4)
  libmath_add_object_library("${SRCS_VECTOR}"
    "${FLAGS} -mtune=core-avx2 -march=core-avx2 -D_CPU=avx2 -D_VL=${vl}"
    "${DEFINITIONS}"
    "atan-avx2_${vl}")
endforeach()

# Vector kernel, AVX-512: eight doubles per vector.
libmath_add_object_library("${SRCS_VECTOR}"
  "${FLAGS} -mtune=skylake-avx512 -march=skylake-avx512 -D_CPU=avx512 -D_VL=8"
  "${DEFINITIONS}"
  "atan-avx512_8")
Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
2+
/*
3+
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
4+
*
5+
* Licensed under the Apache License, Version 2.0 (the "License");
6+
* you may not use this file except in compliance with the License.
7+
* You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*
17+
*/
18+
19+
#include <common.h>
20+
21+
// Vector double-precision atan(x). The same source is compiled for 2-, 4-
// and 8-lane vectors; `vdouble`/`vopmask` and the v* intrinsics come from
// the helper header selected by _VL at compile time.
vdouble __attribute__((noinline)) atan_d_vec(vdouble const x) {

  // All-ones pattern except the IEEE-754 sign bit; ANDing with it yields |x|.
  // NOTE(review): the pointer cast below type-puns through double — relies on
  // the compiler tolerating the strict-aliasing violation; confirm build flags.
  unsigned long long int AbsMask = 0x7FFFFFFFFFFFFFFF;
  double AbsMask_as_double = *(double *)&AbsMask;

  // f_abs = |x| lane-wise.
  vdouble f_abs = vreinterpret_vd_vm(
      vand_vm_vm_vm(vreinterpret_vm_vd(x),
                    vreinterpret_vm_vd(vcast_vd_d(AbsMask_as_double))));
  // ans_sgn = sign bit of x in each lane (all other bits zero), obtained as
  // |x| XOR x. Used later to build copysign(pi/2, x).
  vdouble ans_sgn = vreinterpret_vd_vm(
      vxor_vm_vm_vm(vreinterpret_vm_vd(f_abs), vreinterpret_vm_vd(x)));

  // Lanes with |x| > 1 are reduced via atan(x) = copysign(pi/2,x) - atan(1/x),
  // so the polynomial is only ever evaluated on arguments in [-1, 1].
  vopmask f_big = vgt_vo_vd_vd(f_abs, vcast_vd_d(1.0));

  vdouble xReduced = vsel_vd_vo_vd_vd(f_big, 1.0/x, x);

  // Powers of the (even) polynomial variable used by the Estrin scheme.
  vdouble x2 = xReduced * xReduced;
  vdouble x4 = x2 * x2;
  vdouble x8 = x4 * x4;
  vdouble x16 = x8 * x8;

  //Convert our polynomial constants into vectors:
  const vdouble D2 = vcast_vd_d(C2);
  const vdouble D3 = vcast_vd_d(C3);
  const vdouble D4 = vcast_vd_d(C4);
  const vdouble D5 = vcast_vd_d(C5);
  const vdouble D6 = vcast_vd_d(C6);
  const vdouble D7 = vcast_vd_d(C7);
  const vdouble D8 = vcast_vd_d(C8);
  const vdouble D9 = vcast_vd_d(C9);
  const vdouble D10 = vcast_vd_d(C10);
  const vdouble D11 = vcast_vd_d(C11);
  const vdouble D12 = vcast_vd_d(C12);
  const vdouble D13 = vcast_vd_d(C13);
  const vdouble D14 = vcast_vd_d(C14);
  const vdouble D15 = vcast_vd_d(C15);
  const vdouble D16 = vcast_vd_d(C16);
  const vdouble D17 = vcast_vd_d(C17);
  const vdouble D18 = vcast_vd_d(C18);
  const vdouble D19 = vcast_vd_d(C19);
  const vdouble D20 = vcast_vd_d(C20);

  // Estrin:
  // We want D2 + x2*(D3 + x2*(D4 + (.....))) = D2 + x2*D3 + x4*D4 + x6*D5 +
  // ... + x36 * D20
  // Estrin trades a few extra multiplies for a shorter dependency chain than
  // Horner, which matters for wide FMA pipelines. Do not reassociate: the
  // exact FMA grouping determines the rounding behavior.

  // First layer of Estrin
  vdouble L1 = vfma_vd_vd_vd_vd(x2, D3, D2);
  vdouble L2 = vfma_vd_vd_vd_vd(x2, D5, D4);
  vdouble L3 = vfma_vd_vd_vd_vd(x2, D7, D6);
  vdouble L4 = vfma_vd_vd_vd_vd(x2, D9, D8);
  vdouble L5 = vfma_vd_vd_vd_vd(x2, D11, D10);
  vdouble L6 = vfma_vd_vd_vd_vd(x2, D13, D12);
  vdouble L7 = vfma_vd_vd_vd_vd(x2, D15, D14);
  vdouble L8 = vfma_vd_vd_vd_vd(x2, D17, D16);
  vdouble L9 = vfma_vd_vd_vd_vd(x2, D19, D18);

  // We now want:
  // L1 + x4*L2 + x8*L3 + x12*L4 + x16*L5 + x20*L6 + x24*L7 + x28*L8 + x32*L9
  // + x36*C20
  // (L1 + x4*L2) + x8*(L3 + x4*L4) + x16*(L5 + x4*L6) + x24*(L7 + x4*L8) +
  // x32(*L9 + x4*C20)

  // Second layer of Estrin
  vdouble M1 = vfma_vd_vd_vd_vd(x4, L2, L1);
  vdouble M2 = vfma_vd_vd_vd_vd(x4, L4, L3);
  vdouble M3 = vfma_vd_vd_vd_vd(x4, L6, L5);
  vdouble M4 = vfma_vd_vd_vd_vd(x4, L8, L7);
  vdouble M5 = vfma_vd_vd_vd_vd(x4, D20, L9);

  // We now want:
  // M1 + x8*M2 + x16*M3 + x24*M4 + x32*M5
  // (M1 + x8*M2) + x16*(M3 + x8*M4 + x16*M5)
  vdouble N1 = vfma_vd_vd_vd_vd(x8, M2, M1);
  vdouble N2 = vfma_vd_vd_vd_vd(x16, M5, M3 + x8 * M4);

  vdouble poly = vfma_vd_vd_vd_vd(x16, N2, N1);

  //This is a copysign(pi/2, x);
  const vdouble signedPi_2 = vreinterpret_vd_vm(vor_vm_vm_vm(
      vreinterpret_vm_vd(vcast_vd_d(PI_2)),
      vreinterpret_vm_vd(ans_sgn)));

  // |x| > 1 lanes: copysign(pi/2,x) - (xReduced + xReduced^3 * poly).
  // Other lanes:   xReduced + xReduced^3 * poly  (odd polynomial in xReduced).
  vdouble result_f_big = vfma_vd_vd_vd_vd( -x2 * xReduced, poly, signedPi_2 - xReduced);
  vdouble result_not_f_big = vfma_vd_vd_vd_vd(x2 * xReduced, poly, xReduced);

  vdouble result = vsel_vd_vo_vd_vd(f_big, result_f_big, result_not_f_big);

  return result;
}
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
2+
/*
3+
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
4+
*
5+
* Licensed under the Apache License, Version 2.0 (the "License");
6+
* you may not use this file except in compliance with the License.
7+
* You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*
17+
*/
18+
19+
// Minimax polynomial coefficients for atan(x) on (0, 1], generated with
// Sollya's fpminimax over the odd powers 1..39 (relative error criterion):
// P = fpminimax(atan(x),
// [|1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39|],
// [|double...|],[0.000000000000001;1.0], floating, relative);
// The degree-1 coefficient is exactly 1.0, so it is not stored; the
// evaluation folds it in as the "+ xReduced" term of the final FMA.
// const vdouble C1 = vcast_vd_d(0x1p0);
const double C2 = -0x1.555555555543fp-2;
const double C3 = 0x1.999999998357fp-3;
const double C4 = -0x1.2492491f82b1ep-3;
const double C5 = 0x1.c71c70986d997p-4;
const double C6 = -0x1.745d01dfeedccp-4;
const double C7 = 0x1.3b12afded14e7p-4;
const double C8 = -0x1.1108885ecb366p-4;
const double C9 = 0x1.e17749a95ee9fp-5;
const double C10 = -0x1.ad2fb9d1c3fc2p-5;
const double C11 = 0x1.7edb66d1f72d7p-5;
const double C12 = -0x1.4f32588ce844dp-5;
const double C13 = 0x1.16f6061fc7091p-5;
const double C14 = -0x1.a6d39bcd1c5d7p-6;
const double C15 = 0x1.15d9a141937d7p-6;
const double C16 = -0x1.2c7c74714ff5p-7;
const double C17 = 0x1.f863b451c4fffp-9;
const double C18 = -0x1.3066efb84f247p-10;
const double C19 = 0x1.d25b20dafefb2p-13;
const double C20 = -0x1.52c1661292134p-16;

// Decimal literal rounding to the double nearest pi/2; used by the
// |x| > 1 argument reduction (atan(x) = copysign(pi/2,x) - atan(1/x)).
#define PI_2 1.57079632679489655799898173427
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
2+
/*
3+
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
4+
*
5+
* Licensed under the Apache License, Version 2.0 (the "License");
6+
* you may not use this file except in compliance with the License.
7+
* You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*
17+
*/
18+
19+
#include <common.h>
#include <immintrin.h>
#include <math.h>

// Bit-compare alternative to the fabs() test below, kept for reference.
// unsigned long int as_ulong(double x){
// return *(unsigned long int*)&x;
// }

// Token-pasting helpers to build the CPU-suffixed exported symbol name.
#define _JOIN2(a,b) a##b
#define JOIN2(a,b) _JOIN2(a,b)

// Exported entry point, e.g. __fd_atan_1_avx2 when built with -D_CPU=avx2.
#define atan_d_scalar JOIN2(__fd_atan_1_,_CPU)
// Fused multiply-add; the polynomial's rounding behavior depends on it.
#define FMA __builtin_fma

extern "C" double atan_d_scalar(double);

// Scalar double-precision atan(x): minimax polynomial on [-1, 1] with the
// reduction atan(x) = copysign(pi/2, x) - atan(1/x) for |x| > 1.
double __attribute__((noinline)) atan_d_scalar(double x) {

  //bool xBig = (as_ulong(fabs(x)) > as_ulong(1.0));
  bool xBig = (fabs(x) > 1.0);

  double xReduced = x;

  // Reduce |x| > 1 into [-1, 1) so the polynomial's domain is respected.
  if (xBig) {
    xReduced = 1.0 / x;
  }

  // We evaluate the polynomial using the Estrin scheme
  // (shorter FMA dependency chains than Horner; do not reassociate).
  double x2 = xReduced * xReduced;
  double x4 = x2 * x2;
  double x8 = x4 * x4;
  double x16 = x8 * x8;

  // First layer of Estrin
  double L1 = FMA(x2, C3, C2);
  double L2 = FMA(x2, C5, C4);
  double L3 = FMA(x2, C7, C6);
  double L4 = FMA(x2, C9, C8);
  double L5 = FMA(x2, C11, C10);
  double L6 = FMA(x2, C13, C12);
  double L7 = FMA(x2, C15, C14);
  double L8 = FMA(x2, C17, C16);
  double L9 = FMA(x2, C19, C18);

  // We now want:
  // L1 + x4*L2 + x8*L3 + x12*L4 + x16*L5 + x20*L6 + x24*L7 + x28*L8 + x32*L9
  // + x36*C20 =
  //(L1 + x4*L2) + x8*(L3 + x4*L4) + x16*(L5 + x4*L6) + x24*(L7 + x4*L8) +
  //x32(*L9 + x4*C20)

  // Second layer of estrin
  double M1 = FMA(x4, L2, L1);
  double M2 = FMA(x4, L4, L3);
  double M3 = FMA(x4, L6, L5);
  double M4 = FMA(x4, L8, L7);
  double M5 = FMA(x4, C20, L9);

  // We now want:
  // M1 + x8*M2 + x16*M3 + x24*M4 + x32*M5
  // (M1 + x8*M2) + x16*(M3 + x8*M4 + x16*M5)

  double N1 = FMA(x8, M2, M1);
  double N2 = FMA(x16, M5, M3 + x8 * M4);

  double poly = FMA(x16, N2, N1);

  // |x| > 1: atan(x) = copysign(pi/2,x) - (xReduced + xReduced^3 * poly).
  if (xBig) {
    const double signedPi = copysign(PI_2, x);

    double result_d = FMA(-x2 * xReduced, poly, (signedPi - xReduced));

    return result_d;
  }

  // |x| <= 1: odd polynomial xReduced + xReduced^3 * poly directly.
  double result_d = FMA(x2 * xReduced, poly, xReduced);

  return result_d;
}
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
2+
/*
3+
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
4+
*
5+
* Licensed under the Apache License, Version 2.0 (the "License");
6+
* you may not use this file except in compliance with the License.
7+
* You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*
17+
*/
18+
19+
20+
// This translation unit is compiled several times by CMake with different
// -D_CPU / -D_VL values; each compile picks the matching SIMD helper header
// and exports a uniquely named vector atan entry point.
#if !(defined _CPU)
#error: please define _CPU - specific suffix to a function name
#endif

#if !(defined _VL)
#error: please define _VL - Number of elements per vector register
#endif


#include <immintrin.h>
#define CONFIG 1
// Select the vector abstraction layer by lane count:
// 2 doubles -> AVX2 128-bit, 4 -> AVX2 256-bit, 8 -> AVX-512 512-bit.
#if ((_VL) == (2))
#include "helperavx2_128.h"
#elif ((_VL) == (4))
#include "helperavx2.h"
#elif ((_VL) == (8))
#include "helperavx512f.h"
#endif


// Token-pasting helpers to build e.g. __fd_atan_4_avx2 from _VL and _CPU.
#define _JOIN4(a,b,c,d) a##b##c##d
#define JOIN4(a,b,c,d) _JOIN4(a,b,c,d)

#define atan_d_vec JOIN4(__fd_atan_,_VL,_,_CPU)

extern "C" vdouble atan_d_vec(vdouble const);

// The shared implementation; it defines atan_d_vec using the types and
// intrinsic wrappers provided by the helper header chosen above.
#include <atan_d_vec.h>

0 commit comments

Comments
 (0)