Skip to content

Commit 98773a5

Browse files
committed
Add new optimized intrinsic atan functions for x86
Add optimized atan and atan2 implementations for x86, covering single and double precision in scalar, vec128, vec256, and vec512 variants.
1 parent 3d6bee9 commit 98773a5

23 files changed

+1607
-46
lines changed

runtime/libpgmath/lib/common/CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,10 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
2929
add_subdirectory("log10")
3030
add_subdirectory("log10f")
3131
add_subdirectory("logf")
32+
add_subdirectory("atan")
33+
add_subdirectory("atanf")
34+
add_subdirectory("atan2")
35+
add_subdirectory("atan2f")
3236
endif()
3337
add_subdirectory("powi")
3438
add_subdirectory("sincos")
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
2+
#
3+
# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
#
17+
18+
include_directories(${CMAKE_CURRENT_SOURCE_DIR})

# Baseline x86-64 flags/definitions shared by all libpgmath math kernels.
get_property(FLAGS GLOBAL PROPERTY "FLAGS_X8664_L1")
get_property(DEFINITIONS GLOBAL PROPERTY "DEFINITIONS_X8664_L1")

set(SRCS_SCALAR fd_atan_scalar.cpp)
set(SRCS_VECTOR fd_atan_vector.cpp)

list(APPEND DEFINITIONS NDEBUG)
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
  list(APPEND DEFINITIONS _GNU_SOURCE)
endif()

# Scalar kernel: one AVX2 object.
libmath_add_object_library("${SRCS_SCALAR}"
  "${FLAGS} -mtune=core-avx2 -march=core-avx2 -D_CPU=avx2"
  "${DEFINITIONS}"
  "atan-avx2_1")

# Vector kernel, AVX2: one object per vector length (2 and 4 doubles).
foreach(vl IN ITEMS 2 4)
  libmath_add_object_library("${SRCS_VECTOR}"
    "${FLAGS} -mtune=core-avx2 -march=core-avx2 -D_CPU=avx2 -D_VL=${vl}"
    "${DEFINITIONS}"
    "atan-avx2_${vl}")
endforeach()

# Vector kernel, AVX-512: eight doubles per vector.
libmath_add_object_library("${SRCS_VECTOR}"
  "${FLAGS} -mtune=skylake-avx512 -march=skylake-avx512 -D_CPU=avx512 -D_VL=8"
  "${DEFINITIONS}"
  "atan-avx512_8")
Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
2+
/*
3+
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
4+
*
5+
* Licensed under the Apache License, Version 2.0 (the "License");
6+
* you may not use this file except in compliance with the License.
7+
* You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*
17+
*/
18+
19+
#include <common.h>
20+
21+
// Vector double-precision atan(x). The same source is compiled for 2-, 4-
// and 8-lane vectors; `vdouble`/`vopmask` and the v* intrinsics come from
// the helper header selected by _VL at compile time.
vdouble __attribute__((noinline)) atan_d_vec(vdouble const x) {

  // All-ones pattern except the IEEE-754 sign bit; ANDing with it yields |x|.
  // NOTE(review): the pointer cast below type-puns through double — relies on
  // the compiler tolerating the strict-aliasing violation; confirm build flags.
  unsigned long long int AbsMask = 0x7FFFFFFFFFFFFFFF;
  double AbsMask_as_double = *(double *)&AbsMask;

  // f_abs = |x| lane-wise.
  vdouble f_abs = vreinterpret_vd_vm(
      vand_vm_vm_vm(vreinterpret_vm_vd(x),
                    vreinterpret_vm_vd(vcast_vd_d(AbsMask_as_double))));
  // ans_sgn = sign bit of x in each lane (all other bits zero), obtained as
  // |x| XOR x. Used later to build copysign(pi/2, x).
  vdouble ans_sgn = vreinterpret_vd_vm(
      vxor_vm_vm_vm(vreinterpret_vm_vd(f_abs), vreinterpret_vm_vd(x)));

  // Lanes with |x| > 1 are reduced via atan(x) = copysign(pi/2,x) - atan(1/x),
  // so the polynomial is only ever evaluated on arguments in [-1, 1].
  vopmask f_big = vgt_vo_vd_vd(f_abs, vcast_vd_d(1.0));

  vdouble xReduced = vsel_vd_vo_vd_vd(f_big, 1.0/x, x);

  // Powers of the (even) polynomial variable used by the Estrin scheme.
  vdouble x2 = xReduced * xReduced;
  vdouble x4 = x2 * x2;
  vdouble x8 = x4 * x4;
  vdouble x16 = x8 * x8;

  //Convert our polynomial constants into vectors:
  const vdouble D2 = vcast_vd_d(C2);
  const vdouble D3 = vcast_vd_d(C3);
  const vdouble D4 = vcast_vd_d(C4);
  const vdouble D5 = vcast_vd_d(C5);
  const vdouble D6 = vcast_vd_d(C6);
  const vdouble D7 = vcast_vd_d(C7);
  const vdouble D8 = vcast_vd_d(C8);
  const vdouble D9 = vcast_vd_d(C9);
  const vdouble D10 = vcast_vd_d(C10);
  const vdouble D11 = vcast_vd_d(C11);
  const vdouble D12 = vcast_vd_d(C12);
  const vdouble D13 = vcast_vd_d(C13);
  const vdouble D14 = vcast_vd_d(C14);
  const vdouble D15 = vcast_vd_d(C15);
  const vdouble D16 = vcast_vd_d(C16);
  const vdouble D17 = vcast_vd_d(C17);
  const vdouble D18 = vcast_vd_d(C18);
  const vdouble D19 = vcast_vd_d(C19);
  const vdouble D20 = vcast_vd_d(C20);

  // Estrin:
  // We want D2 + x2*(D3 + x2*(D4 + (.....))) = D2 + x2*D3 + x4*D4 + x6*D5 +
  // ... + x36 * D20
  // Estrin trades a few extra multiplies for a shorter dependency chain than
  // Horner, which matters for wide FMA pipelines. Do not reassociate: the
  // exact FMA grouping determines the rounding behavior.

  // First layer of Estrin
  vdouble L1 = vfma_vd_vd_vd_vd(x2, D3, D2);
  vdouble L2 = vfma_vd_vd_vd_vd(x2, D5, D4);
  vdouble L3 = vfma_vd_vd_vd_vd(x2, D7, D6);
  vdouble L4 = vfma_vd_vd_vd_vd(x2, D9, D8);
  vdouble L5 = vfma_vd_vd_vd_vd(x2, D11, D10);
  vdouble L6 = vfma_vd_vd_vd_vd(x2, D13, D12);
  vdouble L7 = vfma_vd_vd_vd_vd(x2, D15, D14);
  vdouble L8 = vfma_vd_vd_vd_vd(x2, D17, D16);
  vdouble L9 = vfma_vd_vd_vd_vd(x2, D19, D18);

  // We now want:
  // L1 + x4*L2 + x8*L3 + x12*L4 + x16*L5 + x20*L6 + x24*L7 + x28*L8 + x32*L9
  // + x36*C20
  // (L1 + x4*L2) + x8*(L3 + x4*L4) + x16*(L5 + x4*L6) + x24*(L7 + x4*L8) +
  // x32(*L9 + x4*C20)

  // Second layer of Estrin
  vdouble M1 = vfma_vd_vd_vd_vd(x4, L2, L1);
  vdouble M2 = vfma_vd_vd_vd_vd(x4, L4, L3);
  vdouble M3 = vfma_vd_vd_vd_vd(x4, L6, L5);
  vdouble M4 = vfma_vd_vd_vd_vd(x4, L8, L7);
  vdouble M5 = vfma_vd_vd_vd_vd(x4, D20, L9);

  // We now want:
  // M1 + x8*M2 + x16*M3 + x24*M4 + x32*M5
  // (M1 + x8*M2) + x16*(M3 + x8*M4 + x16*M5)
  vdouble N1 = vfma_vd_vd_vd_vd(x8, M2, M1);
  vdouble N2 = vfma_vd_vd_vd_vd(x16, M5, M3 + x8 * M4);

  vdouble poly = vfma_vd_vd_vd_vd(x16, N2, N1);

  //This is a copysign(pi/2, x);
  const vdouble signedPi_2 = vreinterpret_vd_vm(vor_vm_vm_vm(
      vreinterpret_vm_vd(vcast_vd_d(PI_2)),
      vreinterpret_vm_vd(ans_sgn)));

  // |x| > 1 lanes: copysign(pi/2,x) - (xReduced + xReduced^3 * poly).
  // Other lanes:   xReduced + xReduced^3 * poly  (odd polynomial in xReduced).
  vdouble result_f_big = vfma_vd_vd_vd_vd( -x2 * xReduced, poly, signedPi_2 - xReduced);
  vdouble result_not_f_big = vfma_vd_vd_vd_vd(x2 * xReduced, poly, xReduced);

  vdouble result = vsel_vd_vo_vd_vd(f_big, result_f_big, result_not_f_big);

  return result;
}
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
2+
/*
3+
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
4+
*
5+
* Licensed under the Apache License, Version 2.0 (the "License");
6+
* you may not use this file except in compliance with the License.
7+
* You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*
17+
*/
18+
19+
// Minimax polynomial coefficients for atan(x) on (0, 1], generated with
// Sollya's fpminimax over the odd powers 1..39 (relative error criterion):
// P = fpminimax(atan(x),
// [|1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39|],
// [|double...|],[0.000000000000001;1.0], floating, relative);
// The degree-1 coefficient is exactly 1.0, so it is not stored; the
// evaluation folds it in as the "+ xReduced" term of the final FMA.
// const vdouble C1 = vcast_vd_d(0x1p0);
const double C2 = -0x1.555555555543fp-2;
const double C3 = 0x1.999999998357fp-3;
const double C4 = -0x1.2492491f82b1ep-3;
const double C5 = 0x1.c71c70986d997p-4;
const double C6 = -0x1.745d01dfeedccp-4;
const double C7 = 0x1.3b12afded14e7p-4;
const double C8 = -0x1.1108885ecb366p-4;
const double C9 = 0x1.e17749a95ee9fp-5;
const double C10 = -0x1.ad2fb9d1c3fc2p-5;
const double C11 = 0x1.7edb66d1f72d7p-5;
const double C12 = -0x1.4f32588ce844dp-5;
const double C13 = 0x1.16f6061fc7091p-5;
const double C14 = -0x1.a6d39bcd1c5d7p-6;
const double C15 = 0x1.15d9a141937d7p-6;
const double C16 = -0x1.2c7c74714ff5p-7;
const double C17 = 0x1.f863b451c4fffp-9;
const double C18 = -0x1.3066efb84f247p-10;
const double C19 = 0x1.d25b20dafefb2p-13;
const double C20 = -0x1.52c1661292134p-16;

// Decimal literal rounding to the double nearest pi/2; used by the
// |x| > 1 argument reduction (atan(x) = copysign(pi/2,x) - atan(1/x)).
#define PI_2 1.57079632679489655799898173427
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
2+
/*
3+
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
4+
*
5+
* Licensed under the Apache License, Version 2.0 (the "License");
6+
* you may not use this file except in compliance with the License.
7+
* You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*
17+
*/
18+
19+
#include <common.h>
#include <immintrin.h>
#include <math.h>

// Bit-compare alternative to the fabs() test below, kept for reference.
// unsigned long int as_ulong(double x){
// return *(unsigned long int*)&x;
// }

// Token-pasting helpers to build the CPU-suffixed exported symbol name.
#define _JOIN2(a,b) a##b
#define JOIN2(a,b) _JOIN2(a,b)

// Exported entry point, e.g. __fd_atan_1_avx2 when built with -D_CPU=avx2.
#define atan_d_scalar JOIN2(__fd_atan_1_,_CPU)
// Fused multiply-add; the polynomial's rounding behavior depends on it.
#define FMA __builtin_fma

extern "C" double atan_d_scalar(double);

// Scalar double-precision atan(x): minimax polynomial on [-1, 1] with the
// reduction atan(x) = copysign(pi/2, x) - atan(1/x) for |x| > 1.
double __attribute__((noinline)) atan_d_scalar(double x) {

  //bool xBig = (as_ulong(fabs(x)) > as_ulong(1.0));
  bool xBig = (fabs(x) > 1.0);

  double xReduced = x;

  // Reduce |x| > 1 into [-1, 1) so the polynomial's domain is respected.
  if (xBig) {
    xReduced = 1.0 / x;
  }

  // We evaluate the polynomial using the Estrin scheme
  // (shorter FMA dependency chains than Horner; do not reassociate).
  double x2 = xReduced * xReduced;
  double x4 = x2 * x2;
  double x8 = x4 * x4;
  double x16 = x8 * x8;

  // First layer of Estrin
  double L1 = FMA(x2, C3, C2);
  double L2 = FMA(x2, C5, C4);
  double L3 = FMA(x2, C7, C6);
  double L4 = FMA(x2, C9, C8);
  double L5 = FMA(x2, C11, C10);
  double L6 = FMA(x2, C13, C12);
  double L7 = FMA(x2, C15, C14);
  double L8 = FMA(x2, C17, C16);
  double L9 = FMA(x2, C19, C18);

  // We now want:
  // L1 + x4*L2 + x8*L3 + x12*L4 + x16*L5 + x20*L6 + x24*L7 + x28*L8 + x32*L9
  // + x36*C20 =
  //(L1 + x4*L2) + x8*(L3 + x4*L4) + x16*(L5 + x4*L6) + x24*(L7 + x4*L8) +
  //x32(*L9 + x4*C20)

  // Second layer of estrin
  double M1 = FMA(x4, L2, L1);
  double M2 = FMA(x4, L4, L3);
  double M3 = FMA(x4, L6, L5);
  double M4 = FMA(x4, L8, L7);
  double M5 = FMA(x4, C20, L9);

  // We now want:
  // M1 + x8*M2 + x16*M3 + x24*M4 + x32*M5
  // (M1 + x8*M2) + x16*(M3 + x8*M4 + x16*M5)

  double N1 = FMA(x8, M2, M1);
  double N2 = FMA(x16, M5, M3 + x8 * M4);

  double poly = FMA(x16, N2, N1);

  // |x| > 1: atan(x) = copysign(pi/2,x) - (xReduced + xReduced^3 * poly).
  if (xBig) {
    const double signedPi = copysign(PI_2, x);

    double result_d = FMA(-x2 * xReduced, poly, (signedPi - xReduced));

    return result_d;
  }

  // |x| <= 1: odd polynomial xReduced + xReduced^3 * poly directly.
  double result_d = FMA(x2 * xReduced, poly, xReduced);

  return result_d;
}
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
2+
/*
3+
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
4+
*
5+
* Licensed under the Apache License, Version 2.0 (the "License");
6+
* you may not use this file except in compliance with the License.
7+
* You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*
17+
*/
18+
19+
20+
// This translation unit is compiled several times by CMake with different
// -D_CPU / -D_VL values; each compile picks the matching SIMD helper header
// and exports a uniquely named vector atan entry point.
#if !(defined _CPU)
#error: please define _CPU - specific suffix to a function name
#endif

#if !(defined _VL)
#error: please define _VL - Number of elements per vector register
#endif


#include <immintrin.h>
#define CONFIG 1
// Select the vector abstraction layer by lane count:
// 2 doubles -> AVX2 128-bit, 4 -> AVX2 256-bit, 8 -> AVX-512 512-bit.
#if ((_VL) == (2))
#include "helperavx2_128.h"
#elif ((_VL) == (4))
#include "helperavx2.h"
#elif ((_VL) == (8))
#include "helperavx512f.h"
#endif


// Token-pasting helpers to build e.g. __fd_atan_4_avx2 from _VL and _CPU.
#define _JOIN4(a,b,c,d) a##b##c##d
#define JOIN4(a,b,c,d) _JOIN4(a,b,c,d)

#define atan_d_vec JOIN4(__fd_atan_,_VL,_,_CPU)

extern "C" vdouble atan_d_vec(vdouble const);

// The shared implementation; it defines atan_d_vec using the types and
// intrinsic wrappers provided by the helper header chosen above.
#include <atan_d_vec.h>

0 commit comments

Comments
 (0)