File tree Expand file tree Collapse file tree 2 files changed +24
-0
lines changed Expand file tree Collapse file tree 2 files changed +24
-0
lines changed Original file line number Diff line number Diff line change 1919 " Cawley" ,
2020 " cBLAS" ,
2121 " CCCL" ,
22+ " Chebyshev" ,
2223 " chriskohlhoffasio" ,
2324 " clflush" ,
2425 " colsb" ,
Original file line number Diff line number Diff line change @@ -1230,6 +1230,29 @@ BENCHMARK(f64_sin_maclaurin_with_fast_math)->Iterations(1e7);
12301230#pragma float_control(pop)
12311231#endif
12321232
1233+ /* *
1234+ * Alternatively, we can just manually rearrange the arithmetic operations
1235+ * using Horner's method for the same polynomial, which is only one of many
1236+ * possible approximation schemes.
1237+ *
1238+ * @see Horner's method: https://en.wikipedia.org/wiki/Horner%27s_method
1239+ * @see Chebyshev polynomials: https://en.wikipedia.org/wiki/Chebyshev_polynomials
1240+ */
1241+
1242+ static void f64_sin_maclaurin_with_horner (bm::State &state) {
1243+ double argument = -M_PI_2, step = M_PI / 1e7 , result = 0 ;
1244+ constexpr double reciprocal_6 = 1.0 / 6.0 ;
1245+ constexpr double reciprocal_120 = 1.0 / 120.0 ;
1246+ for (auto _ : state) {
1247+ argument += step;
1248+ result = argument * (1.0 - (argument * argument) * (reciprocal_6 - (argument * argument) * reciprocal_120));
1249+ bm::DoNotOptimize (result);
1250+ }
1251+ state.SetBytesProcessed (state.iterations () * sizeof (double ));
1252+ }
1253+
1254+ BENCHMARK (f64_sin_maclaurin_with_horner)->Iterations(1e7 );
1255+
12331256/* *
12341257 * Result: latency of @b 0.8ns - almost @b 40x faster than the standard
12351258 * on Intel, but on Arm the result remained unchanged, the same @b 1.1ns.
You can’t perform that action at this time.
0 commit comments