Skip to content

Commit cab8824

Browse files
committed
Improve: Horner method
1 parent 56016d5 commit cab8824

File tree

2 files changed

+24
-0
lines changed

2 files changed

+24
-0
lines changed

.vscode/settings.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
"Cawley",
2020
"cBLAS",
2121
"CCCL",
22+
"Chebyshev",
2223
"chriskohlhoffasio",
2324
"clflush",
2425
"colsb",

less_slow.cpp

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1230,6 +1230,29 @@ BENCHMARK(f64_sin_maclaurin_with_fast_math)->Iterations(1e7);
12301230
#pragma float_control(pop)
12311231
#endif
12321232

1233+
/**
1234+
* Alternatively, we can manually rearrange the arithmetic operations using
1235+
* Horner's method to evaluate the same Maclaurin polynomial, which is itself
1236+
* only one of many possible approximation schemes.
1237+
*
1238+
* @see Horner's method: https://en.wikipedia.org/wiki/Horner%27s_method
1239+
* @see Chebyshev polynomials: https://en.wikipedia.org/wiki/Chebyshev_polynomials
1240+
*/
1241+
1242+
static void f64_sin_maclaurin_with_horner(bm::State &state) {
1243+
double argument = -M_PI_2, step = M_PI / 1e7, result = 0;
1244+
constexpr double reciprocal_6 = 1.0 / 6.0;
1245+
constexpr double reciprocal_120 = 1.0 / 120.0;
1246+
for (auto _ : state) {
1247+
argument += step;
1248+
result = argument * (1.0 - (argument * argument) * (reciprocal_6 - (argument * argument) * reciprocal_120));
1249+
bm::DoNotOptimize(result);
1250+
}
1251+
state.SetBytesProcessed(state.iterations() * sizeof(double));
1252+
}
1253+
1254+
BENCHMARK(f64_sin_maclaurin_with_horner)->Iterations(1e7);
1255+
12331256
/**
12341257
* Result: latency of @b 0.8ns - almost @b 40x faster than the standard
12351258
* on Intel, but on Arm the result remained unchanged, the same @b 1.1ns.

0 commit comments

Comments
 (0)