|
| 1 | +// RUN: %clangxx_cilksan -std=c++20 -fopencilk -O3 -g %s -o %t |
| 2 | +// RUN: %run %t 2>&1 | FileCheck %s --check-prefixes=CHECK,CILKSAN |
| 3 | +#include <array> |
| 4 | +#include <cstddef> |
| 5 | +#include <cstdint> |
| 6 | +#include <cstdio> |
| 7 | +#include <random> |
| 8 | + |
| 9 | +#include <cilk/cilk.h> |
| 10 | + |
| 11 | +struct Scalar { |
| 12 | + constexpr static uint64_t PRIME = 0xffffffff00000001ull; |
| 13 | + |
| 14 | + // Scalar() { /* Deliberately skip initialization of _raw to allow more |
| 15 | + // aggresive optimization */ |
| 16 | + // } |
| 17 | + |
| 18 | + Scalar() = default; |
| 19 | + |
| 20 | + explicit Scalar(uint64_t raw) : _raw{raw} {} |
| 21 | + |
| 22 | + explicit operator uint64_t() const { return _raw; } |
| 23 | + |
| 24 | + auto operator+(Scalar other) const -> Scalar { |
| 25 | + const uint64_t sum = _raw + other._raw; |
| 26 | + const Scalar ret{ |
| 27 | + sum < _raw || sum < other._raw || sum >= PRIME ? sum - PRIME : sum}; |
| 28 | + return ret; |
| 29 | + } |
| 30 | + |
| 31 | + auto operator-(Scalar other) const -> Scalar { |
| 32 | + const uint64_t diff = _raw - other._raw; |
| 33 | + const Scalar ret{(diff > _raw) ? diff + PRIME : diff}; |
| 34 | + return ret; |
| 35 | + } |
| 36 | + |
| 37 | + auto operator*(Scalar other) const -> Scalar { |
| 38 | + // Start by carrying out an ordinary 64x64->128 bit multiplication |
| 39 | + const uint32_t a0 = _raw, a1 = _raw >> 32; |
| 40 | + const uint32_t b0 = other._raw, b1 = other._raw >> 32; |
| 41 | + const uint64_t p0 = static_cast<uint64_t>(a0) * b0, |
| 42 | + p1 = static_cast<uint64_t>(a0) * b1, |
| 43 | + p2 = static_cast<uint64_t>(a1) * b0, |
| 44 | + p3 = static_cast<uint64_t>(a1) * b1; |
| 45 | + const uint32_t cy = ((p0 >> 32) + static_cast<uint32_t>(p1) + |
| 46 | + static_cast<uint32_t>(p2)) >> |
| 47 | + 32; |
| 48 | + const uint64_t x = p0 + (p1 << 32) + (p2 << 32), |
| 49 | + y = p3 + (p1 >> 32) + (p2 >> 32) + cy; |
| 50 | + // Store result in 4 32-bit words |
| 51 | + const uint32_t c0 = x, c1 = x >> 32, c2 = y, c3 = y >> 32; |
| 52 | + // Now perform reduction: modulus is phi^2 - phi + 1 where phi = 2^32 |
| 53 | + // ab = c0 + c1*phi + c2*phi^2 + c3*phi^3 |
| 54 | + // Exploit phi^2 = phi-1 and phi^3 = phi * (phi-1) = (phi-1) - phi = -1 |
| 55 | + // ab = c0 + c1*phi + c2*(phi-1) - c3 |
| 56 | + // = (c0-c2-c3) + (c1+c2)*phi |
| 57 | + const Scalar ret = (Scalar(c0) - Scalar(c2) - Scalar(c3)) + |
| 58 | + (Scalar(static_cast<uint64_t>(c1) << 32) + |
| 59 | + Scalar(static_cast<uint64_t>(c2) << 32)); |
| 60 | + return ret; |
| 61 | + } |
| 62 | + |
| 63 | + auto operator==(Scalar other) const -> bool { return _raw == other._raw; } |
| 64 | + |
| 65 | + auto operator!=(Scalar other) const -> bool { return _raw != other._raw; } |
| 66 | + |
| 67 | + // No comparison operators as they make no sense for finite fields |
| 68 | + |
| 69 | + auto operator+=(Scalar other) -> Scalar & { return *this = *this + other; } |
| 70 | + |
| 71 | + auto operator-=(Scalar other) -> Scalar & { return *this = *this - other; } |
| 72 | + |
| 73 | + auto operator*=(Scalar other) -> Scalar & { return *this = *this * other; } |
| 74 | + |
| 75 | + template <typename RNG> inline static auto random(RNG &rng) -> Scalar { |
| 76 | + static std::uniform_int_distribution<uint64_t> dist(0, PRIME - 1); |
| 77 | + return Scalar{dist(rng)}; |
| 78 | + } |
| 79 | + |
| 80 | + auto is_valid() const -> bool { return _raw < PRIME; } |
| 81 | + |
| 82 | + private: |
| 83 | + uint64_t _raw; |
| 84 | +}; |
| 85 | + |
| 86 | +static inline void zero_scalar(void *view) { |
| 87 | + *reinterpret_cast<Scalar *>(view) = Scalar{0}; |
| 88 | +} |
| 89 | + |
| 90 | +static inline void add_scalar(void *left, void *right) { |
| 91 | + *reinterpret_cast<Scalar *>(left) += *reinterpret_cast<Scalar *>(right); |
| 92 | +} |
| 93 | + |
| 94 | +using ScalarAddReducer = Scalar cilk_reducer(zero_scalar, add_scalar); |
| 95 | + |
| 96 | +auto reduce_with_cilk(Scalar **as, Scalar **bs, Scalar *c, Scalar *coeffs, |
| 97 | + size_t n, size_t m) -> std::array<Scalar, 3> { |
| 98 | + ScalarAddReducer p0{0}, p2{0}, p3{0}; |
| 99 | + cilk_for (size_t i = 0; i < n; i++) { |
| 100 | + // Obtain dense representations of the polynomials |
| 101 | + const Scalar *a = as[i]; |
| 102 | + const Scalar *b = bs[i]; |
| 103 | + const size_t half = m / 2; |
| 104 | + |
| 105 | + ScalarAddReducer lp0{0}, lp2{0}, lp3{0}; |
| 106 | + cilk_for (size_t j = 0; j < half; j++) { |
| 107 | + lp0 += a[j] * b[j] * c[j]; |
| 108 | + const Scalar a2 = a[j + half] + a[j + half] - a[j], |
| 109 | + b2 = b[j + half] + b[j + half] - b[j], |
| 110 | + c2 = c[j + half] + c[j + half] - c[j]; |
| 111 | + lp2 += a2 * b2 * c2; |
| 112 | + const Scalar a3 = a2 + a[j + half] - a[j], |
| 113 | + b3 = b2 + b[j + half] - b[j], |
| 114 | + c3 = c2 + c[j + half] - c[j]; |
| 115 | + lp3 += a3 * b3 * c3; |
| 116 | + } |
| 117 | + p0 += coeffs[i] * lp0; |
| 118 | + p2 += coeffs[i] * lp2; |
| 119 | + p3 += coeffs[i] * lp3; |
| 120 | + } |
| 121 | + return {p0, p2, p3}; |
| 122 | +} |
| 123 | + |
| 124 | +auto reduce_serial(Scalar **as, Scalar **bs, Scalar *c, Scalar *coeffs, |
| 125 | + size_t n, size_t m) -> std::array<Scalar, 3> { |
| 126 | + Scalar p0{0}, p2{0}, p3{0}; |
| 127 | + for (size_t i = 0; i < n; i++) { |
| 128 | + // Obtain dense representations of the polynomials |
| 129 | + const Scalar *a = as[i]; |
| 130 | + const Scalar *b = bs[i]; |
| 131 | + const size_t half = m / 2; |
| 132 | + |
| 133 | + Scalar lp0{0}, lp2{0}, lp3{0}; |
| 134 | + for (size_t j = 0; j < half; j++) { |
| 135 | + lp0 += a[j] * b[j] * c[j]; |
| 136 | + const Scalar a2 = a[j + half] + a[j + half] - a[j], |
| 137 | + b2 = b[j + half] + b[j + half] - b[j], |
| 138 | + c2 = c[j + half] + c[j + half] - c[j]; |
| 139 | + lp2 += a2 * b2 * c2; |
| 140 | + const Scalar a3 = a2 + a[j + half] - a[j], |
| 141 | + b3 = b2 + b[j + half] - b[j], |
| 142 | + c3 = c2 + c[j + half] - c[j]; |
| 143 | + lp3 += a3 * b3 * c3; |
| 144 | + } |
| 145 | + |
| 146 | + p0 += coeffs[i] * lp0; |
| 147 | + p2 += coeffs[i] * lp2; |
| 148 | + p3 += coeffs[i] * lp3; |
| 149 | + } |
| 150 | + return {p0, p2, p3}; |
| 151 | +} |
| 152 | + |
| 153 | +auto main() -> int { |
| 154 | + const size_t N = 12; |
| 155 | + const size_t M = 128; |
| 156 | + |
| 157 | + std::mt19937_64 rng{42}; |
| 158 | + |
| 159 | + Scalar *as[N]; |
| 160 | + Scalar *bs[N]; |
| 161 | + Scalar c[M]; |
| 162 | + Scalar coeffs[N]; |
| 163 | + |
| 164 | + for (size_t i = 0; i < N; i++) { |
| 165 | + as[i] = new Scalar[M]; |
| 166 | + bs[i] = new Scalar[M]; |
| 167 | + coeffs[i] = Scalar::random(rng); |
| 168 | + for (size_t j = 0; j < M; j++) { |
| 169 | + as[i][j] = Scalar::random(rng); |
| 170 | + bs[i][j] = Scalar::random(rng); |
| 171 | + } |
| 172 | + } |
| 173 | + for (size_t i = 0; i < M; i++) { |
| 174 | + c[i] = Scalar::random(rng); |
| 175 | + } |
| 176 | + |
| 177 | + const auto res_cilk = reduce_with_cilk(as, bs, c, coeffs, N, M); |
| 178 | + const auto res_serial = reduce_serial(as, bs, c, coeffs, N, M); |
| 179 | + if (res_cilk != res_serial) { |
| 180 | + printf("res_cilk = %lu, %lu, %lu\n", static_cast<uint64_t>(res_cilk[0]), |
| 181 | + static_cast<uint64_t>(res_cilk[1]), |
| 182 | + static_cast<uint64_t>(res_cilk[2])); |
| 183 | + printf("res_serial = %lu, %lu, %lu\n", |
| 184 | + static_cast<uint64_t>(res_serial[0]), |
| 185 | + static_cast<uint64_t>(res_serial[1]), |
| 186 | + static_cast<uint64_t>(res_serial[2])); |
| 187 | + } |
| 188 | + for (size_t i = 0; i < N; i++) { |
| 189 | + delete[] as[i]; |
| 190 | + delete[] bs[i]; |
| 191 | + } |
| 192 | + return 0; |
| 193 | +} |
| 194 | + |
| 195 | +// NOLINTEND |
| 196 | + |
| 197 | +// CHECK-NOT: res_cilk = |
| 198 | +// CHECK-NOT: res_serial = |
| 199 | + |
| 200 | +// CILKSAN: Cilksan detected 0 distinct races. |
| 201 | +// CILKSAN-NEXT: Cilksan suppressed 0 duplicate race reports. |
0 commit comments