@@ -135,6 +135,88 @@ void reverse_and_sort_with_cub(std::uint32_t *device_pointer, std::size_t array_

#pragma region Numerics

/**
 * @brief On-device @b Fused-Multiply-Add operator that, for most numeric
 * types, will be lowered to a single PTX instruction on most GPUs.
 */
struct fma_t {
    template <typename scalar_type_>
    inline __device__ scalar_type_ operator()(scalar_type_ a, scalar_type_ b, scalar_type_ c) const noexcept {
        return c + a * b;
    }
};

/**
 * To benchmark matrix-multiplication throughput we could start with
 * a traditional GEMM kernel, fetching data into shared memory, and then
 * running a tiled mat-mul. That, however, may end up benchmarking the L2
 * throughput rather than the ALUs on the device. So we start with a simpler
 * kernel that operates over small tiles of data already in registers.
 */
template <typename input_type_, typename output_type_, int matrix_side_, int repetitions_,
          typename fma_operator_ = fma_t>
__device__ void tops_fma_cuda_kernel() {

    // In-register arrays, all allocated as local variables and left
    // uninitialized, as only the arithmetic throughput is measured
    input_type_ a_tile[matrix_side_][matrix_side_], b_tile[matrix_side_][matrix_side_];
    output_type_ c_tile[matrix_side_][matrix_side_];

    // Repeatedly perform FMA-like operations
    fma_operator_ fma_operator;
    for (int r = 0; r < repetitions_; ++r) {
        for (int i = 0; i < matrix_side_; ++i)
            for (int j = 0; j < matrix_side_; ++j)
                for (int k = 0; k < matrix_side_; ++k)
                    c_tile[i][j] = fma_operator(a_tile[i][k], b_tile[k][j], c_tile[i][j]);
    }

    // Prevent dead-code elimination by writing one result out
    if (threadIdx.x == 0 && blockIdx.x == 0) {
        volatile output_type_ sink = c_tile[0][0]; // A dummy volatile store should be enough
        (void)sink;
    }
}
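
/**
 * To interpret the timings, it helps to spell out how many operations one
 * launch of the kernel above performs. The helper below is a hypothetical
 * illustration, not part of the benchmark itself: each thread executes
 * matrix_side_^3 FMAs per repetition, and each FMA is conventionally
 * counted as two operations.
 */
constexpr std::size_t fma_ops_per_thread(std::size_t matrix_side, std::size_t repetitions) noexcept {
    // 2 ops per FMA * side^3 FMAs per repetition * number of repetitions
    return 2ull * matrix_side * matrix_side * matrix_side * repetitions;
}
// For the 16x16x16 tiles and 128 repetitions instantiated below:
// 2 * 16^3 * 128 = 1,048,576 operations per thread.
static_assert(fma_ops_per_thread(16, 128) == 1'048'576, "per-thread op count");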

__global__ void tops_f32f32_sm60fma_16x16x16_loop128_cuda_kernel() { tops_fma_cuda_kernel<float, float, 16, 128>(); }

__global__ void tops_f64f64_sm60fma_16x16x16_loop128_cuda_kernel() { tops_fma_cuda_kernel<double, double, 16, 128>(); }

__global__ void tops_f16f16_sm70fma_16x16x16_loop128_cuda_kernel() {
#if (__CUDA_ARCH__ >= 700)
    struct f16_fma_t {
        inline __device__ half operator()(half a, half b, half c) const noexcept { return __hfma(a, b, c); }
    };
    tops_fma_cuda_kernel<half, half, 16, 128, f16_fma_t>();
#endif
}

__global__ void tops_bf16bf16_sm80fma_16x16x16_loop128_cuda_kernel() {
#if (__CUDA_ARCH__ >= 800)
    struct bf16_fma_t {
        inline __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b, __nv_bfloat16 c) const noexcept {
            return __hfma(a, b, c);
        }
    };
    tops_fma_cuda_kernel<__nv_bfloat16, __nv_bfloat16, 16, 128, bf16_fma_t>();
#endif
}

/**
 * Aside from floating-point numbers, similar operations are often performed
 * on integer inputs. While historically graphics cards struggled with those,
 * today they have outstanding performance and can be used in a variety of
 * @b combinatorial problems, from encryption and Ethereum mining to Graph
 * processing, Integer Programming, and Bioinformatics, as well as the more
 * mainstream @b AI-Inference of quantized models.
 */
__global__ void tops_i32i32_sm60fma_16x16x16_loop128_cuda_kernel() {
    tops_fma_cuda_kernel<std::int32_t, std::int32_t, 16, 128>();
}

__global__ void tops_i64i64_sm60fma_16x16x16_loop128_cuda_kernel() {
    tops_fma_cuda_kernel<std::int64_t, std::int64_t, 16, 128>();
}

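/**
 * A minimal host-side sketch of how one of these kernels could be timed and
 * turned into a TOPS estimate. The grid and block dimensions, the function
 * name, and the two-operations-per-FMA accounting are illustrative
 * assumptions, not something the kernels above prescribe.
 */
inline double tops_f32_fma_estimate(int blocks = 1024, int threads_per_block = 256) {
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // Time a single launch of the FP32 FMA kernel with CUDA events
    cudaEventRecord(start);
    tops_f32f32_sm60fma_16x16x16_loop128_cuda_kernel<<<blocks, threads_per_block>>>();
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    // Each thread performs 16^3 FMAs per repetition over 128 repetitions,
    // counting every FMA as 2 operations
    double ops_per_thread = 2.0 * 16 * 16 * 16 * 128;
    double total_ops = ops_per_thread * blocks * threads_per_block;
    return total_ops / (milliseconds * 1e-3) / 1e12; // Tera-ops per second
}
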
/**
 * Starting with Nvidia Volta GPUs, specialized "Tensor Cores" @b (TC) are
 * added for faster matrix multiplications. These Tensor Cores are much faster