@@ -33,9 +33,9 @@ struct ThreadData {
3333 std::vector<double , xsimd::aligned_allocator<double >> flush_buffer;
3434};
3535
36- std::size_t simd_width () { return batch_type::size; }
36+ constexpr std::size_t simd_width () { return batch_type::size; }
3737
38- std::string_view trim (std::string_view value) {
38+ std::string_view trim (std::string_view value) {
3939 while (!value.empty () &&
4040 std::isspace (static_cast <unsigned char >(value.front ()))) {
4141 value.remove_prefix (1 );
@@ -246,16 +246,19 @@ std::string format_megabytes(std::size_t bytes) {
246246 return oss.str ();
247247}
248248
249- double finalize_thread_sums (const std::vector<batch_type> &lane_sums,
250- const std::vector<double > &tail_sums) {
251- batch_type total_batch (0.0 );
252- for (const auto &lane : lane_sums) { total_batch += lane; }
249+ // ---------- SINK to prevent DCE without volatile ----------
253250
254- double total = xsimd::reduce_add (total_batch);
255- for (double tail : tail_sums) { total += tail; }
256-
257- return total;
251+ #if defined(__clang__) || defined(__GNUG__)
252+ [[gnu::noinline]] static void sink (batch_type v) {
253+ asm volatile (" " : : " x" (v));
258254}
255+ #else
256+ // Fallback: do a trivial reduction (kept minimal) if inline asm isn't
257+ // supported.
258+ static void sink (batch_type v) { (void )xsimd::hadd (v); }
259+ #endif
260+
261+ // ----------------------------------------------------------
259262
260263void flush_caches_for_threads (std::vector<ThreadData> &threads,
261264 std::vector<double > &flush_sums) {
@@ -272,135 +275,121 @@ void flush_caches_for_threads(std::vector<ThreadData> &threads,
272275 }
273276}
274277
278+ [[gnu::noinline]]
275279void aligned_read_kernel (const std::vector<ThreadData> &threads,
276- std::size_t elements_per_thread,
277- std::vector<batch_type> &lane_sums,
278- std::vector<double > &tail_sums) {
280+ std::size_t elements_per_thread) {
279281 const std::size_t width = simd_width ();
280282
281283#pragma omp parallel
282284 {
283285 const int tid = omp_get_thread_num ();
284286 const double *data = threads[tid].source .data ();
285-
286- batch_type batch_sum ( 0.0 ) ;
287- std:: size_t index = 0 ;
288- for (; index + width <= elements_per_thread; index += width) {
289- batch_sum += batch_type::load_aligned (data + index );
287+ asm volatile ( " " ::: " memory " );
288+ for (std:: size_t index = 0 ; index + width <= elements_per_thread ;
289+ index += width) {
290+ auto v = batch_type::load_aligned (data + index);
291+ sink (v );
290292 }
291-
292- double tail = 0.0 ;
293- for (; index < elements_per_thread; ++index) { tail += data[index]; }
294-
295- lane_sums[tid] = batch_sum;
296- tail_sums[tid] = tail;
293+ asm volatile (" " ::: " memory" );
297294 }
298295}
299296
297+ [[gnu::noinline]]
300298void unaligned_read_kernel (const std::vector<ThreadData> &threads,
301- std::size_t elements_per_thread,
302- std::vector<batch_type> &lane_sums,
303- std::vector<double > &tail_sums) {
299+ std::size_t elements_per_thread) {
304300 const std::size_t width = simd_width ();
305301
306302#pragma omp parallel
307303 {
308304 const int tid = omp_get_thread_num ();
309- const double *data = threads[tid].source .data () + 1 ;
310-
311- batch_type batch_sum ( 0.0 ) ;
312- std:: size_t index = 0 ;
313- for (; index + width <= elements_per_thread; index += width) {
314- batch_sum += batch_type::load_unaligned (data + index );
305+ const double *data = threads[tid].source .data () + 1 ; // misalign
306+ asm volatile ( " " ::: " memory " );
307+ for (std:: size_t index = 0 ; index + width <= elements_per_thread ;
308+ index += width) {
309+ auto v = batch_type::load_unaligned (data + index);
310+ sink (v );
315311 }
316-
317- double tail = 0.0 ;
318- for (; index < elements_per_thread; ++index) { tail += data[index]; }
319-
320- lane_sums[tid] = batch_sum;
321- tail_sums[tid] = tail;
312+ asm volatile (" " ::: " memory" );
322313 }
323314}
324315
316+ [[gnu::noinline]]
325317void aligned_write_kernel (std::vector<ThreadData> &threads,
326318 std::size_t elements_per_thread, double value) {
327319 const std::size_t width = simd_width ();
328320 const batch_type value_batch (value);
329321
330322#pragma omp parallel
331323 {
332- const int tid = omp_get_thread_num ();
333- double *data = threads[tid].write_target .data ();
334-
335- std::size_t index = 0 ;
336- for (; index + width <= elements_per_thread; index += width) {
324+ const int tid = omp_get_thread_num ();
325+ double *data = threads[tid].write_target .data ();
326+ asm volatile ( " " ::: " memory " );
327+ for ( std::size_t index = 0 ; index + width <= elements_per_thread ;
328+ index += width) {
337329 value_batch.store_aligned (data + index);
338330 }
339-
340- for (; index < elements_per_thread; ++index) { data[index] = value; }
331+ asm volatile (" " ::: " memory" );
341332 }
342333}
343334
335+ [[gnu::noinline]]
344336void unaligned_write_kernel (std::vector<ThreadData> &threads,
345337 std::size_t elements_per_thread, double value) {
346338 const std::size_t width = simd_width ();
347339 const batch_type value_batch (value);
348340
349341#pragma omp parallel
350342 {
351- const int tid = omp_get_thread_num ();
352- double *data = threads[tid].write_target .data () + 1 ;
353-
354- std::size_t index = 0 ;
355- for (; index + width <= elements_per_thread; index += width) {
343+ const int tid = omp_get_thread_num ();
344+ double *data = threads[tid].write_target .data () + 1 ; // misalign
345+ asm volatile ( " " ::: " memory " );
346+ for ( std::size_t index = 0 ; index + width <= elements_per_thread ;
347+ index += width) {
356348 value_batch.store_unaligned (data + index);
357349 }
358-
359- for (; index < elements_per_thread; ++index) { data[index] = value; }
350+ asm volatile (" " ::: " memory" );
360351 }
361352}
362353
354+ [[gnu::noinline]]
363355void aligned_copy_kernel (std::vector<ThreadData> &threads,
364356 std::size_t elements_per_thread) {
365357 const std::size_t width = simd_width ();
366358
367359#pragma omp parallel
368360 {
369- const int tid = omp_get_thread_num ();
370- double *dst = threads[tid].copy_target .data ();
371- const double *src = threads[tid].source .data ();
372-
373- std::size_t index = 0 ;
374- for (; index + width <= elements_per_thread; index += width) {
375- batch_type values = batch_type::load_aligned (src + index);
376- values.store_aligned (dst + index);
377- }
378-
379- for (; index < elements_per_thread; ++index) {
380- dst[index] = src[index];
361+ const int tid = omp_get_thread_num ();
362+ double *dst = threads[tid].copy_target .data ();
363+ const double *src = threads[tid].source .data ();
364+
365+ asm volatile (" " ::: " memory" );
366+ for (std::size_t index = 0 ; index + width <= elements_per_thread;
367+ index += width) {
368+ batch_type v = batch_type::load_aligned (src + index);
369+ v.store_aligned (dst + index);
381370 }
371+ asm volatile (" " ::: " memory" );
382372 }
383373}
384374
375+ [[gnu::noinline]]
385376void unaligned_copy_kernel (std::vector<ThreadData> &threads,
386377 std::size_t elements_per_thread) {
387378 const std::size_t width = simd_width ();
388379
389380#pragma omp parallel
390381 {
391- const int tid = omp_get_thread_num ();
392- double *dst = threads[tid].copy_target .data () + 1 ;
393- const double *src = threads[tid].source .data () + 1 ;
394-
395- std::size_t index = 0 ;
396- for (; index + width <= elements_per_thread; index += width) {
397- batch_type values = batch_type::load_unaligned (src + index);
398- values.store_unaligned (dst + index);
399- }
400-
401- for (; index < elements_per_thread; ++index) {
402- dst[index] = src[index];
382+ const int tid = omp_get_thread_num ();
383+ double *dst = threads[tid].copy_target .data () + 1 ; // misalign
384+ const double *src = threads[tid].source .data () + 1 ; // misalign
385+
386+ asm volatile (" " ::: " memory" );
387+ for (std::size_t index = 0 ; index + width <= elements_per_thread;
388+ index += width) {
389+ batch_type v = batch_type::load_unaligned (src + index);
390+ v.store_unaligned (dst + index);
403391 }
392+ asm volatile (" " ::: " memory" );
404393 }
405394}
406395
@@ -461,7 +450,6 @@ int main(int argc, char **argv) {
461450
462451 omp_set_num_threads (options.threads );
463452 const int thread_count = omp_get_max_threads ();
464-
465453 const std::size_t width = simd_width ();
466454
467455 std::size_t requested_bytes = options.target_bytes ;
@@ -486,9 +474,10 @@ int main(int argc, char **argv) {
486474 elements_per_thread, 8ULL * 1024ULL * 1024ULL / sizeof (double ));
487475
488476 std::vector<ThreadData> thread_data (static_cast <std::size_t >(thread_count));
489- #pragma omp parallel for
490- for (int tid = 0 ; tid < thread_count; ++tid) {
491- auto &data = thread_data[tid];
477+ #pragma omp parallel
478+ {
479+ const auto tid = omp_get_thread_num ();
480+ auto &data = thread_data[tid];
492481 data.source .resize (elements_per_thread + width);
493482 data.write_target .resize (elements_per_thread + width);
494483 data.copy_target .resize (elements_per_thread + width);
@@ -501,9 +490,15 @@ int main(int argc, char **argv) {
501490 std::iota (data.flush_buffer .begin (), data.flush_buffer .end (), 1.0 );
502491 }
503492
504- const std::size_t bytes_per_iteration =
505- per_thread_bytes * static_cast <std::size_t >(thread_count);
506- const std::size_t copy_bytes_per_iteration = bytes_per_iteration * 2 ;
493+ // bytes actually processed (only full SIMD blocks)
494+ const std::size_t full_blocks_per_thread =
495+ (elements_per_thread / width) * width;
496+ const std::size_t processed_bytes_per_thread =
497+ full_blocks_per_thread * sizeof (double );
498+ const std::size_t processed_bytes_all_threads =
499+ processed_bytes_per_thread * static_cast <std::size_t >(thread_count);
500+ const std::size_t processed_copy_bytes_all_threads =
501+ processed_bytes_all_threads * 2 ;
507502
508503 std::cout << " Requested size: " << format_megabytes (options.target_bytes )
509504 << " (" << options.target_bytes << " bytes)" << std::endl;
@@ -521,11 +516,7 @@ int main(int argc, char **argv) {
521516 std::cout << " SIMD width: " << width << " doubles" << std::endl;
522517 std::cout << " Iterations per test: " << options.iterations << std::endl;
523518
524- std::vector<batch_type> lane_sums (static_cast <std::size_t >(thread_count));
525- std::vector<double > tail_sums (static_cast <std::size_t >(thread_count), 0.0 );
526519 std::vector<double > flush_sums (static_cast <std::size_t >(thread_count), 0.0 );
527-
528- volatile double read_sink = 0.0 ;
529520 volatile double flush_sink = 0.0 ;
530521
531522 auto flush = [&] {
@@ -535,60 +526,45 @@ int main(int argc, char **argv) {
535526 flush_sink = total;
536527 };
537528
538- auto finalize_read = [&] {
539- read_sink = finalize_thread_sums (lane_sums, tail_sums);
540- };
541-
542529 auto aligned_read_timed = [&] {
543- for (auto &lane : lane_sums) { lane = batch_type (0.0 ); }
544- std::fill (tail_sums.begin (), tail_sums.end (), 0.0 );
545- aligned_read_kernel (thread_data, elements_per_thread, lane_sums,
546- tail_sums);
530+ aligned_read_kernel (thread_data, elements_per_thread);
547531 };
548-
549- run_benchmark (" Aligned read" , bytes_per_iteration, options.iterations ,
550- flush, aligned_read_timed, finalize_read);
551-
552532 auto unaligned_read_timed = [&] {
553- for (auto &lane : lane_sums) { lane = batch_type (0.0 ); }
554- std::fill (tail_sums.begin (), tail_sums.end (), 0.0 );
555- unaligned_read_kernel (thread_data, elements_per_thread, lane_sums,
556- tail_sums);
533+ unaligned_read_kernel (thread_data, elements_per_thread);
557534 };
558-
559- run_benchmark (" Unaligned read" , bytes_per_iteration, options.iterations ,
560- flush, unaligned_read_timed, finalize_read);
561-
562535 auto aligned_write_timed = [&] {
563536 aligned_write_kernel (thread_data, elements_per_thread, 1.0 );
564537 };
565-
566- run_benchmark (" Aligned write" , bytes_per_iteration, options.iterations ,
567- flush, aligned_write_timed, [] {});
568-
569538 auto unaligned_write_timed = [&] {
570539 unaligned_write_kernel (thread_data, elements_per_thread, 1.0 );
571540 };
572-
573- run_benchmark (" Unaligned write" , bytes_per_iteration, options.iterations ,
574- flush, unaligned_write_timed, [] {});
575-
576541 auto aligned_copy_timed = [&] {
577542 aligned_copy_kernel (thread_data, elements_per_thread);
578543 };
579-
580- run_benchmark (" Aligned copy" , copy_bytes_per_iteration, options.iterations ,
581- flush, aligned_copy_timed, [] {});
582-
583544 auto unaligned_copy_timed = [&] {
584545 unaligned_copy_kernel (thread_data, elements_per_thread);
585546 };
586547
587- run_benchmark (" Unaligned copy" , copy_bytes_per_iteration,
548+ std::cout << " Measured bytes per iteration (reads/writes use full SIMD "
549+ " blocks only): "
550+ << processed_bytes_all_threads
551+ << " B, copies: " << processed_copy_bytes_all_threads << " B"
552+ << std::endl;
553+
554+ run_benchmark (" Aligned read" , processed_bytes_all_threads,
555+ options.iterations , flush, aligned_read_timed, [] {});
556+ run_benchmark (" Unaligned read" , processed_bytes_all_threads,
557+ options.iterations , flush, unaligned_read_timed, [] {});
558+ run_benchmark (" Aligned write" , processed_bytes_all_threads,
559+ options.iterations , flush, aligned_write_timed, [] {});
560+ run_benchmark (" Unaligned write" , processed_bytes_all_threads,
561+ options.iterations , flush, unaligned_write_timed, [] {});
562+ run_benchmark (" Aligned copy" , processed_copy_bytes_all_threads,
563+ options.iterations , flush, aligned_copy_timed, [] {});
564+ run_benchmark (" Unaligned copy" , processed_copy_bytes_all_threads,
588565 options.iterations , flush, unaligned_copy_timed, [] {});
589566
590567 std::ofstream dev_null (" /dev/null" );
591- dev_null << " Read sink (ignore): " << read_sink << std::endl;
592568 dev_null << " Flush sink (ignore): " << flush_sink << std::endl;
593569 return 0 ;
594570}
0 commit comments