Skip to content

Commit 8e043e7

Browse files
committed
making asm readable
1 parent 9b96bf2 commit 8e043e7

File tree

1 file changed

+98
-122
lines changed

1 file changed

+98
-122
lines changed

bandwidth.cpp

Lines changed: 98 additions & 122 deletions
Original file line numberDiff line numberDiff line change
@@ -33,9 +33,9 @@ struct ThreadData {
3333
std::vector<double, xsimd::aligned_allocator<double>> flush_buffer;
3434
};
3535

36-
std::size_t simd_width() { return batch_type::size; }
36+
constexpr std::size_t simd_width() { return batch_type::size; }
3737

38-
std::string_view trim(std::string_view value) {
38+
std::string_view trim(std::string_view value) {
3939
while (!value.empty() &&
4040
std::isspace(static_cast<unsigned char>(value.front()))) {
4141
value.remove_prefix(1);
@@ -246,16 +246,19 @@ std::string format_megabytes(std::size_t bytes) {
246246
return oss.str();
247247
}
248248

249-
double finalize_thread_sums(const std::vector<batch_type> &lane_sums,
250-
const std::vector<double> &tail_sums) {
251-
batch_type total_batch(0.0);
252-
for (const auto &lane : lane_sums) { total_batch += lane; }
249+
// ---------- SINK to prevent DCE without volatile ----------
253250

254-
double total = xsimd::reduce_add(total_batch);
255-
for (double tail : tail_sums) { total += tail; }
256-
257-
return total;
251+
#if defined(__clang__) || defined(__GNUG__)
252+
[[gnu::noinline]] static void sink(batch_type v) {
253+
asm volatile("" : : "x"(v));
258254
}
255+
#else
256+
// Fallback: do a trivial reduction (kept minimal) if inline asm isn't
257+
// supported.
258+
static void sink(batch_type v) { (void)xsimd::hadd(v); }
259+
#endif
260+
261+
// ----------------------------------------------------------
259262

260263
void flush_caches_for_threads(std::vector<ThreadData> &threads,
261264
std::vector<double> &flush_sums) {
@@ -272,135 +275,121 @@ void flush_caches_for_threads(std::vector<ThreadData> &threads,
272275
}
273276
}
274277

278+
[[gnu::noinline]]
275279
void aligned_read_kernel(const std::vector<ThreadData> &threads,
276-
std::size_t elements_per_thread,
277-
std::vector<batch_type> &lane_sums,
278-
std::vector<double> &tail_sums) {
280+
std::size_t elements_per_thread) {
279281
const std::size_t width = simd_width();
280282

281283
#pragma omp parallel
282284
{
283285
const int tid = omp_get_thread_num();
284286
const double *data = threads[tid].source.data();
285-
286-
batch_type batch_sum(0.0);
287-
std::size_t index = 0;
288-
for (; index + width <= elements_per_thread; index += width) {
289-
batch_sum += batch_type::load_aligned(data + index);
287+
asm volatile("" ::: "memory");
288+
for (std::size_t index = 0; index + width <= elements_per_thread;
289+
index += width) {
290+
auto v = batch_type::load_aligned(data + index);
291+
sink(v);
290292
}
291-
292-
double tail = 0.0;
293-
for (; index < elements_per_thread; ++index) { tail += data[index]; }
294-
295-
lane_sums[tid] = batch_sum;
296-
tail_sums[tid] = tail;
293+
asm volatile("" ::: "memory");
297294
}
298295
}
299296

297+
[[gnu::noinline]]
300298
void unaligned_read_kernel(const std::vector<ThreadData> &threads,
301-
std::size_t elements_per_thread,
302-
std::vector<batch_type> &lane_sums,
303-
std::vector<double> &tail_sums) {
299+
std::size_t elements_per_thread) {
304300
const std::size_t width = simd_width();
305301

306302
#pragma omp parallel
307303
{
308304
const int tid = omp_get_thread_num();
309-
const double *data = threads[tid].source.data() + 1;
310-
311-
batch_type batch_sum(0.0);
312-
std::size_t index = 0;
313-
for (; index + width <= elements_per_thread; index += width) {
314-
batch_sum += batch_type::load_unaligned(data + index);
305+
const double *data = threads[tid].source.data() + 1; // misalign
306+
asm volatile("" ::: "memory");
307+
for (std::size_t index = 0; index + width <= elements_per_thread;
308+
index += width) {
309+
auto v = batch_type::load_unaligned(data + index);
310+
sink(v);
315311
}
316-
317-
double tail = 0.0;
318-
for (; index < elements_per_thread; ++index) { tail += data[index]; }
319-
320-
lane_sums[tid] = batch_sum;
321-
tail_sums[tid] = tail;
312+
asm volatile("" ::: "memory");
322313
}
323314
}
324315

316+
[[gnu::noinline]]
325317
void aligned_write_kernel(std::vector<ThreadData> &threads,
326318
std::size_t elements_per_thread, double value) {
327319
const std::size_t width = simd_width();
328320
const batch_type value_batch(value);
329321

330322
#pragma omp parallel
331323
{
332-
const int tid = omp_get_thread_num();
333-
double *data = threads[tid].write_target.data();
334-
335-
std::size_t index = 0;
336-
for (; index + width <= elements_per_thread; index += width) {
324+
const int tid = omp_get_thread_num();
325+
double *data = threads[tid].write_target.data();
326+
asm volatile("" ::: "memory");
327+
for (std::size_t index = 0; index + width <= elements_per_thread;
328+
index += width) {
337329
value_batch.store_aligned(data + index);
338330
}
339-
340-
for (; index < elements_per_thread; ++index) { data[index] = value; }
331+
asm volatile("" ::: "memory");
341332
}
342333
}
343334

335+
[[gnu::noinline]]
344336
void unaligned_write_kernel(std::vector<ThreadData> &threads,
345337
std::size_t elements_per_thread, double value) {
346338
const std::size_t width = simd_width();
347339
const batch_type value_batch(value);
348340

349341
#pragma omp parallel
350342
{
351-
const int tid = omp_get_thread_num();
352-
double *data = threads[tid].write_target.data() + 1;
353-
354-
std::size_t index = 0;
355-
for (; index + width <= elements_per_thread; index += width) {
343+
const int tid = omp_get_thread_num();
344+
double *data = threads[tid].write_target.data() + 1; // misalign
345+
asm volatile("" ::: "memory");
346+
for (std::size_t index = 0; index + width <= elements_per_thread;
347+
index += width) {
356348
value_batch.store_unaligned(data + index);
357349
}
358-
359-
for (; index < elements_per_thread; ++index) { data[index] = value; }
350+
asm volatile("" ::: "memory");
360351
}
361352
}
362353

354+
[[gnu::noinline]]
363355
void aligned_copy_kernel(std::vector<ThreadData> &threads,
364356
std::size_t elements_per_thread) {
365357
const std::size_t width = simd_width();
366358

367359
#pragma omp parallel
368360
{
369-
const int tid = omp_get_thread_num();
370-
double *dst = threads[tid].copy_target.data();
371-
const double *src = threads[tid].source.data();
372-
373-
std::size_t index = 0;
374-
for (; index + width <= elements_per_thread; index += width) {
375-
batch_type values = batch_type::load_aligned(src + index);
376-
values.store_aligned(dst + index);
377-
}
378-
379-
for (; index < elements_per_thread; ++index) {
380-
dst[index] = src[index];
361+
const int tid = omp_get_thread_num();
362+
double *dst = threads[tid].copy_target.data();
363+
const double *src = threads[tid].source.data();
364+
365+
asm volatile("" ::: "memory");
366+
for (std::size_t index = 0; index + width <= elements_per_thread;
367+
index += width) {
368+
batch_type v = batch_type::load_aligned(src + index);
369+
v.store_aligned(dst + index);
381370
}
371+
asm volatile("" ::: "memory");
382372
}
383373
}
384374

375+
[[gnu::noinline]]
385376
void unaligned_copy_kernel(std::vector<ThreadData> &threads,
386377
std::size_t elements_per_thread) {
387378
const std::size_t width = simd_width();
388379

389380
#pragma omp parallel
390381
{
391-
const int tid = omp_get_thread_num();
392-
double *dst = threads[tid].copy_target.data() + 1;
393-
const double *src = threads[tid].source.data() + 1;
394-
395-
std::size_t index = 0;
396-
for (; index + width <= elements_per_thread; index += width) {
397-
batch_type values = batch_type::load_unaligned(src + index);
398-
values.store_unaligned(dst + index);
399-
}
400-
401-
for (; index < elements_per_thread; ++index) {
402-
dst[index] = src[index];
382+
const int tid = omp_get_thread_num();
383+
double *dst = threads[tid].copy_target.data() + 1; // misalign
384+
const double *src = threads[tid].source.data() + 1; // misalign
385+
386+
asm volatile("" ::: "memory");
387+
for (std::size_t index = 0; index + width <= elements_per_thread;
388+
index += width) {
389+
batch_type v = batch_type::load_unaligned(src + index);
390+
v.store_unaligned(dst + index);
403391
}
392+
asm volatile("" ::: "memory");
404393
}
405394
}
406395

@@ -461,7 +450,6 @@ int main(int argc, char **argv) {
461450

462451
omp_set_num_threads(options.threads);
463452
const int thread_count = omp_get_max_threads();
464-
465453
const std::size_t width = simd_width();
466454

467455
std::size_t requested_bytes = options.target_bytes;
@@ -486,9 +474,10 @@ int main(int argc, char **argv) {
486474
elements_per_thread, 8ULL * 1024ULL * 1024ULL / sizeof(double));
487475

488476
std::vector<ThreadData> thread_data(static_cast<std::size_t>(thread_count));
489-
#pragma omp parallel for
490-
for (int tid = 0; tid < thread_count; ++tid) {
491-
auto &data = thread_data[tid];
477+
#pragma omp parallel
478+
{
479+
const auto tid = omp_get_thread_num();
480+
auto &data = thread_data[tid];
492481
data.source.resize(elements_per_thread + width);
493482
data.write_target.resize(elements_per_thread + width);
494483
data.copy_target.resize(elements_per_thread + width);
@@ -501,9 +490,15 @@ int main(int argc, char **argv) {
501490
std::iota(data.flush_buffer.begin(), data.flush_buffer.end(), 1.0);
502491
}
503492

504-
const std::size_t bytes_per_iteration =
505-
per_thread_bytes * static_cast<std::size_t>(thread_count);
506-
const std::size_t copy_bytes_per_iteration = bytes_per_iteration * 2;
493+
// bytes actually processed (only full SIMD blocks)
494+
const std::size_t full_blocks_per_thread =
495+
(elements_per_thread / width) * width;
496+
const std::size_t processed_bytes_per_thread =
497+
full_blocks_per_thread * sizeof(double);
498+
const std::size_t processed_bytes_all_threads =
499+
processed_bytes_per_thread * static_cast<std::size_t>(thread_count);
500+
const std::size_t processed_copy_bytes_all_threads =
501+
processed_bytes_all_threads * 2;
507502

508503
std::cout << "Requested size: " << format_megabytes(options.target_bytes)
509504
<< " (" << options.target_bytes << " bytes)" << std::endl;
@@ -521,11 +516,7 @@ int main(int argc, char **argv) {
521516
std::cout << "SIMD width: " << width << " doubles" << std::endl;
522517
std::cout << "Iterations per test: " << options.iterations << std::endl;
523518

524-
std::vector<batch_type> lane_sums(static_cast<std::size_t>(thread_count));
525-
std::vector<double> tail_sums(static_cast<std::size_t>(thread_count), 0.0);
526519
std::vector<double> flush_sums(static_cast<std::size_t>(thread_count), 0.0);
527-
528-
volatile double read_sink = 0.0;
529520
volatile double flush_sink = 0.0;
530521

531522
auto flush = [&] {
@@ -535,60 +526,45 @@ int main(int argc, char **argv) {
535526
flush_sink = total;
536527
};
537528

538-
auto finalize_read = [&] {
539-
read_sink = finalize_thread_sums(lane_sums, tail_sums);
540-
};
541-
542529
auto aligned_read_timed = [&] {
543-
for (auto &lane : lane_sums) { lane = batch_type(0.0); }
544-
std::fill(tail_sums.begin(), tail_sums.end(), 0.0);
545-
aligned_read_kernel(thread_data, elements_per_thread, lane_sums,
546-
tail_sums);
530+
aligned_read_kernel(thread_data, elements_per_thread);
547531
};
548-
549-
run_benchmark("Aligned read", bytes_per_iteration, options.iterations,
550-
flush, aligned_read_timed, finalize_read);
551-
552532
auto unaligned_read_timed = [&] {
553-
for (auto &lane : lane_sums) { lane = batch_type(0.0); }
554-
std::fill(tail_sums.begin(), tail_sums.end(), 0.0);
555-
unaligned_read_kernel(thread_data, elements_per_thread, lane_sums,
556-
tail_sums);
533+
unaligned_read_kernel(thread_data, elements_per_thread);
557534
};
558-
559-
run_benchmark("Unaligned read", bytes_per_iteration, options.iterations,
560-
flush, unaligned_read_timed, finalize_read);
561-
562535
auto aligned_write_timed = [&] {
563536
aligned_write_kernel(thread_data, elements_per_thread, 1.0);
564537
};
565-
566-
run_benchmark("Aligned write", bytes_per_iteration, options.iterations,
567-
flush, aligned_write_timed, [] {});
568-
569538
auto unaligned_write_timed = [&] {
570539
unaligned_write_kernel(thread_data, elements_per_thread, 1.0);
571540
};
572-
573-
run_benchmark("Unaligned write", bytes_per_iteration, options.iterations,
574-
flush, unaligned_write_timed, [] {});
575-
576541
auto aligned_copy_timed = [&] {
577542
aligned_copy_kernel(thread_data, elements_per_thread);
578543
};
579-
580-
run_benchmark("Aligned copy", copy_bytes_per_iteration, options.iterations,
581-
flush, aligned_copy_timed, [] {});
582-
583544
auto unaligned_copy_timed = [&] {
584545
unaligned_copy_kernel(thread_data, elements_per_thread);
585546
};
586547

587-
run_benchmark("Unaligned copy", copy_bytes_per_iteration,
548+
std::cout << "Measured bytes per iteration (reads/writes use full SIMD "
549+
"blocks only): "
550+
<< processed_bytes_all_threads
551+
<< " B, copies: " << processed_copy_bytes_all_threads << " B"
552+
<< std::endl;
553+
554+
run_benchmark("Aligned read", processed_bytes_all_threads,
555+
options.iterations, flush, aligned_read_timed, [] {});
556+
run_benchmark("Unaligned read", processed_bytes_all_threads,
557+
options.iterations, flush, unaligned_read_timed, [] {});
558+
run_benchmark("Aligned write", processed_bytes_all_threads,
559+
options.iterations, flush, aligned_write_timed, [] {});
560+
run_benchmark("Unaligned write", processed_bytes_all_threads,
561+
options.iterations, flush, unaligned_write_timed, [] {});
562+
run_benchmark("Aligned copy", processed_copy_bytes_all_threads,
563+
options.iterations, flush, aligned_copy_timed, [] {});
564+
run_benchmark("Unaligned copy", processed_copy_bytes_all_threads,
588565
options.iterations, flush, unaligned_copy_timed, [] {});
589566

590567
std::ofstream dev_null("/dev/null");
591-
dev_null << "Read sink (ignore): " << read_sink << std::endl;
592568
dev_null << "Flush sink (ignore): " << flush_sink << std::endl;
593569
return 0;
594570
}

0 commit comments

Comments
 (0)