// Single-instruction helpers built on basic inline assembly.
//
// Each macro stringizes its arguments (via the preprocessor `#` operator) and
// concatenates them with the mnemonic into one string literal handed to `asm`.
// The arguments are therefore pasted textually: they must be literal
// tokens/expressions whose spelling is what the assembler should see, not
// runtime values.
//
// NOTE: the macro name must be immediately followed by `(` with no space;
// otherwise the preprocessor defines an object-like macro and every
// `name(...)` use expands incorrectly.

// Emits "xxpermdi XT ,XA ,XB ,IM".
#define xxpermdi(XT, XA, XB, IM) asm (" xxpermdi " #XT " ," #XA " ," #XB " ," #IM)
// Emits "xxmfacc XT".
#define xxmfacc(XT) asm (" xxmfacc " #XT)
// Emits "xvadddp XT ,XA ,XB".
#define xvadddp(XT, XA, XB) asm (" xvadddp " #XT " ," #XA " ," #XB)
// Emits a single "nop" instruction.
#define nop() asm (" nop " )
2223
2324void memory_load_1KiB
2425(
@@ -27,6 +28,7 @@ void memory_load_1KiB
2728)
2829{
2930 asm (" mtctr 4" );
31+ asm (" nop" );
3032 asm (" LOOP17:" );
3133
3234 lxvp ( 0 + 0 , 3 , 0 + 0 ); lxvp ( 0 + 2 , 3 , 32 + 0 );
@@ -83,8 +85,7 @@ double run_kernel
8385#pragma omp parallel for
8486 for (int j=0 ; j<nthreads; j++)
8587 {
86- uint32_t reps = count;
87- for (; reps; reps--)
88+ for (uint32_t reps = count; reps; reps--)
8889 {
8990 kernel (A[j], r);
9091 }
@@ -112,7 +113,7 @@ void run_kernel_and_report
112113 volatile double GB = (1.0e-09 )*N*sizeof (double )*count*omp_get_max_threads ();
113114 elapsed = run_kernel (kernel, count, N, n);
114115 std::cout << std::setprecision (6 );
115- std::cout << " Time to run " << std::setw (30 ) << name << " (" << std::setw (9 ) << N << " ) " << std::setw (6 ) << count << " times = " << std::setw (10 ) << std::fixed << elapsed << " seconds (" << std::setw (10 ) << std::scientific << GB/elapsed << " GB/s)" << std::endl;
116+ std::cout << " Time to run " << std::setw (30 ) << name << " (" << std::setw (9 ) << N << " doubles ) " << std::setw (9 ) << count << " times = " << std::setw (10 ) << std::fixed << elapsed << " seconds (" << std::setw (10 ) << std::scientific << GB/elapsed << " GB/s)" << std::endl;
116117}
117118
// Convenience wrapper around run_kernel_and_report: forwards all four
// arguments and additionally passes the kernel's spelling as a string
// (`#kernel`) in the third position, so call sites do not have to repeat
// the kernel name by hand.
//
// NOTE: there must be no space between `RUN_KERNEL` and `(`; with a space
// the preprocessor would define an object-like macro instead.
#define RUN_KERNEL(kernel, count, N, n) run_kernel_and_report(kernel, count, #kernel, N, n)
@@ -127,8 +128,9 @@ int main
127128
128129 volatile double elapsed;
129130
131+ int nthreads = omp_get_max_threads ();
130132 std::cout << " =========================================================================================================================" << std::endl;
131- std::cout << " Running on " << omp_get_max_threads () << " threads" << std::endl;
133+ std::cout << " Running on " << nthreads << ((nthreads > 1 ) ? " threads" : " thread " ) << std::endl;
132134
133135 for (uint32_t i=1 ; i<1000000 ; i *= 2 )
134136 RUN_KERNEL (memory_load_1KiB, memory_load_count/i, i*1024 , 128 );
0 commit comments