Better messages

joseemoreira · joseemoreira · commit b6aac6362566 · 2025-11-27T22:38:39.000-05:00
diff --git a/Code/matmul/memory.cc b/Code/matmul/memory.cc
@@ -19,6 +19,7 @@
 #define xxpermdi(XT, XA, XB, IM)	asm("xxpermdi " #XT "," #XA "," #XB "," #IM)
 #define xxmfacc(XT)			asm("xxmfacc " #XT)
 #define xvadddp(XT, XA, XB)		asm("xvadddp " #XT "," #XA "," #XB)
+#define nop()				asm("nop ")
 
 void memory_load_1KiB
 (
@@ -27,6 +28,7 @@ void memory_load_1KiB
 )
 {
     asm("mtctr 4");
+    asm("nop");
     asm("LOOP17:");
 
     lxvp( 0+ 0, 3,   0+  0); lxvp( 0+ 2, 3,  32+  0);
@@ -83,8 +85,7 @@ double run_kernel
 #pragma omp parallel for
     for (int j=0; j<nthreads; j++)
     {
-	uint32_t reps = count;
-	for(; reps; reps--)
+	for(uint32_t reps = count; reps; reps--)
 	{
 	    kernel(A[j], r);
 	}
@@ -112,7 +113,7 @@ void run_kernel_and_report
     volatile double GB = (1.0e-09)*N*sizeof(double)*count*omp_get_max_threads();
     elapsed = run_kernel(kernel, count, N, n);
     std::cout << std::setprecision(6);
-    std::cout << "Time to run " << std::setw(30) << name << " (" << std::setw(9) << N << ") " << std::setw(6) << count << " times = " << std::setw(10) << std::fixed << elapsed << " seconds (" << std::setw(10) << std::scientific << GB/elapsed << " GB/s)" << std::endl;
+    std::cout << "Time to run " << std::setw(30) << name << " (" << std::setw(9) << N << " doubles) " << std::setw(9) << count << " times = " << std::setw(10) << std::fixed << elapsed << " seconds (" << std::setw(10) << std::scientific << GB/elapsed << " GB/s)" << std::endl;
 }
 
 #define RUN_KERNEL(kernel, count, N, n) run_kernel_and_report(kernel, count, #kernel, N, n)
@@ -127,8 +128,9 @@ int main
 
     volatile double elapsed;
     
+    int nthreads = omp_get_max_threads();
     std::cout << "=========================================================================================================================" << std::endl;
-    std::cout << "Running on " << omp_get_max_threads() << " threads" << std::endl;
+    std::cout << "Running on " << nthreads << ((nthreads > 1) ? " threads" : " thread") << std::endl;
 
     for (uint32_t i=1; i<1000000; i *= 2) 
 	RUN_KERNEL(memory_load_1KiB, memory_load_count/i, i*1024, 128);

Original file line number	Diff line number	Diff line change
`@@ -19,6 +19,7 @@`
`19`	`19`	`#define xxpermdi(XT, XA, XB, IM) asm("xxpermdi " #XT "," #XA "," #XB "," #IM)`
`20`	`20`	`#define xxmfacc(XT) asm("xxmfacc " #XT)`
`21`	`21`	`#define xvadddp(XT, XA, XB) asm("xvadddp " #XT "," #XA "," #XB)`
	`22`	`+#define nop() asm("nop ")`
`22`	`23`
`23`	`24`	`void memory_load_1KiB`
`24`	`25`	`(`
`@@ -27,6 +28,7 @@ void memory_load_1KiB`
`27`	`28`	`)`
`28`	`29`	`{`
`29`	`30`	`asm("mtctr 4");`
	`31`	`+ asm("nop");`
`30`	`32`	`asm("LOOP17:");`
`31`	`33`
`32`	`34`	`lxvp( 0+ 0, 3, 0+ 0); lxvp( 0+ 2, 3, 32+ 0);`
`@@ -83,8 +85,7 @@ double run_kernel`
`83`	`85`	`#pragma omp parallel for`
`84`	`86`	`for (int j=0; j<nthreads; j++)`
`85`	`87`	`{`
`86`		`- uint32_t reps = count;`
`87`		`- for(; reps; reps--)`
	`88`	`+ for(uint32_t reps = count; reps; reps--)`
`88`	`89`	`{`
`89`	`90`	`kernel(A[j], r);`
`90`	`91`	`}`
`@@ -112,7 +113,7 @@ void run_kernel_and_report`
`112`	`113`	`volatile double GB = (1.0e-09)*N*sizeof(double)*count*omp_get_max_threads();`
`113`	`114`	`elapsed = run_kernel(kernel, count, N, n);`
`114`	`115`	`std::cout << std::setprecision(6);`
`115`		`- std::cout << "Time to run " << std::setw(30) << name << " (" << std::setw(9) << N << ") " << std::setw(6) << count << " times = " << std::setw(10) << std::fixed << elapsed << " seconds (" << std::setw(10) << std::scientific << GB/elapsed << " GB/s)" << std::endl;`
	`116`	`+ std::cout << "Time to run " << std::setw(30) << name << " (" << std::setw(9) << N << " doubles) " << std::setw(9) << count << " times = " << std::setw(10) << std::fixed << elapsed << " seconds (" << std::setw(10) << std::scientific << GB/elapsed << " GB/s)" << std::endl;`
`116`	`117`	`}`
`117`	`118`
`118`	`119`	`#define RUN_KERNEL(kernel, count, N, n) run_kernel_and_report(kernel, count, #kernel, N, n)`
`@@ -127,8 +128,9 @@ int main`
`127`	`128`
`128`	`129`	`volatile double elapsed;`
`129`	`130`
	`131`	`+ int nthreads = omp_get_max_threads();`
`130`	`132`	`std::cout << "=========================================================================================================================" << std::endl;`
`131`		`- std::cout << "Running on " << omp_get_max_threads() << " threads" << std::endl;`
	`133`	`+ std::cout << "Running on " << nthreads << ((nthreads > 1) ? " threads" : " thread") << std::endl;`
`132`	`134`
`133`	`135`	`for (uint32_t i=1; i<1000000; i *= 2)`
`134`	`136`	`RUN_KERNEL(memory_load_1KiB, memory_load_count/i, i*1024, 128);`