@@ -11,6 +11,8 @@ typedef int64_t s64;
1111
1212const u32 COUNT = 1 ;
1313
14+ const u32 debug = 0 ;
15+
1416class RV_t
1517{
1618 public:
@@ -26,8 +28,7 @@ class RV_t
2628 virtual u32 VLEN () const = 0;
2729 virtual u32 lambda () const = 0;
2830 virtual u32 VLENE () const = 0;
29- virtual u32 mu () const = 0;
30- virtual u32 nu () const = 0;
31+ virtual u32 sigma () const = 0;
3132};
3233
3334template <u32 VLEN_, u32 lambda_>
@@ -52,18 +53,27 @@ class RVIME_t : public RV_t
5253 u32 L = VLEN_/SEW_;
5354 u32 m = L/lambda_;
5455 u32 n = m/lambda_;
55- std::cout << " Executing double-precision vfmmacc : " << " vd = " << vd << " , vs1 = " << vs1 << " , vs2 = " << vs2 << " , L = " << L << " , m = " << m << " , n = " << n << std::endl;
56+ if (debug > 0 )
57+ {
58+ std::cout << " Executing double-precision vfmmacc : " << " vd = " << vd << " , vs1 = " << vs1 << " , vs2 = " << vs2 << " , L = " << L << " , m = " << m << " , n = " << n << std::endl;
59+ }
5660
5761 for (u32 k=0 ; k<lambda_; k++)
5862 {
5963 for (u32 i=0 ; i<m; i++)
6064 for (u32 j=0 ; j<m; j++)
6165 {
62- std::cout << " Computing VR[" << vd + j/lambda_ << " ].f64[" << i + m*(j%lambda_) << " ] += VR[" << vs1 << " ].f64[" << i + m*k << " ] * VR[" << vs2 << " ].f64[" << j + m*k << " ]" << std::endl;
66+ if (debug > 0 )
67+ {
68+ std::cout << " Computing VR[" << vd + j/lambda_ << " ].f64[" << i + m*(j%lambda_) << " ] += VR[" << vs1 << " ].f64[" << i + m*k << " ] * VR[" << vs2 << " ].f64[" << j + m*k << " ]" << std::endl;
69+ }
6370 VR[vd + j/lambda_].f64 [i + m*(j%lambda_)] += VR[vs1].f64 [i + m*k] * VR[vs2].f64 [j + m*k];
64- std::cout << " VR[" << vs1 << " ].f64[" << i + m*k << " ] = " << VR[vs1].f64 [i + m*k] << std::endl;
65- std::cout << " VR[" << vs2 << " ].f64[" << j + m*k << " ] = " << VR[vs2].f64 [j + m*k] << std::endl;
66- std::cout << " VR[" << vd + j/lambda_ << " ].f64[" << i + m*(j%lambda_) << " ] = " << VR[vd + j/lambda_].f64 [i + m*(j%lambda_)] << std::endl;
71+ if (debug > 0 )
72+ {
73+ std::cout << " VR[" << vs1 << " ].f64[" << i + m*k << " ] = " << VR[vs1].f64 [i + m*k] << std::endl;
74+ std::cout << " VR[" << vs2 << " ].f64[" << j + m*k << " ] = " << VR[vs2].f64 [j + m*k] << std::endl;
75+ std::cout << " VR[" << vd + j/lambda_ << " ].f64[" << i + m*(j%lambda_) << " ] = " << VR[vd + j/lambda_].f64 [i + m*(j%lambda_)] << std::endl;
76+ }
6777 }
6878 }
6979 }
@@ -105,13 +115,7 @@ class RVIME_t : public RV_t
105115 return lambda_;
106116 }
107117
108- u32 mu () const
109- {
110- u32 L = VLEN_/SEW_;
111- return L/lambda_;
112- }
113-
114- u32 nu () const
118+ u32 sigma () const
115119 {
116120 u32 L = VLEN_/SEW_;
117121 return L/lambda_;
@@ -129,7 +133,10 @@ class RVIME_t : public RV_t
129133 switch (SEW_)
130134 {
131135 case 64 :
132- std::cout << " Each basic vfmmacc produces " << B << " vector registers of output" << std::endl;
136+ if (debug > 0 )
137+ {
138+ std::cout << " Each basic vfmmacc produces " << B << " vector registers of output" << std::endl;
139+ }
133140 for (u32 i=0 ; i<RMUL (); i++) for (u32 j=0 ; j<CMUL (); j++)
134141 vfmmacc_fp64 (vd + B*i + B*j*RMUL (), vs1 + i, vs2 + j);
135142 break ;
@@ -152,14 +159,14 @@ class RVIME_t : public RV_t
152159
153160 void vle64 (u32 vd, double *A)
154161 {
155- std::cout << " Loading VR[" << vd << " ]" << std::endl;
162+ if (debug > 0 ) { std::cout << " Loading VR[" << vd << " ]" << std::endl; }
156163 u32 L = VLEN_/SEW_;
157164 for (u32 i=0 ; i<L; i++) VR[vd].f64 [i] = A[i];
158165 }
159166
160167 void vse64 (u32 vs, double *A)
161168 {
162- std::cout << " Storing VR[" << vs << " ]" << std::endl;
169+ if (debug > 0 ) { std::cout << " Storing VR[" << vs << " ]" << std::endl; }
163170 u32 L = VLEN_/SEW_;
164171 for (u32 i=0 ; i<L; i++) A[i] = VR[vs].f64 [i];
165172 }
@@ -305,12 +312,15 @@ bool run_microgemm
305312 if (cmul > rmul) rmul = rmul*2 ;
306313 else cmul = cmul * 2 ;
307314 }
308- std::cout << " RMUL = " << rmul << " , CMUL = " << cmul << std::endl;
315+ std::cout << " L = " << std::setw (2 ) << L << " , lambda = " << std::setw (2 ) << RV->lambda () << " , sigma = " << std::setw (2 ) << RV->sigma () << " , RMUL = " << rmul << " , CMUL = " << cmul;
316+
317+ u32 mu = rmul*RV->sigma ();
318+ u32 nu = cmul*RV->sigma ();
309319
310- u32 M = rmul*RV->mu ();
311- u32 N = cmul*RV->nu ();
320+ std::cout << " , microgemm geometry : " << std::setw (2 ) << mu << " x " << std::setw (2 ) << nu << std::endl;
312321
313- std::cout << " Microgemm geometry : " << M << " x " << N << std::endl;
322+ u32 M = mu;
323+ u32 N = nu;
314324
315325 // Allocate A, B, and C panels
316326 double *A = new double [M*K]; for (u32 i=0 ; i<M*K; i++) A[i] = drand48 () - 0.5 ;
@@ -327,8 +337,11 @@ bool run_microgemm
327337 double S = 0 ;
328338 for (u32 k=0 ; k<K; k++)
329339 {
330- if ((2 == i) && (0 == j))
331- std::cout << " A[" << i << " , " << k << " ] = " << A[i+k*M] << std::endl;
340+ if (debug > 1 )
341+ {
342+ if ((2 == i) && (0 == j))
343+ std::cout << " A[" << i << " , " << k << " ] = " << A[i+k*M] << std::endl;
344+ }
332345 S += A[i+k*M]*B[j+k*N];
333346 }
334347 if (S != C[i+j*M])
@@ -347,54 +360,6 @@ bool run_microgemm
347360 return true ;
348361}
349362
350- double run_kernel
351- (
352- void (kernel)(double *, double *, double *, double *),
353- uint32_t count
354- )
355- {
356- const uint32_t N = 1024 *1024 ;
357- double *A = (double *)aligned_alloc (4096 , sizeof (double ) * N); for (uint32_t i=0 ; i<N; i++) A[i] = drand48 () - 0.5 ;
358- double *B = (double *)aligned_alloc (4096 , sizeof (double ) * N); for (uint32_t i=0 ; i<N; i++) B[i] = drand48 () - 0.5 ;
359- double *C = (double *)aligned_alloc (4096 , sizeof (double ) * N); for (uint32_t i=0 ; i<N; i++) C[i] = drand48 () - 0.5 ;
360- double *D = (double *)aligned_alloc (4096 , sizeof (double ) * N); for (uint32_t i=0 ; i<N; i++) D[i] = drand48 () - 0.5 ;
361-
362- volatile double start, finish;
363-
364- start = now ();
365- for (; count; count -= COUNT)
366- {
367- kernel (A,B,C,D);
368- }
369- finish = now ();
370-
371- free (D);
372- free (C);
373- free (B);
374- free (A);
375-
376- return (finish - start);
377- }
378-
379- void run_kernel_and_report
380- (
381- void (kernel)(double *, double *, double *, double *),
382- uint32_t count,
383- const char* name,
384- uint32_t M,
385- uint32_t N,
386- uint32_t K
387- )
388- {
389- volatile double elapsed;
390- volatile double flops = 2.0 *count*M*N*K;
391- elapsed = run_kernel (kernel, count);
392- std::cout << std::setprecision (6 );
393- std::cout << " Time to run " << std::setw (51 ) << name << " " << count << " times = " << std::setw (10 ) << std::fixed << elapsed << " seconds (" << std::setw (10 ) << std::scientific << flops/elapsed << " flops)" << std::endl;
394- }
395-
396- #define RUN_KERNEL (kernel, count, M, N, K ) run_kernel_and_report(kernel, count, #kernel, M, N, K)
397-
398363int main
399364(
400365 int argc,
0 commit comments