@@ -576,36 +576,36 @@ void microdgemm
576576 double *B0 = B; double *B1 = B0 + RV->VLENE (); double *B2 = B1 + RV->VLENE (); double *B3 = B2 + RV->VLENE ();
577577 for (u32 k=0 ; k<K; k+=lambda_eff)
578578 {
579- std::cout << " k = " << k << std::endl;
579+ if (debug > 1 ) { std::cout << " k = " << k << std::endl; }
580580
581- vmtlfre64.v ( 0 , A0, lambda_eff); { std::cout << " VR[ 0] = " ; RV->printVRf64 ( 0 ); }
582- vmtlfre64.v ( 1 , A1, lambda_eff); { std::cout << " VR[ 1] = " ; RV->printVRf64 ( 1 ); }
583- vmtlfre64.v ( 2 , A2, lambda_eff); { std::cout << " VR[ 2] = " ; RV->printVRf64 ( 2 ); }
584- vmtlfre64.v ( 3 , A3, lambda_eff); { std::cout << " VR[ 3] = " ; RV->printVRf64 ( 3 ); }
585- vmtlfre64.v ( 8 , B0, lambda_eff); { std::cout << " VR[ 8] = " ; RV->printVRf64 ( 8 ); }
586- vmtlfre64.v ( 9 , B1, lambda_eff); { std::cout << " VR[ 9] = " ; RV->printVRf64 ( 9 ); }
587- vmtlfre64.v (10 , B2, lambda_eff); { std::cout << " VR[10] = " ; RV->printVRf64 (10 ); }
588- vmtlfre64.v (11 , B3, lambda_eff); { std::cout << " VR[11] = " ; RV->printVRf64 (11 ); }
581+ vmtlfre64.v ( 0 , A0, lambda_eff); if (debug > 1 ) { std::cout << " VR[ 0] = " ; RV->printVRf64 ( 0 ); }
582+ vmtlfre64.v ( 1 , A1, lambda_eff); if (debug > 1 ) { std::cout << " VR[ 1] = " ; RV->printVRf64 ( 1 ); }
583+ vmtlfre64.v ( 2 , A2, lambda_eff); if (debug > 1 ) { std::cout << " VR[ 2] = " ; RV->printVRf64 ( 2 ); }
584+ vmtlfre64.v ( 3 , A3, lambda_eff); if (debug > 1 ) { std::cout << " VR[ 3] = " ; RV->printVRf64 ( 3 ); }
585+ vmtlfre64.v ( 8 , B0, lambda_eff); if (debug > 1 ) { std::cout << " VR[ 8] = " ; RV->printVRf64 ( 8 ); }
586+ vmtlfre64.v ( 9 , B1, lambda_eff); if (debug > 1 ) { std::cout << " VR[ 9] = " ; RV->printVRf64 ( 9 ); }
587+ vmtlfre64.v (10 , B2, lambda_eff); if (debug > 1 ) { std::cout << " VR[10] = " ; RV->printVRf64 (10 ); }
588+ vmtlfre64.v (11 , B3, lambda_eff); if (debug > 1 ) { std::cout << " VR[11] = " ; RV->printVRf64 (11 ); }
589589
590590 A0 = A0 + INCA ; A1 = A1 + INCA ; A2 = A2 + INCA ; A3 = A3 + INCA;
591591 B0 = B0 + INCB ; B1 = B1 + INCB ; B2 = B2 + INCB ; B3 = B3 + INCB;
592592
593- vfmmacc.v0 (16 , 0 , 8 ); vmrotate.vv ( 8 , 8 ); { std::cout << " VR[16] = " ; RV->printVRf64 (16 ); }
594- vfmmacc.v0 (17 , 0 , 9 ); vmrotate.vv ( 9 , 9 ); { std::cout << " VR[17] = " ; RV->printVRf64 (17 ); }
595- vfmmacc.v0 (18 , 1 , 8 ); vmrotate.vv ( 8 , 8 ); { std::cout << " VR[18] = " ; RV->printVRf64 (18 ); }
596- vfmmacc.v0 (19 , 1 , 9 ); vmrotate.vv ( 9 , 9 ); { std::cout << " VR[19] = " ; RV->printVRf64 (19 ); }
597- vfmmacc.v0 (20 , 0 , 10 ); vmrotate.vv (10 , 10 ); { std::cout << " VR[20] = " ; RV->printVRf64 (20 ); }
598- vfmmacc.v0 (21 , 0 , 11 ); vmrotate.vv (11 , 11 ); { std::cout << " VR[21] = " ; RV->printVRf64 (21 ); }
599- vfmmacc.v0 (22 , 1 , 10 ); vmrotate.vv (10 , 10 ); { std::cout << " VR[22] = " ; RV->printVRf64 (22 ); }
600- vfmmacc.v0 (23 , 1 , 11 ); vmrotate.vv (11 , 11 ); { std::cout << " VR[23] = " ; RV->printVRf64 (23 ); }
601- vfmmacc.v0 (24 , 2 , 8 ); vmrotate.vv ( 8 , 8 ); { std::cout << " VR[24] = " ; RV->printVRf64 (24 ); }
602- vfmmacc.v0 (25 , 2 , 9 ); vmrotate.vv ( 9 , 9 ); { std::cout << " VR[25] = " ; RV->printVRf64 (25 ); }
603- vfmmacc.v0 (26 , 3 , 8 ); vmrotate.vv ( 8 , 8 ); { std::cout << " VR[26] = " ; RV->printVRf64 (26 ); }
604- vfmmacc.v0 (27 , 3 , 9 ); vmrotate.vv ( 9 , 9 ); { std::cout << " VR[27] = " ; RV->printVRf64 (27 ); }
605- vfmmacc.v0 (28 , 2 , 10 ); vmrotate.vv (10 , 10 ); { std::cout << " VR[28] = " ; RV->printVRf64 (28 ); }
606- vfmmacc.v0 (29 , 2 , 11 ); vmrotate.vv (11 , 11 ); { std::cout << " VR[29] = " ; RV->printVRf64 (29 ); }
607- vfmmacc.v0 (30 , 3 , 10 ); vmrotate.vv (10 , 10 ); { std::cout << " VR[30] = " ; RV->printVRf64 (30 ); }
608- vfmmacc.v0 (31 , 3 , 11 ); vmrotate.vv (11 , 11 ); { std::cout << " VR[31] = " ; RV->printVRf64 (31 ); }
593+ vfmmacc.v0 (16 , 0 , 8 ); vmrotate.vv ( 8 , 8 ); if (debug > 1 ) { std::cout << " VR[16] = " ; RV->printVRf64 (16 ); }
594+ vfmmacc.v0 (17 , 0 , 9 ); vmrotate.vv ( 9 , 9 ); if (debug > 1 ) { std::cout << " VR[17] = " ; RV->printVRf64 (17 ); }
595+ vfmmacc.v0 (18 , 1 , 8 ); vmrotate.vv ( 8 , 8 ); if (debug > 1 ) { std::cout << " VR[18] = " ; RV->printVRf64 (18 ); }
596+ vfmmacc.v0 (19 , 1 , 9 ); vmrotate.vv ( 9 , 9 ); if (debug > 1 ) { std::cout << " VR[19] = " ; RV->printVRf64 (19 ); }
597+ vfmmacc.v0 (20 , 0 , 10 ); vmrotate.vv (10 , 10 ); if (debug > 1 ) { std::cout << " VR[20] = " ; RV->printVRf64 (20 ); }
598+ vfmmacc.v0 (21 , 0 , 11 ); vmrotate.vv (11 , 11 ); if (debug > 1 ) { std::cout << " VR[21] = " ; RV->printVRf64 (21 ); }
599+ vfmmacc.v0 (22 , 1 , 10 ); vmrotate.vv (10 , 10 ); if (debug > 1 ) { std::cout << " VR[22] = " ; RV->printVRf64 (22 ); }
600+ vfmmacc.v0 (23 , 1 , 11 ); vmrotate.vv (11 , 11 ); if (debug > 1 ) { std::cout << " VR[23] = " ; RV->printVRf64 (23 ); }
601+ vfmmacc.v0 (24 , 2 , 8 ); vmrotate.vv ( 8 , 8 ); if (debug > 1 ) { std::cout << " VR[24] = " ; RV->printVRf64 (24 ); }
602+ vfmmacc.v0 (25 , 2 , 9 ); vmrotate.vv ( 9 , 9 ); if (debug > 1 ) { std::cout << " VR[25] = " ; RV->printVRf64 (25 ); }
603+ vfmmacc.v0 (26 , 3 , 8 ); vmrotate.vv ( 8 , 8 ); if (debug > 1 ) { std::cout << " VR[26] = " ; RV->printVRf64 (26 ); }
604+ vfmmacc.v0 (27 , 3 , 9 ); vmrotate.vv ( 9 , 9 ); if (debug > 1 ) { std::cout << " VR[27] = " ; RV->printVRf64 (27 ); }
605+ vfmmacc.v0 (28 , 2 , 10 ); vmrotate.vv (10 , 10 ); if (debug > 1 ) { std::cout << " VR[28] = " ; RV->printVRf64 (28 ); }
606+ vfmmacc.v0 (29 , 2 , 11 ); vmrotate.vv (11 , 11 ); if (debug > 1 ) { std::cout << " VR[29] = " ; RV->printVRf64 (29 ); }
607+ vfmmacc.v0 (30 , 3 , 10 ); vmrotate.vv (10 , 10 ); if (debug > 1 ) { std::cout << " VR[30] = " ; RV->printVRf64 (30 ); }
608+ vfmmacc.v0 (31 , 3 , 11 ); vmrotate.vv (11 , 11 ); if (debug > 1 ) { std::cout << " VR[31] = " ; RV->printVRf64 (31 ); }
609609 }
610610
611611 u32 offset[32 ];
@@ -626,7 +626,7 @@ void microdgemm
626626 offset[30 ] = (1 != lmul) ? (offset[29 ] + RV->lambda ()) : (LisSquare () ? offset[28 ] + RV->lambda () * gamma : offset[28 ] + RV->lambda ());
627627 offset[31 ] = ((!LisSquare ()) && (1 == lmul)) ? offset[29 ] + RV->lambda () : offset[30 ] + RV->lambda ();
628628
629- for (u32 i=16 ; i<32 ; i++) std::cout << " offset[" << i << " ] = " << offset[i] << std::endl;
629+ if (debug > 1 ) { for (u32 i=16 ; i<32 ; i++) std::cout << " offset[" << i << " ] = " << offset[i] << std::endl; }
630630
631631 double *D = new double [M*N];
632632 vsetvl (5 , 0 , 64 , 1 , true , true ); // double-precision kernel, set VL to VLENE and LMUL to 1
@@ -655,8 +655,8 @@ void microdgemm
655655 }
656656 for (u32 i=0 ; i<M; i++) for (u32 j=0 ; j<N; j++) C[i*gamma + j] += alpha * S[i*N + j];
657657
658- std::cout << " S = " ; print (M, N, S);
659- std::cout << " D = " ; print (M, N, D);
658+ if (debug > 1 ) { std::cout << " S = " ; print (M, N, S); }
659+ if (debug > 1 ) { std::cout << " D = " ; print (M, N, D); }
660660
661661 for (u32 i=0 ; i<M; i++) for (u32 j=0 ; j<N; j++) if (D[i*N+j] != S[i*N+j]) { std::cout << " Error for D[" << i << " ," << j << " ] = " << D[i*N+j] << " != " << S[i*N+j] << std::endl; exit (-1 ); }
662662 for (u32 i=0 ; i<M; i++) for (u32 j=0 ; j<N; j++) if (E[i*N+j] != C[i*N+j]) { std::cout << " Error for E[" << i << " ," << j << " ] = " << E[i*N+j] << " != " << C[i*N+j] << std::endl; exit (-1 ); }
@@ -757,16 +757,16 @@ bool run_microgemm
757757 u32 M = mu;
758758 u32 N = nu;
759759
760- double alpha = 3.141592654 ; alpha = 1.0 ;
760+ double alpha = 3.141592654 ; // alpha = 1.0;
761761
762762 // Allocate A, B, and C panels
763763 double *A = new double [M*K]; for (u32 i=0 ; i<M*K; i++) A[i] = i; // drand48() - 0.5;
764764 double *B = new double [K*N]; for (u32 i=0 ; i<K*N; i++) B[i] = i; // drand48() - 0.5;
765765 double *C = new double [M*N]; for (u32 i=0 ; i<M*N; i++) C[i] = 0 ; // drand48() - 0.5;
766766 double *D = new double [M*N]; for (u32 i=0 ; i<M*N; i++) D[i] = C[i];
767767
768- std::cout << " A = " ; print (M, K, A);
769- std::cout << " B = " ; print (N, K, B);
768+ if (debug > 1 ) { std::cout << " A = " ; print (M, K, A); }
769+ if (debug > 1 ) { std::cout << " B = " ; print (N, K, B); }
770770
771771 // Allocate the packed panels
772772 double *Ap = new double [M*K];
@@ -780,8 +780,8 @@ bool run_microgemm
780780
781781 for (u32 k=0 ; k<K; k+=lambda_eff)
782782 {
783- std::cout << " Ap[" << k/lambda_eff << " ] = " ; print (M, lambda_eff, Ap+k*mu);
784- std::cout << " Bp[" << k/lambda_eff << " ] = " ; print (N, lambda_eff, Bp+k*nu);
783+ if (debug > 1 ) { std::cout << " Ap[" << k/lambda_eff << " ] = " ; print (M, lambda_eff, Ap+k*mu); }
784+ if (debug > 1 ) { std::cout << " Bp[" << k/lambda_eff << " ] = " ; print (N, lambda_eff, Bp+k*nu); }
785785 }
786786
787787 microdgemm (M, N, K, Ap, Bp, alpha, D, N, LMUL);
@@ -856,9 +856,12 @@ int main
856856 run_microgemm<1024 , 4 >(4 );
857857 run_microgemm<1024 , 4 >(8 );
858858 run_microgemm<1024 , 4 >(16 );
859+ run_microgemm<2048 , 2 >(8 );
860+ run_microgemm<2048 , 2 >(16 );
859861 run_microgemm<2048 , 4 >(8 );
860862 run_microgemm<2048 , 4 >(16 );
861- run_microgemm<4096 , 4 >(8 );
863+ run_microgemm<4096 , 2 >(16 );
864+ run_microgemm<4096 , 4 >(16 );
862865 run_microgemm<4096 , 8 >(16 );
863866
864867 return 0 ;
0 commit comments