@@ -166,14 +166,14 @@ class RVIME_t : public RV_t
166166
// vle64: copies L = VLEN_/SEW_ doubles from the array A into the f64 lanes of
// vector register VR[vd].
//   vd - index of the destination vector register
//   A  - source array; the loop reads A[0] .. A[L-1] with no bounds check
167167 void vle64 (u32 vd, double *A)
168168 {
// Diff pair: the trace message was printed when debug > 0 (removed line) and
// is now printed only when debug > 2 (added line).
169- if (debug > 0 ) { std::cout << " Loading VR[" << vd << " ]" << std::endl; }
169+ if (debug > 2 ) { std::cout << " Loading VR[" << vd << " ]" << std::endl; }
// L is the number of elements copied; presumably VLEN_ and SEW_ are the
// register and element widths in bits - TODO confirm against the class.
170170 u32 L = VLEN_/SEW_;
171171 for (u32 i=0 ; i<L; i++) VR[vd].f64 [i] = A[i];
172172 }
173173
// vse64: copies L = VLEN_/SEW_ doubles from the f64 lanes of vector register
// VR[vs] into the array A.
//   vs - index of the source vector register
//   A  - destination array; the loop writes A[0] .. A[L-1] with no bounds check
174174 void vse64 (u32 vs, double *A)
175175 {
// Diff pair: the trace message was printed when debug > 0 (removed line) and
// is now printed only when debug > 2 (added line).
176- if (debug > 0 ) { std::cout << " Storing VR[" << vs << " ]" << std::endl; }
176+ if (debug > 2 ) { std::cout << " Storing VR[" << vs << " ]" << std::endl; }
// L is the number of elements copied; presumably VLEN_ and SEW_ are the
// register and element widths in bits - TODO confirm against the class.
177177 u32 L = VLEN_/SEW_;
178178 for (u32 i=0 ; i<L; i++) A[i] = VR[vs].f64 [i];
179179 }
@@ -310,6 +310,18 @@ void microgemm
310310 B += N * RV->lambda (); // Advance the B panel pointer
311311 }
312312 vsetvli (5 , 0 , 64 , 1 , true , true );
313+ u32 D = RV->VLENE ()/(RV->lambda () * RV->lambda ()); // D = # of output registers in a basic vfmmacc instruction
314+
315+ for (u32 r=0 ; r<16 ; r++)
316+ {
317+ u32 block = r/D;
318+ u32 col = block/rmul;
319+ u32 row = block%rmul;
320+ u32 displ = (col*rmul*D + row + (r%D)*rmul)*RV->VL ();
321+ if (debug > 1 ) std::cout << " Writing VR[" << r+16 << " ] to C[" << displ << " ]" << std::endl;
322+ vse64.v (r+16 , C + displ);
323+ }
324+ /*
313325 u32 D = RV->VLENE()/(RV->lambda() * RV->lambda());
314326 for (u32 j=0; j<cmul; j++)
315327 for (u32 k=0; k<D; k++)
@@ -318,6 +330,7 @@ void microgemm
318330 vse64.v(16 + D*i + D*j*rmul + k, C);
319331 C += RV->VLENE();
320332 }
333+ */
321334}
322335
323336double now ()
@@ -342,12 +355,43 @@ void packfp64
342355 u32 mu = sigma*mul;
343356 vsetvli (5 , sigma, 64 , 1 , true , true );
344357 for (u32 k=0 ; k<K; k+=lambda)
358+ {
359+ for (u32 i=0 ; i<mul; i++)
360+ for (u32 j=0 ; j<lambda; j++)
361+ {
362+ vle64.v (0 , A + i*sigma + j*mu);
363+ vse64.v (0 , P + i*sigma*lambda + j*sigma);
364+ }
365+ A = A + mu*lambda;
366+ P = P + mu*lambda;
367+ }
368+ }
369+
// unpackfp64: copies data from the packed buffer P back into A, using the same
// offsets as the added packfp64 loop but with load and store swapped
// (load from P, store to A).
//   A      - destination buffer (written)
//   P      - packed source buffer (read)
//   sigma  - block size used in the offsets; also passed to vsetvli
//   lambda - number of j-slices per outer step; K must be a multiple of it
//   K      - total extent walked by the outer loop in steps of lambda
//   mul    - number of sigma-sized blocks per slice (mu = sigma*mul)
370+ void unpackfp64
371+ (
372+ double *A,
373+ double *P,
374+ u32 sigma,
375+ u32 lambda,
376+ u32 K,
377+ u32 mul
378+ )
379+ {
380+ assert (0 == K % lambda); // For simplicity, K must be a multiple of lambda
381+
// mu is the element stride between consecutive j-slices on the A side.
382+ u32 mu = sigma*mul;
// NOTE(review): presumably configures the vector unit for sigma 64-bit
// elements per load/store - confirm against the vsetvli definition.
383+ vsetvli (5 , sigma, 64 , 1 , true , true );
// One outer iteration handles lambda slices; both pointers are advanced at the
// end of the iteration, so the offsets inside the loop are relative.
384+ for (u32 k=0 ; k<K; k+=lambda)
385+ {
345386 for (u32 i=0 ; i<mul; i++)
346387 for (u32 j=0 ; j<lambda; j++)
347388 {
// The two removed lines used offsets that included k (absolute addressing);
// per the hunk header they appear to be the previous packfp64 body.
348- vle64.v (0 , A + i*sigma + (j+k)*mu);
349- vse64.v (0 , P + i*sigma*lambda + j*sigma + k* mu);
// Added lines: stage one vector through register 0, reading P at
// (i*sigma*lambda + j*sigma) and writing A at (i*sigma + j*mu).
389+ vle64.v (0 , P + i*sigma*lambda + j*sigma);
390+ vse64.v (0 , A + i*sigma + j*mu);
350391 }
// Advance both buffers by mu*lambda elements for the next outer step.
392+ P = P + mu*lambda;
393+ A = A + mu*lambda;
394+ }
351395 }
352396
353397template <u32 VLEN, u32 lambda>
@@ -403,7 +447,10 @@ bool run_microgemm
403447 packfp64 (Cp, C, RV->sigma (), RV->lambda (), N, RV->RMUL ());
404448
405449 // Invoke the microgemm kernel
406- microgemm (K, Ap, Bp, 1.0 , C, M, N, rmul, cmul);
450+ microgemm (K, Ap, Bp, 1.0 , Cp, M, N, rmul, cmul);
451+
452+ // Unpack the results
453+ unpackfp64 (D, Cp, RV->sigma (), RV->lambda (), N, RV->RMUL ());
407454
408455 // Check the result
409456 for (u32 j=0 ; j<N; j++)
@@ -415,13 +462,13 @@ bool run_microgemm
415462 if (debug > 1 )
416463 {
417464 if ((2 == i) && (0 == j))
418- std::cout << " A[" << i << " , " << k << " ] = " << A[i+k*M] << std::endl;
465+ std::cout << " A[" << i << " , " << k << " ] = " << A[i+k*M] << " , B[ " << k << " , " << j << " ] = " << B[j+k*N] << std::endl;
419466 }
420467 S += A[i+k*M]*B[j+k*N];
421468 }
422- if (S != C [i+j*M])
469+ if (S != D [i+j*M])
423470 {
424- std::cout << " Error for C [" << i << " ," << j << " ] = " << C [i+j*M] << " != " << S << std::endl;
471+ std::cout << " Error for D [" << i << " ," << j << " ] = " << D [i+j*M] << " != " << S << std::endl;
425472 exit (-1 );
426473 }
427474 }
0 commit comments