Skip to content

Commit 46afd64

Browse files
committed
Added pack/unpack of C
1 parent 75ba1b5 commit 46afd64

File tree

1 file changed

+55
-8
lines changed

1 file changed

+55
-8
lines changed

Code/OptionG/matmul.cc

Lines changed: 55 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -166,14 +166,14 @@ class RVIME_t : public RV_t
166166

167167
// Vector load: copies L = VLEN_/SEW_ doubles from A[0..L) into the f64 lanes
// of vector register VR[vd].
//   vd - index of the destination vector register
//   A  - source array; must hold at least L readable doubles
// This is a rendered diff: the line under the "-" gutter marker is the
// pre-commit trace condition (debug > 0), the line under "+" is the
// post-commit one (debug > 2), i.e. the commit raises the verbosity
// threshold for the per-load trace message.
void vle64(u32 vd, double *A)
168168
{
169-
if (debug > 0) { std::cout << "Loading VR[" << vd << "]" << std::endl; }
169+
if (debug > 2) { std::cout << "Loading VR[" << vd << "]" << std::endl; }
170170
// NOTE(review): presumably VLEN_ is the register width and SEW_ the element
// width in bits, making L the element count per register - confirm.
u32 L = VLEN_/SEW_;
171171
for (u32 i=0; i<L; i++) VR[vd].f64[i] = A[i];
172172
}
173173

174174
// Vector store: copies L = VLEN_/SEW_ doubles from the f64 lanes of vector
// register VR[vs] into A[0..L).
//   vs - index of the source vector register
//   A  - destination array; must have room for at least L doubles
// This is a rendered diff: the line under the "-" gutter marker is the
// pre-commit trace condition (debug > 0), the line under "+" is the
// post-commit one (debug > 2), i.e. the commit raises the verbosity
// threshold for the per-store trace message.
void vse64(u32 vs, double *A)
175175
{
176-
if (debug > 0) { std::cout << "Storing VR[" << vs << "]" << std::endl; }
176+
if (debug > 2) { std::cout << "Storing VR[" << vs << "]" << std::endl; }
177177
// NOTE(review): presumably VLEN_ is the register width and SEW_ the element
// width in bits, making L the element count per register - confirm.
u32 L = VLEN_/SEW_;
178178
for (u32 i=0; i<L; i++) A[i] = VR[vs].f64[i];
179179
}
@@ -310,6 +310,18 @@ void microgemm
310310
B += N * RV->lambda(); // Advance the B panel pointer
311311
}
312312
vsetvli(5, 0, 64, 1, true, true);
313+
u32 D = RV->VLENE()/(RV->lambda() * RV->lambda()); // D = # of output registers in a basic vfmmacc instruction
314+
315+
for (u32 r=0; r<16; r++)
316+
{
317+
u32 block = r/D;
318+
u32 col = block/rmul;
319+
u32 row = block%rmul;
320+
u32 displ = (col*rmul*D + row + (r%D)*rmul)*RV->VL();
321+
if (debug > 1) std::cout << "Writing VR[" << r+16 << "] to C[" << displ << "]" << std::endl;
322+
vse64.v(r+16, C + displ);
323+
}
324+
/*
313325
u32 D = RV->VLENE()/(RV->lambda() * RV->lambda());
314326
for (u32 j=0; j<cmul; j++)
315327
for (u32 k=0; k<D; k++)
@@ -318,6 +330,7 @@ void microgemm
318330
vse64.v(16 + D*i + D*j*rmul + k, C);
319331
C += RV->VLENE();
320332
}
333+
*/
321334
}
322335

323336
double now()
@@ -342,12 +355,43 @@ void packfp64
342355
u32 mu = sigma*mul;
343356
vsetvli(5, sigma, 64, 1, true, true);
344357
for (u32 k=0; k<K; k+=lambda)
358+
{
359+
for (u32 i=0; i<mul; i++)
360+
for (u32 j=0; j<lambda; j++)
361+
{
362+
vle64.v(0, A + i*sigma + j*mu);
363+
vse64.v(0, P + i*sigma*lambda + j*sigma);
364+
}
365+
A = A + mu*lambda;
366+
P = P + mu*lambda;
367+
}
368+
}
369+
370+
// Copies data from the packed buffer P back into the unpacked buffer A, one
// vector register at a time (through register 0).
//   A      - destination (unpacked) buffer; written at A + i*sigma + j*mu
//   P      - source (packed) buffer; read at P + i*sigma*lambda + j*sigma
//   sigma  - stride unit used in both index expressions; also passed to vsetvli
//   lambda - inner trip count and K step; K must be a multiple of it
//   K      - total extent walked by the outer loop in steps of lambda
//   mul    - trip count of the i loop; mu = sigma*mul is the A stride per j
// For every outer step, both pointers advance by mu*lambda elements, so the
// index expressions inside the loops are relative to the current step.
// This is a rendered diff: the two statements under "-" gutter markers are
// the removed pre-commit lines of the neighbouring pack loop, shown only
// because the diff reuses their surrounding context lines.
void unpackfp64
371+
(
372+
double *A,
373+
double *P,
374+
u32 sigma,
375+
u32 lambda,
376+
u32 K,
377+
u32 mul
378+
)
379+
{
380+
assert(0 == K % lambda); // For simplicity, K must be a multiple of lambda
381+
382+
u32 mu = sigma*mul;
383+
// NOTE(review): presumably configures the vector length to sigma 64-bit
// elements - confirm against the vsetvli definition elsewhere in the file.
vsetvli(5, sigma, 64, 1, true, true);
384+
// k only counts outer steps; addressing is done by advancing P and A below.
for (u32 k=0; k<K; k+=lambda)
385+
{
345386
for (u32 i=0; i<mul; i++)
346387
for (u32 j=0; j<lambda; j++)
347388
{
348-
vle64.v(0, A + i*sigma + (j+k)*mu);
349-
vse64.v(0, P + i*sigma*lambda + j*sigma + k*mu);
389+
// Load from the packed layout, store to the unpacked layout.
vle64.v(0, P + i*sigma*lambda + j*sigma);
390+
vse64.v(0, A + i*sigma + j*mu);
350391
}
392+
P = P + mu*lambda;
393+
A = A + mu*lambda;
394+
}
351395
}
352396

353397
template<u32 VLEN, u32 lambda>
@@ -403,7 +447,10 @@ bool run_microgemm
403447
packfp64(Cp, C, RV->sigma(), RV->lambda(), N, RV->RMUL());
404448

405449
// Invoke the microgemm kernel
406-
microgemm(K, Ap, Bp, 1.0, C, M, N, rmul, cmul);
450+
microgemm(K, Ap, Bp, 1.0, Cp, M, N, rmul, cmul);
451+
452+
// Unpack the results
453+
unpackfp64(D, Cp, RV->sigma(), RV->lambda(), N, RV->RMUL());
407454

408455
// Check the result
409456
for (u32 j=0; j<N; j++)
@@ -415,13 +462,13 @@ bool run_microgemm
415462
if (debug > 1)
416463
{
417464
if ((2 == i) && (0 == j))
418-
std::cout << "A[" << i << ", " << k << "] = " << A[i+k*M] << std::endl;
465+
std::cout << "A[" << i << ", " << k << "] = " << A[i+k*M] << ", B[" << k << ", " << j << "] = " << B[j+k*N] << std::endl;
419466
}
420467
S += A[i+k*M]*B[j+k*N];
421468
}
422-
if (S != C[i+j*M])
469+
if (S != D[i+j*M])
423470
{
424-
std::cout << "Error for C[" << i << "," << j << "] = " << C[i+j*M] << " != " << S << std::endl;
471+
std::cout << "Error for D[" << i << "," << j << "] = " << D[i+j*M] << " != " << S << std::endl;
425472
exit(-1);
426473
}
427474
}

0 commit comments

Comments
 (0)