Skip to content

Commit 08dfb6c

Browse files
committed
Making the code work even without AVX2/FMA
1 parent 0788606 commit 08dfb6c

File tree

1 file changed

+38
-0
lines changed

1 file changed

+38
-0
lines changed

src/nnet.c

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -282,6 +282,10 @@ static void sparse_gemm_accum16(float *out, const float *weights, int rows, cons
282282
}
283283

284284
#else
285+
286+
#warning Compiling without any vectorization. This code will be very slow
287+
#warning Try adding -mavx2 -mfma
288+
285289
static void gemm_accum16(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
286290
{
287291
int i, j;
@@ -314,6 +318,40 @@ static void gemm_accum16(float *out, const float *weights, int rows, int cols, i
314318
}
315319
}
316320
}
321+
322+
/* Scalar (non-vectorized) fallback: accumulate a block-sparse matrix-vector
 * product into out.
 *
 * The output is processed in blocks of 16 consecutive rows. For each block,
 * idx supplies a count of stored columns followed by that many column
 * indices into x. For each stored column, w supplies 16 consecutive weights
 * (one per row of the block); w is consumed sequentially across all blocks.
 *
 *   out  - accumulator, updated in place (out[i] += ...); not cleared here.
 *   w    - packed weights, 16 floats per stored (block, column) pair.
 *   rows - number of output rows. Each iteration writes a full block of 16
 *          outputs, so out must hold rows rounded up to a multiple of 16.
 *   idx  - per block: column count, then that many indices into x.
 *   x    - input vector, read at the positions listed in idx.
 */
static void sparse_gemm_accum16(float *out, const float *w, int rows, const int *idx, const float *x)
{
   int i, j;
   for (i=0;i<rows;i+=16)
   {
      int cols;
      /* Number of stored columns for this 16-row block. */
      cols = *idx++;
      for (j=0;j<cols;j++)
      {
         float * restrict y;
         float xj;
         /* Next index names the input element this column multiplies. */
         xj = x[*idx++];
         y = &out[i];
         y[0] += w[0]*xj;
         y[1] += w[1]*xj;
         y[2] += w[2]*xj;
         y[3] += w[3]*xj;
         y[4] += w[4]*xj;
         y[5] += w[5]*xj;
         y[6] += w[6]*xj;
         y[7] += w[7]*xj;
         y[8] += w[8]*xj;
         y[9] += w[9]*xj;
         y[10] += w[10]*xj;
         y[11] += w[11]*xj;
         y[12] += w[12]*xj;
         y[13] += w[13]*xj;
         y[14] += w[14]*xj;
         y[15] += w[15]*xj;
         /* Advance to the 16 weights of the next stored column. */
         w += 16;
      }
   }
}
317355
#endif
318356

319357
static void gemm_accum(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)

0 commit comments

Comments
 (0)