@@ -211,6 +211,7 @@ CNAME(BLASLONG M,
211
211
const BLASLONG v_m1 = M & - v_size ;
212
212
const BLASLONG n4 = N & -4 ;
213
213
const BLASLONG n2 = N & -2 ;
214
+ const BLASLONG n8 = N & -8 ;
214
215
215
216
const int pack_a = M >= v_size2 && N >= 8 && K >= 8 ? 1 : 0 ;
216
217
FLOAT * packed_a =
@@ -229,28 +230,37 @@ CNAME(BLASLONG M,
229
230
CREATE_A_POINTER (1 , v_size );
230
231
231
232
BLASLONG j = 0 ;
232
- for (; j < n4 ; j += 4 ) {
233
-
233
+ for (; j < n8 ; j += 8 ) {
234
234
CREATE_B_POINTER (0 , 0 );
235
235
CREATE_B_POINTER (1 , 1 );
236
236
CREATE_B_POINTER (2 , 2 );
237
237
CREATE_B_POINTER (3 , 3 );
238
- UPDATE_B_POINTER (4 );
238
+ CREATE_B_POINTER (4 , 4 );
239
+ CREATE_B_POINTER (5 , 5 );
240
+ CREATE_B_POINTER (6 , 6 );
241
+ CREATE_B_POINTER (7 , 7 );
242
+ UPDATE_B_POINTER (8 );
239
243
240
244
BLASLONG k = 0 ;
241
245
DECLARE_RESULT_VECTOR (0 , 0 );
242
246
DECLARE_RESULT_VECTOR (0 , 1 );
243
247
DECLARE_RESULT_VECTOR (0 , 2 );
244
248
DECLARE_RESULT_VECTOR (0 , 3 );
249
+ DECLARE_RESULT_VECTOR (0 , 4 );
250
+ DECLARE_RESULT_VECTOR (0 , 5 );
251
+ DECLARE_RESULT_VECTOR (0 , 6 );
252
+ DECLARE_RESULT_VECTOR (0 , 7 );
245
253
DECLARE_RESULT_VECTOR (1 , 0 );
246
254
DECLARE_RESULT_VECTOR (1 , 1 );
247
255
DECLARE_RESULT_VECTOR (1 , 2 );
248
256
DECLARE_RESULT_VECTOR (1 , 3 );
249
-
257
+ DECLARE_RESULT_VECTOR (1 , 4 );
258
+ DECLARE_RESULT_VECTOR (1 , 5 );
259
+ DECLARE_RESULT_VECTOR (1 , 6 );
260
+ DECLARE_RESULT_VECTOR (1 , 7 );
250
261
if (LIKELY (packed_a != NULL )) {
251
262
if (j == 0 ) {
252
263
for (; k < K ; k ++ ) {
253
-
254
264
BROADCAST_LOAD_B (0 , 0 );
255
265
GATHER_LOAD_A (pg_true , 0 , 0 );
256
266
VECTOR_PACK_A (0 , 0 );
@@ -267,10 +277,21 @@ CNAME(BLASLONG M,
267
277
BROADCAST_LOAD_B (3 , 0 );
268
278
UPDATE_RESULT_VECTOR (pg_true , 0 , 3 , 0 );
269
279
UPDATE_RESULT_VECTOR (pg_true , 1 , 3 , 0 );
280
+ BROADCAST_LOAD_B (4 , 0 );
281
+ UPDATE_RESULT_VECTOR (pg_true , 0 , 4 , 0 );
282
+ UPDATE_RESULT_VECTOR (pg_true , 1 , 4 , 0 );
283
+ BROADCAST_LOAD_B (5 , 0 );
284
+ UPDATE_RESULT_VECTOR (pg_true , 0 , 5 , 0 );
285
+ UPDATE_RESULT_VECTOR (pg_true , 1 , 5 , 0 );
286
+ BROADCAST_LOAD_B (6 , 0 );
287
+ UPDATE_RESULT_VECTOR (pg_true , 0 , 6 , 0 );
288
+ UPDATE_RESULT_VECTOR (pg_true , 1 , 6 , 0 );
289
+ BROADCAST_LOAD_B (7 , 0 );
290
+ UPDATE_RESULT_VECTOR (pg_true , 0 , 7 , 0 );
291
+ UPDATE_RESULT_VECTOR (pg_true , 1 , 7 , 0 );
270
292
}
271
293
} else {
272
294
for (; k < K ; k ++ ) {
273
-
274
295
BROADCAST_LOAD_B (0 , 0 );
275
296
UNPACK_VECTOR_A (0 , 0 );
276
297
UPDATE_RESULT_VECTOR (pg_true , 0 , 0 , 0 );
@@ -285,7 +306,104 @@ CNAME(BLASLONG M,
285
306
BROADCAST_LOAD_B (3 , 0 );
286
307
UPDATE_RESULT_VECTOR (pg_true , 0 , 3 , 0 );
287
308
UPDATE_RESULT_VECTOR (pg_true , 1 , 3 , 0 );
309
+ BROADCAST_LOAD_B (4 , 0 );
310
+ UPDATE_RESULT_VECTOR (pg_true , 0 , 4 , 0 );
311
+ UPDATE_RESULT_VECTOR (pg_true , 1 , 4 , 0 );
312
+ BROADCAST_LOAD_B (5 , 0 );
313
+ UPDATE_RESULT_VECTOR (pg_true , 0 , 5 , 0 );
314
+ UPDATE_RESULT_VECTOR (pg_true , 1 , 5 , 0 );
315
+ BROADCAST_LOAD_B (6 , 0 );
316
+ UPDATE_RESULT_VECTOR (pg_true , 0 , 6 , 0 );
317
+ UPDATE_RESULT_VECTOR (pg_true , 1 , 6 , 0 );
318
+ BROADCAST_LOAD_B (7 , 0 );
319
+ UPDATE_RESULT_VECTOR (pg_true , 0 , 7 , 0 );
320
+ UPDATE_RESULT_VECTOR (pg_true , 1 , 7 , 0 );
288
321
}
322
+ }
323
+ } else {
324
+ for (; k < K ; k ++ ) {
325
+ BROADCAST_LOAD_B (0 , 0 );
326
+ GATHER_LOAD_A (pg_true , 0 , 0 );
327
+ UPDATE_RESULT_VECTOR (pg_true , 0 , 0 , 0 );
328
+ BROADCAST_LOAD_B (1 , 0 );
329
+ UPDATE_RESULT_VECTOR (pg_true , 0 , 1 , 0 );
330
+ GATHER_LOAD_A (pg_true , 1 , 0 );
331
+ UPDATE_RESULT_VECTOR (pg_true , 1 , 0 , 0 );
332
+ UPDATE_RESULT_VECTOR (pg_true , 1 , 1 , 0 );
333
+ BROADCAST_LOAD_B (2 , 0 );
334
+ UPDATE_RESULT_VECTOR (pg_true , 0 , 2 , 0 );
335
+ UPDATE_RESULT_VECTOR (pg_true , 1 , 2 , 0 );
336
+ BROADCAST_LOAD_B (3 , 0 );
337
+ UPDATE_RESULT_VECTOR (pg_true , 0 , 3 , 0 );
338
+ UPDATE_RESULT_VECTOR (pg_true , 1 , 3 , 0 );
339
+ BROADCAST_LOAD_B (4 , 0 );
340
+ UPDATE_RESULT_VECTOR (pg_true , 0 , 4 , 0 );
341
+ UPDATE_RESULT_VECTOR (pg_true , 1 , 4 , 0 );
342
+ BROADCAST_LOAD_B (5 , 0 );
343
+ UPDATE_RESULT_VECTOR (pg_true , 0 , 5 , 0 );
344
+ UPDATE_RESULT_VECTOR (pg_true , 1 , 5 , 0 );
345
+ BROADCAST_LOAD_B (6 , 0 );
346
+ UPDATE_RESULT_VECTOR (pg_true , 0 , 6 , 0 );
347
+ UPDATE_RESULT_VECTOR (pg_true , 1 , 6 , 0 );
348
+ BROADCAST_LOAD_B (7 , 0 );
349
+ UPDATE_RESULT_VECTOR (pg_true , 0 , 7 , 0 );
350
+ UPDATE_RESULT_VECTOR (pg_true , 1 , 7 , 0 );
351
+ }
352
+ }
353
+ VECTOR_STORE (pg_true , 0 , 0 );
354
+ VECTOR_STORE (pg_true , 0 , 1 );
355
+ VECTOR_STORE (pg_true , 0 , 2 );
356
+ VECTOR_STORE (pg_true , 0 , 3 );
357
+ VECTOR_STORE (pg_true , 0 , 4 );
358
+ VECTOR_STORE (pg_true , 0 , 5 );
359
+ VECTOR_STORE (pg_true , 0 , 6 );
360
+ VECTOR_STORE (pg_true , 0 , 7 );
361
+ VECTOR_STORE (pg_true , 1 , 0 );
362
+ VECTOR_STORE (pg_true , 1 , 1 );
363
+ VECTOR_STORE (pg_true , 1 , 2 );
364
+ VECTOR_STORE (pg_true , 1 , 3 );
365
+ VECTOR_STORE (pg_true , 1 , 4 );
366
+ VECTOR_STORE (pg_true , 1 , 5 );
367
+ VECTOR_STORE (pg_true , 1 , 6 );
368
+ VECTOR_STORE (pg_true , 1 , 7 );
369
+ INCR_C_POINTER (0 , 8 );
370
+ INCR_C_POINTER (1 , 8 );
371
+ }
372
+ for (; j < n4 ; j += 4 ) {
373
+
374
+ CREATE_B_POINTER (0 , 0 );
375
+ CREATE_B_POINTER (1 , 1 );
376
+ CREATE_B_POINTER (2 , 2 );
377
+ CREATE_B_POINTER (3 , 3 );
378
+ UPDATE_B_POINTER (4 );
379
+
380
+ BLASLONG k = 0 ;
381
+ DECLARE_RESULT_VECTOR (0 , 0 );
382
+ DECLARE_RESULT_VECTOR (0 , 1 );
383
+ DECLARE_RESULT_VECTOR (0 , 2 );
384
+ DECLARE_RESULT_VECTOR (0 , 3 );
385
+ DECLARE_RESULT_VECTOR (1 , 0 );
386
+ DECLARE_RESULT_VECTOR (1 , 1 );
387
+ DECLARE_RESULT_VECTOR (1 , 2 );
388
+ DECLARE_RESULT_VECTOR (1 , 3 );
389
+
390
+ if (LIKELY (packed_a != NULL )) {
391
+ for (; k < K ; k ++ ) {
392
+
393
+ BROADCAST_LOAD_B (0 , 0 );
394
+ UNPACK_VECTOR_A (0 , 0 );
395
+ UPDATE_RESULT_VECTOR (pg_true , 0 , 0 , 0 );
396
+ BROADCAST_LOAD_B (1 , 0 );
397
+ UPDATE_RESULT_VECTOR (pg_true , 0 , 1 , 0 );
398
+ UNPACK_VECTOR_A (1 , 0 );
399
+ UPDATE_RESULT_VECTOR (pg_true , 1 , 0 , 0 );
400
+ UPDATE_RESULT_VECTOR (pg_true , 1 , 1 , 0 );
401
+ BROADCAST_LOAD_B (2 , 0 );
402
+ UPDATE_RESULT_VECTOR (pg_true , 0 , 2 , 0 );
403
+ UPDATE_RESULT_VECTOR (pg_true , 1 , 2 , 0 );
404
+ BROADCAST_LOAD_B (3 , 0 );
405
+ UPDATE_RESULT_VECTOR (pg_true , 0 , 3 , 0 );
406
+ UPDATE_RESULT_VECTOR (pg_true , 1 , 3 , 0 );
289
407
}
290
408
} else {
291
409
for (; k < K ; k ++ ) {
@@ -405,6 +523,55 @@ CNAME(BLASLONG M,
405
523
CREATE_A_POINTER (0 , 0 );
406
524
407
525
BLASLONG j = 0 ;
526
+ for (; j < n8 ; j += 8 ) {
527
+ CREATE_B_POINTER (0 , 0 );
528
+ CREATE_B_POINTER (1 , 1 );
529
+ CREATE_B_POINTER (2 , 2 );
530
+ CREATE_B_POINTER (3 , 3 );
531
+ CREATE_B_POINTER (4 , 4 );
532
+ CREATE_B_POINTER (5 , 5 );
533
+ CREATE_B_POINTER (6 , 6 );
534
+ CREATE_B_POINTER (7 , 7 );
535
+ UPDATE_B_POINTER (8 );
536
+
537
+ BLASLONG k = 0 ;
538
+ DECLARE_RESULT_VECTOR (0 , 0 );
539
+ DECLARE_RESULT_VECTOR (0 , 1 );
540
+ DECLARE_RESULT_VECTOR (0 , 2 );
541
+ DECLARE_RESULT_VECTOR (0 , 3 );
542
+ DECLARE_RESULT_VECTOR (0 , 4 );
543
+ DECLARE_RESULT_VECTOR (0 , 5 );
544
+ DECLARE_RESULT_VECTOR (0 , 6 );
545
+ DECLARE_RESULT_VECTOR (0 , 7 );
546
+ for (; k < K ; k ++ ) {
547
+ BROADCAST_LOAD_B (0 , 0 );
548
+ GATHER_LOAD_A (pg_true , 0 , 0 );
549
+ UPDATE_RESULT_VECTOR (pg_true , 0 , 0 , 0 );
550
+ BROADCAST_LOAD_B (1 , 0 );
551
+ UPDATE_RESULT_VECTOR (pg_true , 0 , 1 , 0 );
552
+ BROADCAST_LOAD_B (2 , 0 );
553
+ UPDATE_RESULT_VECTOR (pg_true , 0 , 2 , 0 );
554
+ BROADCAST_LOAD_B (3 , 0 );
555
+ UPDATE_RESULT_VECTOR (pg_true , 0 , 3 , 0 );
556
+ BROADCAST_LOAD_B (4 , 0 );
557
+ UPDATE_RESULT_VECTOR (pg_true , 0 , 4 , 0 );
558
+ BROADCAST_LOAD_B (5 , 0 );
559
+ UPDATE_RESULT_VECTOR (pg_true , 0 , 5 , 0 );
560
+ BROADCAST_LOAD_B (6 , 0 );
561
+ UPDATE_RESULT_VECTOR (pg_true , 0 , 6 , 0 );
562
+ BROADCAST_LOAD_B (7 , 0 );
563
+ UPDATE_RESULT_VECTOR (pg_true , 0 , 7 , 0 );
564
+ }
565
+ VECTOR_STORE (pg_true , 0 , 0 );
566
+ VECTOR_STORE (pg_true , 0 , 1 );
567
+ VECTOR_STORE (pg_true , 0 , 2 );
568
+ VECTOR_STORE (pg_true , 0 , 3 );
569
+ VECTOR_STORE (pg_true , 0 , 4 );
570
+ VECTOR_STORE (pg_true , 0 , 5 );
571
+ VECTOR_STORE (pg_true , 0 , 6 );
572
+ VECTOR_STORE (pg_true , 0 , 7 );
573
+ INCR_C_POINTER (0 , 8 );
574
+ }
408
575
for (; j < n4 ; j += 4 ) {
409
576
410
577
CREATE_B_POINTER (0 , 0 );
@@ -487,6 +654,55 @@ CNAME(BLASLONG M,
487
654
CREATE_A_POINTER (0 , 0 );
488
655
489
656
BLASLONG j = 0 ;
657
+ for (; j < n8 ; j += 8 ) {
658
+ CREATE_B_POINTER (0 , 0 );
659
+ CREATE_B_POINTER (1 , 1 );
660
+ CREATE_B_POINTER (2 , 2 );
661
+ CREATE_B_POINTER (3 , 3 );
662
+ CREATE_B_POINTER (4 , 4 );
663
+ CREATE_B_POINTER (5 , 5 );
664
+ CREATE_B_POINTER (6 , 6 );
665
+ CREATE_B_POINTER (7 , 7 );
666
+ UPDATE_B_POINTER (8 );
667
+
668
+ BLASLONG k = 0 ;
669
+ DECLARE_RESULT_VECTOR (0 , 0 );
670
+ DECLARE_RESULT_VECTOR (0 , 1 );
671
+ DECLARE_RESULT_VECTOR (0 , 2 );
672
+ DECLARE_RESULT_VECTOR (0 , 3 );
673
+ DECLARE_RESULT_VECTOR (0 , 4 );
674
+ DECLARE_RESULT_VECTOR (0 , 5 );
675
+ DECLARE_RESULT_VECTOR (0 , 6 );
676
+ DECLARE_RESULT_VECTOR (0 , 7 );
677
+ for (; k < K ; k ++ ) {
678
+ BROADCAST_LOAD_B (0 , 0 );
679
+ GATHER_LOAD_A (pg_tail , 0 , 0 );
680
+ UPDATE_RESULT_VECTOR (pg_tail , 0 , 0 , 0 );
681
+ BROADCAST_LOAD_B (1 , 0 );
682
+ UPDATE_RESULT_VECTOR (pg_tail , 0 , 1 , 0 );
683
+ BROADCAST_LOAD_B (2 , 0 );
684
+ UPDATE_RESULT_VECTOR (pg_tail , 0 , 2 , 0 );
685
+ BROADCAST_LOAD_B (3 , 0 );
686
+ UPDATE_RESULT_VECTOR (pg_tail , 0 , 3 , 0 );
687
+ BROADCAST_LOAD_B (4 , 0 );
688
+ UPDATE_RESULT_VECTOR (pg_tail , 0 , 4 , 0 );
689
+ BROADCAST_LOAD_B (5 , 0 );
690
+ UPDATE_RESULT_VECTOR (pg_tail , 0 , 5 , 0 );
691
+ BROADCAST_LOAD_B (6 , 0 );
692
+ UPDATE_RESULT_VECTOR (pg_tail , 0 , 6 , 0 );
693
+ BROADCAST_LOAD_B (7 , 0 );
694
+ UPDATE_RESULT_VECTOR (pg_tail , 0 , 7 , 0 );
695
+ }
696
+ VECTOR_STORE (pg_tail , 0 , 0 );
697
+ VECTOR_STORE (pg_tail , 0 , 1 );
698
+ VECTOR_STORE (pg_tail , 0 , 2 );
699
+ VECTOR_STORE (pg_tail , 0 , 3 );
700
+ VECTOR_STORE (pg_tail , 0 , 4 );
701
+ VECTOR_STORE (pg_tail , 0 , 5 );
702
+ VECTOR_STORE (pg_tail , 0 , 6 );
703
+ VECTOR_STORE (pg_tail , 0 , 7 );
704
+ INCR_C_POINTER (0 , 8 );
705
+ }
490
706
for (; j < n4 ; j += 4 ) {
491
707
492
708
CREATE_B_POINTER (0 , 0 );
0 commit comments