@@ -265,43 +265,7 @@ CNAME(BLASLONG M,
265
265
266
266
if (LIKELY (packed_a != NULL )) {
267
267
if (j == 0 ) {
268
- for (; k < k2 ; k += 2 ) {
269
-
270
- VECTOR_LOAD_B_K2 (0 , 0 );
271
- VECTOR_LOAD_B_K2 (1 , 0 );
272
- TRANSPOSE_B2_K2 (0 , 1 , 0 , 1 );
273
- SCALE_B2_K2 (0 , 0 , 1 );
274
- GATHER_LOAD_A (pg_true , 0 , 0 );
275
- VECTOR_PACK_A (0 , 0 );
276
- UPDATE_RESULT_VECTOR_QUADWORD (0 , 0 , 0 , 0 , 0 );
277
- UPDATE_RESULT_VECTOR_QUADWORD (0 , 1 , 0 , 1 , 0 );
278
- GATHER_LOAD_A (pg_true , 0 , 1 );
279
- VECTOR_PACK_A (0 , 1 );
280
- UPDATE_RESULT_VECTOR_QUADWORD (0 , 0 , 0 , 0 , 1 );
281
- UPDATE_RESULT_VECTOR_QUADWORD (0 , 1 , 0 , 1 , 1 );
282
- VECTOR_LOAD_B_K2 (2 , 0 );
283
- VECTOR_LOAD_B_K2 (3 , 0 );
284
- TRANSPOSE_B2_K2 (2 , 3 , 0 , 1 );
285
- SCALE_B2_K2 (2 , 0 , 1 );
286
- UPDATE_RESULT_VECTOR_QUADWORD (0 , 2 , 2 , 0 , 0 );
287
- UPDATE_RESULT_VECTOR_QUADWORD (0 , 3 , 2 , 1 , 0 );
288
- UPDATE_RESULT_VECTOR_QUADWORD (0 , 2 , 2 , 0 , 1 );
289
- UPDATE_RESULT_VECTOR_QUADWORD (0 , 3 , 2 , 1 , 1 );
290
- GATHER_LOAD_A (pg_true , 1 , 0 );
291
- VECTOR_PACK_A (1 , 0 );
292
- UPDATE_RESULT_VECTOR_QUADWORD (1 , 0 , 0 , 0 , 0 );
293
- UPDATE_RESULT_VECTOR_QUADWORD (1 , 1 , 0 , 1 , 0 );
294
- UPDATE_RESULT_VECTOR_QUADWORD (1 , 2 , 2 , 0 , 0 );
295
- UPDATE_RESULT_VECTOR_QUADWORD (1 , 3 , 2 , 1 , 0 );
296
- GATHER_LOAD_A (pg_true , 1 , 1 );
297
- VECTOR_PACK_A (1 , 1 );
298
- UPDATE_RESULT_VECTOR_QUADWORD (1 , 0 , 0 , 0 , 1 );
299
- UPDATE_RESULT_VECTOR_QUADWORD (1 , 1 , 0 , 1 , 1 );
300
- UPDATE_RESULT_VECTOR_QUADWORD (1 , 2 , 2 , 0 , 1 );
301
- UPDATE_RESULT_VECTOR_QUADWORD (1 , 3 , 2 , 1 , 1 );
302
- }
303
268
for (; k < K ; k ++ ) {
304
-
305
269
BROADCAST_LOAD_B (0 , 0 );
306
270
GATHER_LOAD_A (pg_true , 0 , 0 );
307
271
VECTOR_PACK_A (0 , 0 );
@@ -320,39 +284,7 @@ CNAME(BLASLONG M,
320
284
UPDATE_RESULT_VECTOR (pg_true , 1 , 3 , 0 );
321
285
}
322
286
} else {
323
- for (; k < k2 ; k += 2 ) {
324
-
325
- VECTOR_LOAD_B_K2 (0 , 0 );
326
- VECTOR_LOAD_B_K2 (1 , 0 );
327
- TRANSPOSE_B2_K2 (0 , 1 , 0 , 1 );
328
- SCALE_B2_K2 (0 , 0 , 1 );
329
- UNPACK_VECTOR_A (0 , 0 );
330
- UPDATE_RESULT_VECTOR_QUADWORD (0 , 0 , 0 , 0 , 0 );
331
- UPDATE_RESULT_VECTOR_QUADWORD (0 , 1 , 0 , 1 , 0 );
332
- UNPACK_VECTOR_A (0 , 1 );
333
- UPDATE_RESULT_VECTOR_QUADWORD (0 , 0 , 0 , 0 , 1 );
334
- UPDATE_RESULT_VECTOR_QUADWORD (0 , 1 , 0 , 1 , 1 );
335
- VECTOR_LOAD_B_K2 (2 , 0 );
336
- VECTOR_LOAD_B_K2 (3 , 0 );
337
- TRANSPOSE_B2_K2 (2 , 3 , 0 , 1 );
338
- SCALE_B2_K2 (2 , 0 , 1 );
339
- UPDATE_RESULT_VECTOR_QUADWORD (0 , 2 , 2 , 0 , 0 );
340
- UPDATE_RESULT_VECTOR_QUADWORD (0 , 3 , 2 , 1 , 0 );
341
- UPDATE_RESULT_VECTOR_QUADWORD (0 , 2 , 2 , 0 , 1 );
342
- UPDATE_RESULT_VECTOR_QUADWORD (0 , 3 , 2 , 1 , 1 );
343
- UNPACK_VECTOR_A (1 , 0 );
344
- UPDATE_RESULT_VECTOR_QUADWORD (1 , 0 , 0 , 0 , 0 );
345
- UPDATE_RESULT_VECTOR_QUADWORD (1 , 1 , 0 , 1 , 0 );
346
- UPDATE_RESULT_VECTOR_QUADWORD (1 , 2 , 2 , 0 , 0 );
347
- UPDATE_RESULT_VECTOR_QUADWORD (1 , 3 , 2 , 1 , 0 );
348
- UNPACK_VECTOR_A (1 , 1 );
349
- UPDATE_RESULT_VECTOR_QUADWORD (1 , 0 , 0 , 0 , 1 );
350
- UPDATE_RESULT_VECTOR_QUADWORD (1 , 1 , 0 , 1 , 1 );
351
- UPDATE_RESULT_VECTOR_QUADWORD (1 , 2 , 2 , 0 , 1 );
352
- UPDATE_RESULT_VECTOR_QUADWORD (1 , 3 , 2 , 1 , 1 );
353
- }
354
287
for (; k < K ; k ++ ) {
355
-
356
288
BROADCAST_LOAD_B (0 , 0 );
357
289
UNPACK_VECTOR_A (0 , 0 );
358
290
UPDATE_RESULT_VECTOR (pg_true , 0 , 0 , 0 );
@@ -370,37 +302,6 @@ CNAME(BLASLONG M,
370
302
}
371
303
}
372
304
} else {
373
- for (; k < k2 ; k += 2 ) {
374
-
375
- VECTOR_LOAD_B_K2 (0 , 0 );
376
- VECTOR_LOAD_B_K2 (1 , 0 );
377
- TRANSPOSE_B2_K2 (0 , 1 , 0 , 1 );
378
- SCALE_B2_K2 (0 , 0 , 1 );
379
- GATHER_LOAD_A (pg_true , 0 , 0 );
380
- UPDATE_RESULT_VECTOR_QUADWORD (0 , 0 , 0 , 0 , 0 );
381
- UPDATE_RESULT_VECTOR_QUADWORD (0 , 1 , 0 , 1 , 0 );
382
- GATHER_LOAD_A (pg_true , 0 , 1 );
383
- UPDATE_RESULT_VECTOR_QUADWORD (0 , 0 , 0 , 0 , 1 );
384
- UPDATE_RESULT_VECTOR_QUADWORD (0 , 1 , 0 , 1 , 1 );
385
- VECTOR_LOAD_B_K2 (2 , 0 );
386
- VECTOR_LOAD_B_K2 (3 , 0 );
387
- TRANSPOSE_B2_K2 (2 , 3 , 0 , 1 );
388
- SCALE_B2_K2 (2 , 0 , 1 );
389
- UPDATE_RESULT_VECTOR_QUADWORD (0 , 2 , 2 , 0 , 0 );
390
- UPDATE_RESULT_VECTOR_QUADWORD (0 , 3 , 2 , 1 , 0 );
391
- UPDATE_RESULT_VECTOR_QUADWORD (0 , 2 , 2 , 0 , 1 );
392
- UPDATE_RESULT_VECTOR_QUADWORD (0 , 3 , 2 , 1 , 1 );
393
- GATHER_LOAD_A (pg_true , 1 , 0 );
394
- UPDATE_RESULT_VECTOR_QUADWORD (1 , 0 , 0 , 0 , 0 );
395
- UPDATE_RESULT_VECTOR_QUADWORD (1 , 1 , 0 , 1 , 0 );
396
- UPDATE_RESULT_VECTOR_QUADWORD (1 , 2 , 2 , 0 , 0 );
397
- UPDATE_RESULT_VECTOR_QUADWORD (1 , 3 , 2 , 1 , 0 );
398
- GATHER_LOAD_A (pg_true , 1 , 1 );
399
- UPDATE_RESULT_VECTOR_QUADWORD (1 , 0 , 0 , 0 , 1 );
400
- UPDATE_RESULT_VECTOR_QUADWORD (1 , 1 , 0 , 1 , 1 );
401
- UPDATE_RESULT_VECTOR_QUADWORD (1 , 2 , 2 , 0 , 1 );
402
- UPDATE_RESULT_VECTOR_QUADWORD (1 , 3 , 2 , 1 , 1 );
403
- }
404
305
for (; k < K ; k ++ ) {
405
306
406
307
BROADCAST_LOAD_B (0 , 0 );
@@ -443,27 +344,7 @@ CNAME(BLASLONG M,
443
344
DECLARE_RESULT_VECTOR (1 , 1 );
444
345
445
346
if (LIKELY (packed_a != NULL )) {
446
- for (; k < k2 ; k += 2 ) {
447
-
448
- VECTOR_LOAD_B_K2 (0 , 0 );
449
- VECTOR_LOAD_B_K2 (1 , 0 );
450
- TRANSPOSE_B2_K2 (0 , 1 , 0 , 1 );
451
- SCALE_B2_K2 (0 , 0 , 1 );
452
- UNPACK_VECTOR_A (0 , 0 );
453
- UPDATE_RESULT_VECTOR_QUADWORD (0 , 0 , 0 , 0 , 0 );
454
- UPDATE_RESULT_VECTOR_QUADWORD (0 , 1 , 0 , 1 , 0 );
455
- UNPACK_VECTOR_A (0 , 1 );
456
- UPDATE_RESULT_VECTOR_QUADWORD (0 , 0 , 0 , 0 , 1 );
457
- UPDATE_RESULT_VECTOR_QUADWORD (0 , 1 , 0 , 1 , 1 );
458
- UNPACK_VECTOR_A (1 , 0 );
459
- UPDATE_RESULT_VECTOR_QUADWORD (1 , 0 , 0 , 0 , 0 );
460
- UPDATE_RESULT_VECTOR_QUADWORD (1 , 1 , 0 , 1 , 0 );
461
- UNPACK_VECTOR_A (1 , 1 );
462
- UPDATE_RESULT_VECTOR_QUADWORD (1 , 0 , 0 , 0 , 1 );
463
- UPDATE_RESULT_VECTOR_QUADWORD (1 , 1 , 0 , 1 , 1 );
464
- }
465
347
for (; k < K ; k ++ ) {
466
-
467
348
BROADCAST_LOAD_B (0 , 0 );
468
349
UNPACK_VECTOR_A (0 , 0 );
469
350
UPDATE_RESULT_VECTOR (pg_true , 0 , 0 , 0 );
@@ -474,27 +355,7 @@ CNAME(BLASLONG M,
474
355
UPDATE_RESULT_VECTOR (pg_true , 1 , 1 , 0 );
475
356
}
476
357
} else {
477
- for (; k < k2 ; k += 2 ) {
478
-
479
- VECTOR_LOAD_B_K2 (0 , 0 );
480
- VECTOR_LOAD_B_K2 (1 , 0 );
481
- TRANSPOSE_B2_K2 (0 , 1 , 0 , 1 );
482
- SCALE_B2_K2 (0 , 0 , 1 );
483
- GATHER_LOAD_A (pg_true , 0 , 0 );
484
- UPDATE_RESULT_VECTOR_QUADWORD (0 , 0 , 0 , 0 , 0 );
485
- UPDATE_RESULT_VECTOR_QUADWORD (0 , 1 , 0 , 1 , 0 );
486
- GATHER_LOAD_A (pg_true , 0 , 1 );
487
- UPDATE_RESULT_VECTOR_QUADWORD (0 , 0 , 0 , 0 , 1 );
488
- UPDATE_RESULT_VECTOR_QUADWORD (0 , 1 , 0 , 1 , 1 );
489
- GATHER_LOAD_A (pg_true , 1 , 0 );
490
- UPDATE_RESULT_VECTOR_QUADWORD (1 , 0 , 0 , 0 , 0 );
491
- UPDATE_RESULT_VECTOR_QUADWORD (1 , 1 , 0 , 1 , 0 );
492
- GATHER_LOAD_A (pg_true , 1 , 1 );
493
- UPDATE_RESULT_VECTOR_QUADWORD (1 , 0 , 0 , 0 , 1 );
494
- UPDATE_RESULT_VECTOR_QUADWORD (1 , 1 , 0 , 1 , 1 );
495
- }
496
358
for (; k < K ; k ++ ) {
497
-
498
359
BROADCAST_LOAD_B (0 , 0 );
499
360
GATHER_LOAD_A (pg_true , 0 , 0 );
500
361
UPDATE_RESULT_VECTOR (pg_true , 0 , 0 , 0 );
@@ -570,27 +431,6 @@ CNAME(BLASLONG M,
570
431
DECLARE_RESULT_VECTOR (0 , 2 );
571
432
DECLARE_RESULT_VECTOR (0 , 3 );
572
433
573
- for (; k < k2 ; k += 2 ) {
574
-
575
- VECTOR_LOAD_B_K2 (0 , 0 );
576
- VECTOR_LOAD_B_K2 (1 , 0 );
577
- TRANSPOSE_B2_K2 (0 , 1 , 0 , 1 );
578
- SCALE_B2_K2 (0 , 0 , 1 );
579
- GATHER_LOAD_A (pg_true , 0 , 0 );
580
- UPDATE_RESULT_VECTOR_QUADWORD (0 , 0 , 0 , 0 , 0 );
581
- UPDATE_RESULT_VECTOR_QUADWORD (0 , 1 , 0 , 1 , 0 );
582
- GATHER_LOAD_A (pg_true , 0 , 1 );
583
- UPDATE_RESULT_VECTOR_QUADWORD (0 , 0 , 0 , 0 , 1 );
584
- UPDATE_RESULT_VECTOR_QUADWORD (0 , 1 , 0 , 1 , 1 );
585
- VECTOR_LOAD_B_K2 (2 , 0 );
586
- VECTOR_LOAD_B_K2 (3 , 0 );
587
- TRANSPOSE_B2_K2 (2 , 3 , 0 , 1 );
588
- SCALE_B2_K2 (2 , 0 , 1 );
589
- UPDATE_RESULT_VECTOR_QUADWORD (0 , 2 , 2 , 0 , 0 );
590
- UPDATE_RESULT_VECTOR_QUADWORD (0 , 3 , 2 , 1 , 0 );
591
- UPDATE_RESULT_VECTOR_QUADWORD (0 , 2 , 2 , 0 , 1 );
592
- UPDATE_RESULT_VECTOR_QUADWORD (0 , 3 , 2 , 1 , 1 );
593
- }
594
434
for (; k < K ; k ++ ) {
595
435
596
436
BROADCAST_LOAD_B (0 , 0 );
@@ -619,19 +459,6 @@ CNAME(BLASLONG M,
619
459
DECLARE_RESULT_VECTOR (0 , 0 );
620
460
DECLARE_RESULT_VECTOR (0 , 1 );
621
461
622
- for (; k < k2 ; k += 2 ) {
623
-
624
- VECTOR_LOAD_B_K2 (0 , 0 );
625
- VECTOR_LOAD_B_K2 (1 , 0 );
626
- TRANSPOSE_B2_K2 (0 , 1 , 0 , 1 );
627
- SCALE_B2_K2 (0 , 0 , 1 );
628
- GATHER_LOAD_A (pg_true , 0 , 0 );
629
- UPDATE_RESULT_VECTOR_QUADWORD (0 , 0 , 0 , 0 , 0 );
630
- UPDATE_RESULT_VECTOR_QUADWORD (0 , 1 , 0 , 1 , 0 );
631
- GATHER_LOAD_A (pg_true , 0 , 1 );
632
- UPDATE_RESULT_VECTOR_QUADWORD (0 , 0 , 0 , 0 , 1 );
633
- UPDATE_RESULT_VECTOR_QUADWORD (0 , 1 , 0 , 1 , 1 );
634
- }
635
462
for (; k < K ; k ++ ) {
636
463
637
464
BROADCAST_LOAD_B (0 , 0 );
@@ -686,27 +513,6 @@ CNAME(BLASLONG M,
686
513
DECLARE_RESULT_VECTOR (0 , 2 );
687
514
DECLARE_RESULT_VECTOR (0 , 3 );
688
515
689
- for (; k < k2 ; k += 2 ) {
690
-
691
- VECTOR_LOAD_B_K2 (0 , 0 );
692
- VECTOR_LOAD_B_K2 (1 , 0 );
693
- TRANSPOSE_B2_K2 (0 , 1 , 0 , 1 );
694
- SCALE_B2_K2 (0 , 0 , 1 );
695
- GATHER_LOAD_A (pg_tail , 0 , 0 );
696
- UPDATE_RESULT_VECTOR_QUADWORD (0 , 0 , 0 , 0 , 0 );
697
- UPDATE_RESULT_VECTOR_QUADWORD (0 , 1 , 0 , 1 , 0 );
698
- GATHER_LOAD_A (pg_tail , 0 , 1 );
699
- UPDATE_RESULT_VECTOR_QUADWORD (0 , 0 , 0 , 0 , 1 );
700
- UPDATE_RESULT_VECTOR_QUADWORD (0 , 1 , 0 , 1 , 1 );
701
- VECTOR_LOAD_B_K2 (2 , 0 );
702
- VECTOR_LOAD_B_K2 (3 , 0 );
703
- TRANSPOSE_B2_K2 (2 , 3 , 0 , 1 );
704
- SCALE_B2_K2 (2 , 0 , 1 );
705
- UPDATE_RESULT_VECTOR_QUADWORD (0 , 2 , 2 , 0 , 0 );
706
- UPDATE_RESULT_VECTOR_QUADWORD (0 , 3 , 2 , 1 , 0 );
707
- UPDATE_RESULT_VECTOR_QUADWORD (0 , 2 , 2 , 0 , 1 );
708
- UPDATE_RESULT_VECTOR_QUADWORD (0 , 3 , 2 , 1 , 1 );
709
- }
710
516
for (; k < K ; k ++ ) {
711
517
712
518
BROADCAST_LOAD_B (0 , 0 );
@@ -735,19 +541,6 @@ CNAME(BLASLONG M,
735
541
DECLARE_RESULT_VECTOR (0 , 0 );
736
542
DECLARE_RESULT_VECTOR (0 , 1 );
737
543
738
- for (; k < k2 ; k += 2 ) {
739
-
740
- VECTOR_LOAD_B_K2 (0 , 0 );
741
- VECTOR_LOAD_B_K2 (1 , 0 );
742
- TRANSPOSE_B2_K2 (0 , 1 , 0 , 1 );
743
- SCALE_B2_K2 (0 , 0 , 1 );
744
- GATHER_LOAD_A (pg_tail , 0 , 0 );
745
- UPDATE_RESULT_VECTOR_QUADWORD (0 , 0 , 0 , 0 , 0 );
746
- UPDATE_RESULT_VECTOR_QUADWORD (0 , 1 , 0 , 1 , 0 );
747
- GATHER_LOAD_A (pg_tail , 0 , 1 );
748
- UPDATE_RESULT_VECTOR_QUADWORD (0 , 0 , 0 , 0 , 1 );
749
- UPDATE_RESULT_VECTOR_QUADWORD (0 , 1 , 0 , 1 , 1 );
750
- }
751
544
for (; k < K ; k ++ ) {
752
545
753
546
BROADCAST_LOAD_B (0 , 0 );
@@ -787,4 +580,4 @@ CNAME(BLASLONG M,
787
580
free (packed_a );
788
581
789
582
return 0 ;
790
- }
583
+ }
0 commit comments