@@ -46,13 +46,27 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
46
46
})
47
47
#endif
48
48
49
/*
 * A-operand addressing macros. In this diff view the "-" line is the removed
 * index-based form and the "+" lines are its pointer-based replacement.
 *   RESET_A_POINTER            sets a_offset back to A.
 *   CREATE_A_POINTER(m, scale) declares FLOAT* a_offset<m> = a_offset + scale.
 *   UPDATE_A_POINTER(scale)    advances a_offset by scale elements.
 *   A_ELEMENT_K(m, offset_k)   dereferences a_offset<m> + (k + offset_k) * lda;
 *                              k and lda come from the expansion site.
 *   A_ELEMENT(m)               is A_ELEMENT_K(m, 0).
 * The new A_ELEMENT_K no longer adds (i + m) itself, so the row offset has to
 * be carried by the a_offset<m> pointer that CREATE_A_POINTER declared.
 * NOTE(review): `scale` is expanded without parentheses and the RESET/CREATE/
 * UPDATE bodies already end in ';' - pass simple expressions only.
 * NOTE(review): the blank between each macro name and '(' looks like an
 * extraction artifact; taken literally these would be object-like macros.
 */
- #define A_ELEMENT_K (m , offset_k ) A[(i + (m)) + (k + offset_k) * lda]
49
+ #define RESET_A_POINTER () a_offset = A;
50
+
51
+ #define CREATE_A_POINTER (m , scale ) FLOAT* a_offset##m = a_offset + scale;
52
+ #define UPDATE_A_POINTER (scale ) a_offset = a_offset + scale;
53
+ #define A_ELEMENT_K (m , offset_k ) *(a_offset##m + (k + offset_k) * lda)
50
54
#define A_ELEMENT (m ) A_ELEMENT_K(m, 0)
51
55
52
/*
 * B-operand addressing macros. The "-" line is the removed index-based form,
 * the "+" lines are the pointer-based replacement.
 *   RESET_B_POINTER            sets b_offset back to B.
 *   CREATE_B_POINTER(n, scale) declares FLOAT* b_offset<n> = b_offset + scale * ldb.
 *   UPDATE_B_POINTER(scale)    advances b_offset by scale * ldb elements.
 *   B_ELEMENT_K(n, offset_k)   dereferences b_offset<n> + (k + offset_k);
 *                              k comes from the expansion site.
 *   B_ELEMENT(n)               is B_ELEMENT_K(n, 0).
 * The new B_ELEMENT_K no longer adds (j + n) * ldb itself, so the column
 * offset has to be carried by the b_offset<n> pointer.
 * NOTE(review): `scale * ldb` is expanded without parentheses around `scale`,
 * so an argument such as `a + b` would bind as a + (b * ldb).
 */
- #define B_ELEMENT_K (n , offset_k ) B[(k + offset_k) + (j + (n)) * ldb]
56
+ #define RESET_B_POINTER () b_offset = B;
57
+
58
+ #define CREATE_B_POINTER (n , scale ) FLOAT* b_offset##n = b_offset + scale * ldb;
59
+ #define UPDATE_B_POINTER (scale ) b_offset = b_offset + scale * ldb;
60
+ #define B_ELEMENT_K (n , offset_k ) *(b_offset##n + (k + offset_k))
53
61
#define B_ELEMENT (n ) B_ELEMENT_K(n, 0)
54
62
55
/*
 * C-operand addressing macros. The "-" line is the removed index-based form,
 * the "+" lines are the pointer-based replacement.
 *   CREATE_C_POINTER(n, scale) declares FLOAT* c_offset<n> = c_offset + scale * ldc.
 *   INCR_C_POINTER(m, incr)    expands to nothing: its whole body sits behind
 *                              a line comment, so every use is a no-op.
 *   UPDATE_C_POINTER(scale)    advances c_offset by scale * ldc elements.
 *   C_ELEMENT(m, n)            dereferences c_offset<n> + ((m * v_size) + i);
 *                              i and v_size come from the expansion site.
 * The old C_ELEMENT took an element offset for m; the new one takes a block
 * index and multiplies it by v_size itself. The row position still comes from
 * i because INCR_C_POINTER is disabled.
 * NOTE(review): `m` and `scale` are expanded without parentheses.
 * The two trailing commented-out lines keep the old index-based C_ELEMENT.
 */
- #define C_ELEMENT (m , n ) C[(i + (m)) + (j + (n)) * ldc]
63
+ #define CREATE_C_POINTER (n , scale ) FLOAT* c_offset##n = c_offset + scale * ldc;
64
+ #define INCR_C_POINTER (m , incr ) // c_offset ## m += incr;
65
+ #define UPDATE_C_POINTER (scale ) c_offset = c_offset + scale * ldc;
66
+ #define C_ELEMENT (m , n ) *(c_offset##n + ((m * v_size) + i))
67
+
68
+ // #undef C_ELEMENT
69
+ // #define C_ELEMENT(m, n) C[(i+(m))+(j+(n))*ldc]
56
70
57
71
/*
 * Accessors for the packed_b scratch buffer: entry n of row (k + offset_k),
 * with a fixed stride of 4 entries per row. k comes from the expansion site.
 * PACK_ELEMENT(n) is the offset_k == 0 shorthand.
 * NOTE(review): `n` is expanded without parentheses.
 */
#define PACK_ELEMENT_K (n , offset_k ) packed_b[(k + offset_k) * 4 + n]
58
72
#define PACK_ELEMENT (n ) PACK_ELEMENT_K(n, 0)
@@ -112,8 +126,7 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
112
126
/*
 * Declares an svfloat64_t whose name is pasted from b, s, n, _k and offset_k,
 * initialised with svdup_f64 of the scalar B_ELEMENT_K(n, offset_k).
 */
#define BROADCAST_LOAD_B (n , offset_k ) \
113
127
svfloat64_t b##s##n##_k##offset_k = svdup_f64(B_ELEMENT_K(n, offset_k));
114
128
/*
 * Declares an svfloat64_t whose name is pasted from a, s, m, _k and offset_k,
 * loaded with svld1 under predicate pg from the address of A_ELEMENT_K.
 * The removed ("-") form passed v_size * m as the row offset; the added ("+")
 * form passes the block index m, which A_ELEMENT_K uses to select a_offset<m>.
 */
#define VECTOR_LOAD_A (pg , m , offset_k ) \
115
- svfloat64_t a##s##m##_k##offset_k = \
116
- svld1(pg, &A_ELEMENT_K(v_size * m, offset_k));
129
+ svfloat64_t a##s##m##_k##offset_k = svld1(pg, &A_ELEMENT_K(m, offset_k));
117
130
/*
 * Declares an svfloat64_t whose name is pasted from b, s, n, _k and offset_k,
 * loaded with svld1rq under pg_true from the address of B_ELEMENT_K(n, offset_k).
 * pg_true is taken from the expansion site.
 */
#define QUADWORD_LOAD_B (n , offset_k ) \
118
131
svfloat64_t b##s##n##_k##offset_k = \
119
132
svld1rq(pg_true, &B_ELEMENT_K(n, offset_k));
@@ -140,26 +153,23 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
140
153
/*
 * Write-back macros for the accumulator result<m><n>; alpha_vec, beta_vec and
 * ldc_vec are taken from the expansion site.
 *
 * With B0 defined:
 *   VECTOR_STORE  scales result<m><n> by alpha_vec under pg (svmul_m), then
 *                 stores it with svst1 at &C_ELEMENT(m, n).
 *   SCATTER_STORE does the same scaling, then stores with svst1_scatter_index
 *                 using ldc_vec as the index vector.
 * Without B0:
 *   both macros also pass the current C values (svld1 / svld1_gather_index)
 *   and beta_vec through svmla_m into result<m><n> before storing.
 *
 * Changes shown by the "-"/"+" lines: C_ELEMENT now receives the block index m
 * instead of v_size * m, and the per-use svindex_u64(0LL, ldc) is replaced by
 * ldc_vec.
 * NOTE(review): the definition of ldc_vec is not visible in this excerpt -
 * presumably it holds the same svindex_u64(0LL, ldc) value; confirm.
 */
#ifdef B0
141
154
#define VECTOR_STORE (pg , m , n ) \
142
155
result##m##n = svmul_m(pg, result##m##n, alpha_vec); \
143
- svst1(pg, &C_ELEMENT(v_size* m, n), result##m##n);
156
+ svst1(pg, &C_ELEMENT(m, n), result##m##n);
144
157
#define SCATTER_STORE (pg , m , n ) \
145
158
result##m##n = svmul_m(pg, result##m##n, alpha_vec); \
146
- svst1_scatter_index( \
147
- pg, &C_ELEMENT(v_size* m, n), svindex_u64(0LL, ldc), result##m##n);
159
+ svst1_scatter_index(pg, &C_ELEMENT(m, n), ldc_vec, result##m##n);
148
160
#else
149
161
#define VECTOR_STORE (pg , m , n ) \
150
162
result##m##n = svmul_m(pg, result##m##n, alpha_vec); \
151
163
result##m##n = \
152
- svmla_m(pg, result##m##n, svld1(pg, &C_ELEMENT(v_size * m, n)), beta_vec); \
153
- svst1(pg, &C_ELEMENT(v_size* m, n), result##m##n);
164
+ svmla_m(pg, result##m##n, svld1(pg, &C_ELEMENT(m, n)), beta_vec); \
165
+ svst1(pg, &C_ELEMENT(m, n), result##m##n);
154
166
#define SCATTER_STORE (pg , m , n ) \
155
167
result##m##n = svmul_m(pg, result##m##n, alpha_vec); \
156
- result##m##n = svmla_m( \
157
- pg, \
158
- result##m##n, \
159
- svld1_gather_index(pg, &C_ELEMENT(v_size * m, n), svindex_u64(0LL, ldc)), \
160
- beta_vec); \
161
- svst1_scatter_index( \
162
- pg, &C_ELEMENT(v_size* m, n), svindex_u64(0LL, ldc), result##m##n);
168
+ result##m##n = svmla_m(pg, \
169
+ result##m##n, \
170
+ svld1_gather_index(pg, &C_ELEMENT(m, n), ldc_vec), \
171
+ beta_vec); \
172
+ svst1_scatter_index(pg, &C_ELEMENT(m, n), ldc_vec, result##m##n);
163
173
#endif
164
174
165
175
#ifndef LIKELY
@@ -169,13 +179,6 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
169
179
#define LIKELY (x ) (x)
170
180
#endif
171
181
#endif
172
/*
 * UNLIKELY(x): defined only if not already defined. It expands to
 * __builtin_expect(!!(x), 0) when __GNUC__ is defined and to plain (x)
 * otherwise. Every line carries the "-" marker, i.e. the change shown here
 * deletes this definition.
 */
- #ifndef UNLIKELY
173
- #ifdef __GNUC__
174
- #define UNLIKELY (x ) __builtin_expect(!!(x), 0)
175
- #else
176
- #define UNLIKELY (x ) (x)
177
- #endif
178
- #endif
179
182
180
183
#ifdef B0
181
184
int
@@ -223,12 +226,29 @@ CNAME(BLASLONG M,
223
226
FLOAT * packed_b =
224
227
(pack_b ) ? packed_b = (FLOAT * )malloc (K * 4 * sizeof (FLOAT )) : NULL ;
225
228
229
+ FLOAT * b_offset = B ;
230
+ FLOAT * a_offset = A ;
231
+ FLOAT * c_offset = C ;
232
+
226
233
BLASLONG j = 0 ;
227
234
for (; j < n4 ; j += 4 ) {
228
235
236
+ CREATE_C_POINTER (0 , 0 );
237
+ CREATE_C_POINTER (1 , 1 );
238
+ CREATE_C_POINTER (2 , 2 );
239
+ CREATE_C_POINTER (3 , 3 );
240
+ CREATE_B_POINTER (0 , 0 );
241
+ CREATE_B_POINTER (1 , 1 );
242
+ CREATE_B_POINTER (2 , 2 );
243
+ CREATE_B_POINTER (3 , 3 );
244
+
229
245
BLASLONG i = 0 ;
230
246
for (; i < v_m2 ; i += v_size2 ) {
231
247
248
+ CREATE_A_POINTER (0 , 0 );
249
+ CREATE_A_POINTER (1 , v_size );
250
+ UPDATE_A_POINTER (v_size2 );
251
+
232
252
BLASLONG k = 0 ;
233
253
DECLARE_RESULT_VECTOR (0 , 0 );
234
254
DECLARE_RESULT_VECTOR (0 , 1 );
@@ -372,9 +392,16 @@ CNAME(BLASLONG M,
372
392
VECTOR_STORE (pg_true , 1 , 1 );
373
393
VECTOR_STORE (pg_true , 1 , 2 );
374
394
VECTOR_STORE (pg_true , 1 , 3 );
395
+ INCR_C_POINTER (0 , v_size2 );
396
+ INCR_C_POINTER (1 , v_size2 );
397
+ INCR_C_POINTER (2 , v_size2 );
398
+ INCR_C_POINTER (3 , v_size2 );
375
399
}
376
400
for (; i < v_m1 ; i += v_size ) {
377
401
402
+ CREATE_A_POINTER (0 , 0 );
403
+ UPDATE_A_POINTER (v_size );
404
+
378
405
BLASLONG k = 0 ;
379
406
DECLARE_RESULT_VECTOR (0 , 0 );
380
407
DECLARE_RESULT_VECTOR (0 , 1 );
@@ -431,9 +458,15 @@ CNAME(BLASLONG M,
431
458
VECTOR_STORE (pg_true , 0 , 1 );
432
459
VECTOR_STORE (pg_true , 0 , 2 );
433
460
VECTOR_STORE (pg_true , 0 , 3 );
461
+ INCR_C_POINTER (0 , v_size );
462
+ INCR_C_POINTER (1 , v_size );
463
+ INCR_C_POINTER (2 , v_size );
464
+ INCR_C_POINTER (3 , v_size );
434
465
}
435
466
for (; i < M ; i += v_size ) {
436
467
const svbool_t pg_tail = svwhilelt_b64 ((uint64_t )i , (uint64_t )(M ));
468
+ CREATE_A_POINTER (0 , 0 );
469
+ UPDATE_A_POINTER (0 );
437
470
438
471
BLASLONG k = 0 ;
439
472
DECLARE_RESULT_VECTOR (0 , 0 );
@@ -491,13 +524,30 @@ CNAME(BLASLONG M,
491
524
VECTOR_STORE (pg_tail , 0 , 1 );
492
525
VECTOR_STORE (pg_tail , 0 , 2 );
493
526
VECTOR_STORE (pg_tail , 0 , 3 );
527
+ INCR_C_POINTER (0 , 0 );
528
+ INCR_C_POINTER (1 , 0 );
529
+ INCR_C_POINTER (2 , 0 );
530
+ INCR_C_POINTER (3 , 0 );
494
531
}
532
+
533
+ UPDATE_B_POINTER (4 );
534
+ RESET_A_POINTER ();
535
+ UPDATE_C_POINTER (4 );
495
536
}
496
537
for (; j < n2 ; j += 2 ) {
497
538
539
+ CREATE_C_POINTER (0 , 0 );
540
+ CREATE_C_POINTER (1 , 1 );
541
+ CREATE_B_POINTER (0 , 0 );
542
+ CREATE_B_POINTER (1 , 1 );
543
+
498
544
BLASLONG i = 0 ;
499
545
for (; i < v_m2 ; i += v_size2 ) {
500
546
547
+ CREATE_A_POINTER (0 , 0 );
548
+ CREATE_A_POINTER (1 , v_size );
549
+ UPDATE_A_POINTER (v_size2 );
550
+
501
551
BLASLONG k = 0 ;
502
552
DECLARE_RESULT_VECTOR (0 , 0 );
503
553
DECLARE_RESULT_VECTOR (0 , 1 );
@@ -538,9 +588,14 @@ CNAME(BLASLONG M,
538
588
VECTOR_STORE (pg_true , 0 , 1 );
539
589
VECTOR_STORE (pg_true , 1 , 0 );
540
590
VECTOR_STORE (pg_true , 1 , 1 );
591
+ INCR_C_POINTER (0 , v_size2 );
592
+ INCR_C_POINTER (1 , v_size2 );
541
593
}
542
594
for (; i < v_m1 ; i += v_size ) {
543
595
596
+ CREATE_A_POINTER (0 , 0 );
597
+ UPDATE_A_POINTER (v_size );
598
+
544
599
BLASLONG k = 0 ;
545
600
DECLARE_RESULT_VECTOR (0 , 0 );
546
601
DECLARE_RESULT_VECTOR (0 , 1 );
@@ -568,9 +623,13 @@ CNAME(BLASLONG M,
568
623
}
569
624
VECTOR_STORE (pg_true , 0 , 0 );
570
625
VECTOR_STORE (pg_true , 0 , 1 );
626
+ INCR_C_POINTER (0 , v_size );
627
+ INCR_C_POINTER (1 , v_size );
571
628
}
572
629
for (; i < M ; i += v_size ) {
573
630
const svbool_t pg_tail = svwhilelt_b64 ((uint64_t )i , (uint64_t )(M ));
631
+ CREATE_A_POINTER (0 , 0 );
632
+ UPDATE_A_POINTER (0 );
574
633
575
634
BLASLONG k = 0 ;
576
635
DECLARE_RESULT_VECTOR (0 , 0 );
@@ -599,13 +658,26 @@ CNAME(BLASLONG M,
599
658
}
600
659
VECTOR_STORE (pg_tail , 0 , 0 );
601
660
VECTOR_STORE (pg_tail , 0 , 1 );
661
+ INCR_C_POINTER (0 , 0 );
662
+ INCR_C_POINTER (1 , 0 );
602
663
}
664
+
665
+ UPDATE_B_POINTER (2 );
666
+ RESET_A_POINTER ();
667
+ UPDATE_C_POINTER (2 );
603
668
}
604
669
for (; j < N ; j ++ ) {
605
670
671
+ CREATE_C_POINTER (0 , 0 );
672
+ CREATE_B_POINTER (0 , 0 );
673
+
606
674
BLASLONG i = 0 ;
607
675
for (; i < v_m2 ; i += v_size2 ) {
608
676
677
+ CREATE_A_POINTER (0 , 0 );
678
+ CREATE_A_POINTER (1 , v_size );
679
+ UPDATE_A_POINTER (v_size2 );
680
+
609
681
BLASLONG k = 0 ;
610
682
DECLARE_RESULT_VECTOR (0 , 0 );
611
683
DECLARE_RESULT_VECTOR (1 , 0 );
@@ -620,9 +692,13 @@ CNAME(BLASLONG M,
620
692
}
621
693
VECTOR_STORE (pg_true , 0 , 0 );
622
694
VECTOR_STORE (pg_true , 1 , 0 );
695
+ INCR_C_POINTER (0 , v_size2 );
623
696
}
624
697
for (; i < v_m1 ; i += v_size ) {
625
698
699
+ CREATE_A_POINTER (0 , 0 );
700
+ UPDATE_A_POINTER (v_size );
701
+
626
702
BLASLONG k = 0 ;
627
703
DECLARE_RESULT_VECTOR (0 , 0 );
628
704
@@ -633,9 +709,12 @@ CNAME(BLASLONG M,
633
709
UPDATE_RESULT_VECTOR (pg_true , 0 , 0 , 0 );
634
710
}
635
711
VECTOR_STORE (pg_true , 0 , 0 );
712
+ INCR_C_POINTER (0 , v_size );
636
713
}
637
714
for (; i < M ; i += v_size ) {
638
715
const svbool_t pg_tail = svwhilelt_b64 ((uint64_t )i , (uint64_t )(M ));
716
+ CREATE_A_POINTER (0 , 0 );
717
+ UPDATE_A_POINTER (0 );
639
718
640
719
BLASLONG k = 0 ;
641
720
DECLARE_RESULT_VECTOR (0 , 0 );
@@ -647,11 +726,16 @@ CNAME(BLASLONG M,
647
726
UPDATE_RESULT_VECTOR (pg_tail , 0 , 0 , 0 );
648
727
}
649
728
VECTOR_STORE (pg_tail , 0 , 0 );
729
+ INCR_C_POINTER (0 , 0 );
650
730
}
731
+
732
+ UPDATE_B_POINTER (1 );
733
+ RESET_A_POINTER ();
734
+ UPDATE_C_POINTER (1 );
651
735
}
652
736
653
737
if (pack_b )
654
738
free (packed_b );
655
739
656
740
return 0 ;
657
- }
741
+ }
0 commit comments