@@ -59,6 +59,83 @@ double fdotp_v64b(const double *a, const double *b, unsigned int avl) {
5959 return red ;
6060}
6161
62+ // 64-bit dot-product: a * b
63+ // m8 allows only for partial register re-allocation with factor-2 unrolling
64+ double fdotp_v64b_m8_unrl (const double * a , const double * b ,
65+ unsigned int avl ) {
66+ const unsigned int orig_avl = avl ;
67+ unsigned int vl ;
68+
69+ double red ;
70+
71+ // Stripmine and accumulate a partial reduced vector
72+ do {
73+ // Set the vl
74+ asm volatile ("vsetvli %0, %1, e64, m8, ta, ma" : "=r" (vl ) : "r" (avl ));
75+
76+ // Load chunk a and b
77+ asm volatile ("vle64.v v8, (%0)" ::"r" (a ));
78+ asm volatile ("vle64.v v16, (%0)" ::"r" (b ));
79+
80+ // Multiply and accumulate
81+ if (avl == orig_avl ) {
82+ asm volatile ("vfmul.vv v24, v8, v16" );
83+ } else {
84+ asm volatile ("vfmacc.vv v24, v8, v16" );
85+ }
86+
87+ // Bump pointers
88+ a += vl ;
89+ b += vl ;
90+ avl -= vl ;
91+
92+ if (avl <= 0 )
93+ break ;
94+
95+ // Set the vl
96+ asm volatile ("vsetvli %0, %1, e64, m8, ta, ma" : "=r" (vl ) : "r" (avl ));
97+
98+ // Load chunk a and b
99+ asm volatile ("vle64.v v0, (%0)" ::"r" (a ));
100+ asm volatile ("vle64.v v8, (%0)" ::"r" (b ));
101+
102+ // Multiply and accumulate
103+ asm volatile ("vfmacc.vv v24, v0, v8" );
104+
105+ // Bump pointers
106+ a += vl ;
107+ b += vl ;
108+ avl -= vl ;
109+
110+ if (avl <= 0 )
111+ break ;
112+
113+ // Set the vl
114+ asm volatile ("vsetvli %0, %1, e64, m8, ta, ma" : "=r" (vl ) : "r" (avl ));
115+
116+ // Load chunk a and b
117+ asm volatile ("vle64.v v16, (%0)" ::"r" (a ));
118+ asm volatile ("vle64.v v0, (%0)" ::"r" (b ));
119+
120+ // Multiply and accumulate
121+ asm volatile ("vfmacc.vv v24, v0, v16" );
122+
123+ // Bump pointers
124+ a += vl ;
125+ b += vl ;
126+ avl -= vl ;
127+ } while (avl > 0 );
128+
129+ // Clean the accumulator
130+ asm volatile ("vmv.s.x v0, zero" );
131+
132+ // Reduce and return
133+ asm volatile ("vfredusum.vs v0, v24, v0" );
134+ asm volatile ("vfmv.f.s %0, v0" : "=f" (red ));
135+
136+ return red ;
137+ }
138+
62139// 32-bit dot-product: a * b
63140float fdotp_v32b (const float * a , const float * b , unsigned int avl ) {
64141 const unsigned int orig_avl = avl ;
0 commit comments