@@ -59,6 +59,80 @@ double fdotp_v64b(const double *a, const double *b, unsigned int avl) {
5959 return red ;
6060}
6161
62+ // 64-bit dot-product: a * b
63+ // m8 allows only for partial register re-allocation with factor-2 unrolling
64+ double fdotp_v64b_m8_unrl (const double * a , const double * b , unsigned int avl ) {
65+ const unsigned int orig_avl = avl ;
66+ unsigned int vl ;
67+
68+ double red ;
69+
70+ // Stripmine and accumulate a partial reduced vector
71+ do {
72+ // Set the vl
73+ asm volatile ("vsetvli %0, %1, e64, m8, ta, ma" : "=r" (vl ) : "r" (avl ));
74+
75+ // Load chunk a and b
76+ asm volatile ("vle64.v v8, (%0)" ::"r" (a ));
77+ asm volatile ("vle64.v v16, (%0)" ::"r" (b ));
78+
79+ // Multiply and accumulate
80+ if (avl == orig_avl ) {
81+ asm volatile ("vfmul.vv v24, v8, v16" );
82+ } else {
83+ asm volatile ("vfmacc.vv v24, v8, v16" );
84+ }
85+
86+ // Bump pointers
87+ a += vl ;
88+ b += vl ;
89+ avl -= vl ;
90+
91+ if (avl <= 0 ) break ;
92+
93+ // Set the vl
94+ asm volatile ("vsetvli %0, %1, e64, m8, ta, ma" : "=r" (vl ) : "r" (avl ));
95+
96+ // Load chunk a and b
97+ asm volatile ("vle64.v v0, (%0)" ::"r" (a ));
98+ asm volatile ("vle64.v v8, (%0)" ::"r" (b ));
99+
100+ // Multiply and accumulate
101+ asm volatile ("vfmacc.vv v24, v0, v8" );
102+
103+ // Bump pointers
104+ a += vl ;
105+ b += vl ;
106+ avl -= vl ;
107+
108+ if (avl <= 0 ) break ;
109+
110+ // Set the vl
111+ asm volatile ("vsetvli %0, %1, e64, m8, ta, ma" : "=r" (vl ) : "r" (avl ));
112+
113+ // Load chunk a and b
114+ asm volatile ("vle64.v v16, (%0)" ::"r" (a ));
115+ asm volatile ("vle64.v v0, (%0)" ::"r" (b ));
116+
117+ // Multiply and accumulate
118+ asm volatile ("vfmacc.vv v24, v0, v16" );
119+
120+ // Bump pointers
121+ a += vl ;
122+ b += vl ;
123+ avl -= vl ;
124+ } while (avl > 0 );
125+
126+ // Clean the accumulator
127+ asm volatile ("vmv.s.x v0, zero" );
128+
129+ // Reduce and return
130+ asm volatile ("vfredusum.vs v0, v24, v0" );
131+ asm volatile ("vfmv.f.s %0, v0" : "=f" (red ));
132+
133+ return red ;
134+ }
135+
62136// 32-bit dot-product: a * b
63137float fdotp_v32b (const float * a , const float * b , unsigned int avl ) {
64138 const unsigned int orig_avl = avl ;
0 commit comments