@@ -25,9 +25,120 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

- #include <common.h>
+ #include "common.h"
#include <arm_neon.h>

+ /*******************************************************************************
+  The complex GEMM kernels in OpenBLAS use static configuration of conjugation
+  modes via specific macros:
+
+  MACRO_NAME  | conjugation on matrix A | conjugation on matrix B |
+  ----------- | ----------------------- | ----------------------- |
+  NN/NT/TN/TT | No                      | No                      |
+  NR/NC/TR/TC | No                      | Yes                     |
+  RN/RT/CN/CT | Yes                     | No                      |
+  RR/RC/CR/CC | Yes                     | Yes                     |
+
+ "conjugation on matrix A" means the complex conjugates of elements from
43
+ matrix A are used for matmul (rather than the original elements). "conjugation
44
+ on matrix B" means the complex conjugate of each element from matrix B is taken
45
+ for matrix multiplication, respectively.
46
+
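+  For example, with a CN kernel (conjugate A, plain B), the scalar update of a
+  single element of C works out to (an illustrative sketch, not code from this
+  file):
+
+    c.real += a.real * b.real + a.imag * b.imag;
+    c.imag += a.real * b.imag - a.imag * b.real;
+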
+  Complex numbers in arrays or matrices are usually packed together as an
+  array of structs (without padding):
+
+    struct complex_number {
+      FLOAT real_part;
+      FLOAT imag_part;
+    };
+
+  So for a double-precision complex array ARR[], which is usually declared as
+  an array of double, the real part of its Kth complex number can be accessed
+  as ARR[2 * K] and the imaginary part as ARR[2 * K + 1].
+
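+  (Hypothetical accessors, for illustration only and not part of this file:)
+
+    static inline double real_of(const double *arr, int k) { return arr[2 * k]; }
+    static inline double imag_of(const double *arr, int k) { return arr[2 * k + 1]; }
+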
+  This file uses two ways to vectorize matrix multiplication of complex
+  numbers:
+
+  (1) Expanded-form
+
+  During accumulation along direction K:
+
+                                  Σk(a[0][k].real * b[k][n].real)
+           accumulate             Σk(a[0][k].imag * b[k][n].real)
+       ------------------->                     .
+       |  * b[k][n].real                        .
+       |  (broadcast)                           .
+                                  Σk(a[v-1][k].real * b[k][n].real)
+    a[0][k].real                  Σk(a[v-1][k].imag * b[k][n].real)
+    a[0][k].imag                              VECTOR I
+         .
+         .   (vec_a)
+         .
+    a[v-1][k].real                Σk(a[0][k].real * b[k][n].imag)
+    a[v-1][k].imag                Σk(a[0][k].imag * b[k][n].imag)
+       |                                        .
+       |  * b[k][n].imag                        .
+       |  (broadcast)                           .
+       ------------------->      Σk(a[v-1][k].real * b[k][n].imag)
+           accumulate            Σk(a[v-1][k].imag * b[k][n].imag)
+                                              VECTOR II
+
+  After accumulation, prior to storage:
+
+                          -1          -Σk(a[0][k].imag * b[k][n].imag)
+                           1           Σk(a[0][k].real * b[k][n].imag)
+    permute VECTOR II      .                          .
+    and multiply by        .           to get         .
+                           .                          .
+                          -1          -Σk(a[v-1][k].imag * b[k][n].imag)
+                           1           Σk(a[v-1][k].real * b[k][n].imag)
+
+  then add it to VECTOR I to get the result vector of elements of C.
+
+  Two vector registers are needed for every v elements of C, with
+  v == sizeof(vector) / sizeof(complex).
+
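+  As a minimal sketch (not this kernel's actual code), the expanded-form inner
+  loop for one double-complex element of C (v == 1 with 128-bit NEON vectors)
+  could look like this, with sa/sb as hypothetical pointers into the packed
+  panels of A and B, and set_f64x2 being the helper defined below:
+
+    float64x2_t vec_a, acc_r = vdupq_n_f64(0), acc_i = vdupq_n_f64(0);
+    for (BLASLONG k = 0; k < K; k++) {
+      vec_a = vld1q_f64(&sa[2 * k]);                    // { a[k].real, a[k].imag }
+      acc_r = vfmaq_n_f64(acc_r, vec_a, sb[2 * k]);     // VECTOR I  += vec_a * b[k].real
+      acc_i = vfmaq_n_f64(acc_i, vec_a, sb[2 * k + 1]); // VECTOR II += vec_a * b[k].imag
+    }
+    // permute VECTOR II, multiply by {-1, 1}, then add to VECTOR I
+    float64x2_t res = vfmaq_f64(acc_r, vextq_f64(acc_i, acc_i, 1),
+                                set_f64x2(-1.0, 1.0));
+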
+  (2) Contracted-form
+
+  During accumulation along direction K (the K coordinate is not shown, since
+  the operation is identical for each k):
+
+    (load vector from memory)               (load vector from memory)
+    a[0].r a[0].i ... a[v-1].r a[v-1].i     a[v].r a[v].i ... a[2v-1].r a[2v-1].i
+            |                                         |
+            |    unzip operation (or VLD2 in arm neon)|
+            -------------------------------------------
+                                 |
+                -----------------------------------
+                |                                 |
+                v                                 v
+    a[0].real ... a[2v-1].real        a[0].imag ... a[2v-1].imag
+
+  Each deinterleaved vector is then multiplied by the broadcast real and
+  imaginary parts of b[i] and accumulated:
+
+    VECTOR_REAL += a[..].real * b[i].real (broadcast)
+    VECTOR_REAL -= a[..].imag * b[i].imag (broadcast)
+    VECTOR_IMAG += a[..].imag * b[i].real (broadcast)
+    VECTOR_IMAG += a[..].real * b[i].imag (broadcast)
+
+    VECTOR_REAL: c[0].real ... c[2v-1].real
+    VECTOR_IMAG: c[0].imag ... c[2v-1].imag
+
+  After accumulation, VECTOR_REAL and VECTOR_IMAG are zipped (interleaved) and
+  then stored to matrix C directly.
+
+  For 2v elements of C, only 2 vector registers are needed, while 4 registers
+  would be required in expanded-form.
+  (v == sizeof(vector) / sizeof(complex))
+
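+  As a minimal sketch (again, not this kernel's actual code), the
+  contracted-form update for 2v == 2 double-complex elements of C, with
+  b_re/b_im holding the broadcast parts of b[i], could be:
+
+    float64x2x2_t va = vld2q_f64(&sa[4 * k]);  // unzip: val[0] = reals, val[1] = imags
+    c_re = vfmaq_n_f64(c_re, va.val[0], b_re); // VECTOR_REAL += a.real * b.real
+    c_re = vfmsq_n_f64(c_re, va.val[1], b_im); // VECTOR_REAL -= a.imag * b.imag
+    c_im = vfmaq_n_f64(c_im, va.val[1], b_re); // VECTOR_IMAG += a.imag * b.real
+    c_im = vfmaq_n_f64(c_im, va.val[0], b_im); // VECTOR_IMAG += a.real * b.imag
+
+  and after the K loop the two vectors are zipped and stored back:
+
+    float64x2x2_t vc = {{ c_re, c_im }};
+    vst2q_f64(c_ptr, vc);                      // interleave and store to C
+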
+  For AArch64 zgemm, the 4x4 kernel would need 32 128-bit NEON registers just
+  to hold the elements of C in expanded-form calculation (4 rows x 4 columns x
+  2 registers each, i.e. the entire NEON register file), so register spilling
+  would occur. The contracted-form operation is therefore selected for the 4x4
+  kernel. For all other combinations of unroll parameters (2x4, 4x2, 2x2, and
+  so on), expanded-form mode is used to bring more NEON registers into play
+  and hide the latency of the multiply-add instructions.
+ ******************************************************************************/
+
static inline float64x2_t set_f64x2(double lo, double hi) {
  float64x2_t ret = vdupq_n_f64(0);
  ret = vsetq_lane_f64(lo, ret, 0);