@@ -24,10 +24,6 @@ extern "C" {
 #include "promoters.hpp"
 #include "../quadblas_interface.h"

-/**
- * Resolve descriptors for matmul operation.
- * Only supports SLEEF backend when QBLAS is enabled.
- */
 static NPY_CASTING
 quad_matmul_resolve_descriptors(PyObject *self, PyArray_DTypeMeta *const dtypes[],
                                 PyArray_Descr *const given_descrs[], PyArray_Descr *loop_descrs[],
@@ -76,23 +72,15 @@ quad_matmul_resolve_descriptors(PyObject *self, PyArray_DTypeMeta *const dtypes[
     return casting;
 }

-/**
- * Determine the type of operation based on input dimensions
- */
 enum MatmulOperationType {
-    MATMUL_DOT,   // 1D x 1D -> scalar
-    MATMUL_GEMV,  // 2D x 1D -> 1D
-    MATMUL_GEMM   // 2D x 2D -> 2D
+    MATMUL_DOT,
+    MATMUL_GEMV,
+    MATMUL_GEMM
 };

 static MatmulOperationType
 determine_operation_type(npy_intp m, npy_intp n, npy_intp p)
 {
-    // For matmul signature (m?,n),(n,p?)->(m?,p?):
-    // - If m=1 and p=1: vector dot product (1D x 1D)
-    // - If p=1: matrix-vector multiplication (2D x 1D)
-    // - Otherwise: matrix-matrix multiplication (2D x 2D)
-
     if (m == 1 && p == 1) {
         return MATMUL_DOT;
     }
@@ -104,10 +92,6 @@ determine_operation_type(npy_intp m, npy_intp n, npy_intp p)
     }
 }

-/**
- * Matrix multiplication strided loop using QBLAS.
- * Automatically selects the appropriate QBLAS operation based on input dimensions.
- */
 static int
 quad_matmul_strided_loop(PyArrayMethod_Context *context, char *const data[],
                          npy_intp const dimensions[], npy_intp const strides[], NpyAuxData *auxdata)
@@ -118,43 +102,29 @@ quad_matmul_strided_loop(PyArrayMethod_Context *context, char *const data[],
     npy_intp n = dimensions[2];  // Cols of first matrix / rows of second matrix
     npy_intp p = dimensions[3];  // Cols of second matrix

-    // Extract batch strides
+    // batch strides
     npy_intp A_stride = strides[0];
     npy_intp B_stride = strides[1];
     npy_intp C_stride = strides[2];

-    // Extract core strides for matrix dimensions
+    // core strides for matrix dimensions
     npy_intp A_row_stride = strides[3];
     npy_intp A_col_stride = strides[4];
     npy_intp B_row_stride = strides[5];
     npy_intp B_col_stride = strides[6];
     npy_intp C_row_stride = strides[7];
     npy_intp C_col_stride = strides[8];

-    // Note: B_col_stride and C_col_stride not needed for row-major QBLAS calls
-
-    // Get backend from descriptor (should be SLEEF only)
     QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0];
     if (descr->backend != BACKEND_SLEEF) {
         PyErr_SetString(PyExc_RuntimeError, "Internal error: non-SLEEF backend in QBLAS matmul");
         return -1;
     }

-    // Determine operation type
     MatmulOperationType op_type = determine_operation_type(m, n, p);
-
-    // Constants for QBLAS
     Sleef_quad alpha = Sleef_cast_from_doubleq1(1.0);
     Sleef_quad beta = Sleef_cast_from_doubleq1(0.0);

-    // print all information for debugging
-    printf("DEBUG: Performing %ld batch operations with dimensions (%ld, %ld, %ld)\n", (long)N,
-           (long)m, (long)n, (long)p);
-    printf("DEBUG: Strides - A: (%ld, %ld), B: (%ld, %ld), C: (%ld, %ld)\n", (long)A_row_stride,
-           (long)A_col_stride, (long)B_row_stride, (long)B_col_stride, (long)C_row_stride,
-           (long)C_col_stride);
-    printf("DEBUG: Operation type: %d\n", op_type);
-
     char *A = data[0];
     char *B = data[1];
     char *C = data[2];
@@ -167,13 +137,6 @@ quad_matmul_strided_loop(PyArrayMethod_Context *context, char *const data[],

     switch (op_type) {
         case MATMUL_DOT: {
-            // Vector dot product: C = A^T * B (both are vectors)
-            // A has shape (1, n), B has shape (n, 1), C has shape (1, 1)
-
-            printf("DEBUG: Using QBLAS dot product for %ld elements\n", (long)n);
-
-            // A is effectively a vector of length n
-            // B is effectively a vector of length n
             size_t incx = A_col_stride / sizeof(Sleef_quad);
             size_t incy = B_row_stride / sizeof(Sleef_quad);

@@ -182,12 +145,6 @@ quad_matmul_strided_loop(PyArrayMethod_Context *context, char *const data[],
         }

         case MATMUL_GEMV: {
-            // Matrix-vector multiplication: C = A * B
-            // A has shape (m, n), B has shape (n, 1), C has shape (m, 1)
-
-            printf("DEBUG: Using QBLAS GEMV for %ldx%ld matrix times %ld vector\n", (long)m,
-                   (long)n, (long)n);
-
             size_t lda = A_row_stride / sizeof(Sleef_quad);
             size_t incx = B_row_stride / sizeof(Sleef_quad);
             size_t incy = C_row_stride / sizeof(Sleef_quad);
@@ -198,17 +155,46 @@ quad_matmul_strided_loop(PyArrayMethod_Context *context, char *const data[],
         }

         case MATMUL_GEMM: {
-            // Matrix-matrix multiplication: C = A * B
-            // A has shape (m, n), B has shape (n, p), C has shape (m, p)
-
-            printf("DEBUG: Using QBLAS GEMM for %ldx%ldx%ld matrices\n", (long)m, (long)n, (long)p);
-
             size_t lda = A_row_stride / sizeof(Sleef_quad);
             size_t ldb = B_row_stride / sizeof(Sleef_quad);
-            size_t ldc = C_row_stride / sizeof(Sleef_quad);
+            size_t ldc_numpy = C_row_stride / sizeof(Sleef_quad);
+
+            Sleef_quad *temp_A_buffer = new Sleef_quad[m * n];
+            if (!temp_A_buffer) {
+                PyErr_SetString(PyExc_MemoryError, "Failed to allocate temporary buffer for GEMM");
+                delete[] temp_A_buffer;
+                return -1;
+            }
+            Sleef_quad *temp_B_buffer = new Sleef_quad[n * p];
+            if (!temp_B_buffer) {
+                PyErr_SetString(PyExc_MemoryError, "Failed to allocate temporary buffer for GEMM");
+                delete[] temp_A_buffer;
+                return -1;
+            }
+            memcpy(temp_A_buffer, A_ptr, m * n * sizeof(Sleef_quad));
+            memcpy(temp_B_buffer, B_ptr, n * p * sizeof(Sleef_quad));
+            A_ptr = temp_A_buffer;
+            B_ptr = temp_B_buffer;
+
+            Sleef_quad *temp_C_buffer = new Sleef_quad[m * p];
+            if (!temp_C_buffer) {
+                PyErr_SetString(PyExc_MemoryError,
+                                "Failed to allocate temporary buffer for GEMM result");
+                return -1;
+            }
+
+            size_t ldc_temp = p;

             result = qblas_gemm('R', 'N', 'N', m, p, n, &alpha, A_ptr, lda, B_ptr, ldb, &beta,
-                                C_ptr, ldc);
+                                temp_C_buffer, ldc_temp);
+
+            if (result == 0) {
+                memcpy(C_ptr, temp_C_buffer, m * p * sizeof(Sleef_quad));
+            }
+
+            delete[] temp_C_buffer;
+            delete[] temp_A_buffer;
+            delete[] temp_B_buffer;
             break;
         }
     }
@@ -221,27 +207,91 @@ quad_matmul_strided_loop(PyArrayMethod_Context *context, char *const data[],
     return 0;
 }

-/**
- * Register matmul support with QBLAS acceleration
- */
+static int
+naive_matmul_strided_loop(PyArrayMethod_Context *context, char *const data[],
+                          npy_intp const dimensions[], npy_intp const strides[],
+                          NpyAuxData *auxdata)
+{
+    npy_intp N = dimensions[0];
+    npy_intp m = dimensions[1];
+    npy_intp n = dimensions[2];
+    npy_intp p = dimensions[3];
+
+    npy_intp A_batch_stride = strides[0];
+    npy_intp B_batch_stride = strides[1];
+    npy_intp C_batch_stride = strides[2];
+
+    npy_intp A_row_stride = strides[3];
+    npy_intp A_col_stride = strides[4];
+    npy_intp B_row_stride = strides[5];
+    npy_intp B_col_stride = strides[6];
+    npy_intp C_row_stride = strides[7];
+    npy_intp C_col_stride = strides[8];
+
+    QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0];
+    QuadBackendType backend = descr->backend;
+    size_t elem_size = (backend == BACKEND_SLEEF) ? sizeof(Sleef_quad) : sizeof(long double);
+
+    for (npy_intp batch = 0; batch < N; batch++) {
+        char *A_batch = data[0] + batch * A_batch_stride;
+        char *B_batch = data[1] + batch * B_batch_stride;
+        char *C_batch = data[2] + batch * C_batch_stride;
+
+        for (npy_intp i = 0; i < m; i++) {
+            for (npy_intp j = 0; j < p; j++) {
+                char *C_ij = C_batch + i * C_row_stride + j * C_col_stride;
+
+                if (backend == BACKEND_SLEEF) {
+                    Sleef_quad sum = Sleef_cast_from_doubleq1(0.0);
+
+                    for (npy_intp k = 0; k < n; k++) {
+                        char *A_ik = A_batch + i * A_row_stride + k * A_col_stride;
+                        char *B_kj = B_batch + k * B_row_stride + j * B_col_stride;
+
+                        Sleef_quad a_val, b_val;
+                        memcpy(&a_val, A_ik, sizeof(Sleef_quad));
+                        memcpy(&b_val, B_kj, sizeof(Sleef_quad));
+                        sum = Sleef_fmaq1_u05(a_val, b_val, sum);
+                    }
+
+                    memcpy(C_ij, &sum, sizeof(Sleef_quad));
+                }
+                else {
+                    long double sum = 0.0L;
+
+                    for (npy_intp k = 0; k < n; k++) {
+                        char *A_ik = A_batch + i * A_row_stride + k * A_col_stride;
+                        char *B_kj = B_batch + k * B_row_stride + j * B_col_stride;
+
+                        long double a_val, b_val;
+                        memcpy(&a_val, A_ik, sizeof(long double));
+                        memcpy(&b_val, B_kj, sizeof(long double));
+
+                        sum += a_val * b_val;
+                    }
+
+                    memcpy(C_ij, &sum, sizeof(long double));
+                }
+            }
+        }
+    }
+
+    return 0;
+}
+
 int
 init_matmul_ops(PyObject *numpy)
 {
-    printf("DEBUG: init_matmul_ops - registering QBLAS-accelerated matmul\n");
-
-    // Get the existing matmul ufunc
     PyObject *ufunc = PyObject_GetAttrString(numpy, "matmul");
     if (ufunc == NULL) {
-        printf("DEBUG: Failed to get numpy.matmul\n");
         return -1;
     }

-    // Setup method specification for QBLAS-accelerated matmul
     PyArray_DTypeMeta *dtypes[3] = {&QuadPrecDType, &QuadPrecDType, &QuadPrecDType};

     PyType_Slot slots[] = {{NPY_METH_resolve_descriptors, (void *)&quad_matmul_resolve_descriptors},
-                           {NPY_METH_strided_loop, (void *)&quad_matmul_strided_loop},
-                           {NPY_METH_unaligned_strided_loop, (void *)&quad_matmul_strided_loop},
+                           {NPY_METH_strided_loop, (void *)&naive_matmul_strided_loop},
+                           {NPY_METH_unaligned_strided_loop, (void *)&naive_matmul_strided_loop},
                            {0, NULL}};

     PyArrayMethod_Spec Spec = {
@@ -254,17 +304,11 @@ init_matmul_ops(PyObject *numpy)
         .slots = slots,
     };

-    printf("DEBUG: About to add QBLAS loop to matmul ufunc...\n");
-
     if (PyUFunc_AddLoopFromSpec(ufunc, &Spec) < 0) {
-        printf("DEBUG: Failed to add QBLAS loop to matmul ufunc\n");
         Py_DECREF(ufunc);
         return -1;
     }

-    printf("DEBUG: Successfully added QBLAS matmul loop!\n");
-
-    // Add promoter
     PyObject *promoter_capsule =
         PyCapsule_New((void *)&quad_ufunc_promoter, "numpy._ufunc_promoter", NULL);
     if (promoter_capsule == NULL) {
@@ -280,17 +324,14 @@ init_matmul_ops(PyObject *numpy)
     }

     if (PyUFunc_AddPromoter(ufunc, DTypes, promoter_capsule) < 0) {
-        printf("DEBUG: Failed to add promoter (continuing anyway)\n");
         PyErr_Clear();  // Don't fail if promoter fails
     }
     else {
-        printf("DEBUG: Successfully added promoter\n");
     }

     Py_DECREF(DTypes);
     Py_DECREF(promoter_capsule);
     Py_DECREF(ufunc);

-    printf("DEBUG: init_matmul_ops completed successfully with QBLAS acceleration\n");
     return 0;
 }
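For reference, the new `naive_matmul_strided_loop` registered above computes each output element as a dot product accumulated with `Sleef_fmaq1_u05` (fused multiply-add, one rounding per step) on the SLEEF backend. The following is a minimal standalone sketch of that accumulation pattern, not part of the diff; it assumes SLEEF was built with quad support and that the header `sleefquad.h` and the counterpart cast `Sleef_cast_to_doubleq1` are available.

```cpp
// Sketch: FMA-accumulated quad-precision dot product, as used per output
// element in the SLEEF branch of naive_matmul_strided_loop.
#include <sleefquad.h>
#include <cstdio>

int main()
{
    const int n = 4;
    Sleef_quad a[n], b[n];
    for (int k = 0; k < n; k++) {
        a[k] = Sleef_cast_from_doubleq1(k + 1);          // 1, 2, 3, 4
        b[k] = Sleef_cast_from_doubleq1(0.5 * (k + 1));  // 0.5, 1, 1.5, 2
    }

    // sum = a[k] * b[k] + sum, rounded once per step by the fused multiply-add
    Sleef_quad sum = Sleef_cast_from_doubleq1(0.0);
    for (int k = 0; k < n; k++) {
        sum = Sleef_fmaq1_u05(a[k], b[k], sum);
    }

    // Expected: 0.5 * (1 + 4 + 9 + 16) = 15
    printf("dot = %g\n", Sleef_cast_to_doubleq1(sum));
    return 0;
}
```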