-
Notifications
You must be signed in to change notification settings - Fork 110
Expand file tree
/
Copy pathdispatch.h
More file actions
512 lines (502 loc) Β· 24.4 KB
/
dispatch.h
File metadata and controls
512 lines (502 loc) Β· 24.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
/**
* @brief Common Definitions for Dispatch Files.
* @file c/dispatch.h
* @author Ash Vardanian
* @date February 3, 2026
*/
#ifndef NK_DISPATCH_H
#define NK_DISPATCH_H
#define NK_DYNAMIC_DISPATCH 1
#define NK_NATIVE_F16 0
#define NK_NATIVE_BF16 0
/* NK_TARGET_* defines are set by the build system:
* - Python: setup.py
* - Rust: build.rs
* - Node.js: binding.gyp
* - CMake: CMakeLists.txt
*
* For header-only usage without a build system, types.h provides
* compiler-intrinsic-based fallback detection.
*
* OS/compiler capabilities summary:
* - Linux: everything available in GCC 12+ and Clang 16+.
* - FreeBSD: same as Linux, except AMX (no kernel tile permission support).
* - Windows - MSVC: Haswell/Skylake/Icelake, plus Sapphire FP16 (MSVC 2022 17.2+).
* - macOS - Apple Clang: only Arm NEON and x86 AVX2 Haswell extensions.
*/
#include <numkong/numkong.h>
#ifdef __cplusplus
extern "C" {
#endif
// Forward declaration of dispatch table type (same structure as in numkong.c)
typedef struct {
// Dot products
nk_metric_dense_punned_t dot_f64c;
nk_metric_dense_punned_t dot_f32c;
nk_metric_dense_punned_t dot_bf16c;
nk_metric_dense_punned_t dot_f16c;
nk_metric_dense_punned_t dot_f64;
nk_metric_dense_punned_t dot_f32;
nk_metric_dense_punned_t dot_bf16;
nk_metric_dense_punned_t dot_f16;
nk_metric_dense_punned_t dot_e5m2;
nk_metric_dense_punned_t dot_e4m3;
nk_metric_dense_punned_t dot_e3m2;
nk_metric_dense_punned_t dot_e2m3;
nk_metric_dense_punned_t dot_i8;
nk_metric_dense_punned_t dot_u8;
nk_metric_dense_punned_t dot_i4;
nk_metric_dense_punned_t dot_u4;
nk_metric_dense_punned_t dot_u1;
nk_metric_dense_punned_t vdot_f64c;
nk_metric_dense_punned_t vdot_f32c;
nk_metric_dense_punned_t vdot_bf16c;
nk_metric_dense_punned_t vdot_f16c;
// Angular distances
nk_metric_dense_punned_t angular_f64;
nk_metric_dense_punned_t angular_f32;
nk_metric_dense_punned_t angular_bf16;
nk_metric_dense_punned_t angular_f16;
nk_metric_dense_punned_t angular_e5m2;
nk_metric_dense_punned_t angular_e4m3;
nk_metric_dense_punned_t angular_e3m2;
nk_metric_dense_punned_t angular_e2m3;
nk_metric_dense_punned_t angular_i8;
nk_metric_dense_punned_t angular_i4;
nk_metric_dense_punned_t angular_u8;
nk_metric_dense_punned_t angular_u4;
// Euclidean distances
nk_metric_dense_punned_t euclidean_f64;
nk_metric_dense_punned_t euclidean_f32;
nk_metric_dense_punned_t euclidean_bf16;
nk_metric_dense_punned_t euclidean_f16;
nk_metric_dense_punned_t euclidean_e5m2;
nk_metric_dense_punned_t euclidean_e4m3;
nk_metric_dense_punned_t euclidean_e3m2;
nk_metric_dense_punned_t euclidean_e2m3;
nk_metric_dense_punned_t euclidean_i8;
nk_metric_dense_punned_t euclidean_i4;
nk_metric_dense_punned_t euclidean_u8;
nk_metric_dense_punned_t euclidean_u4;
// Squared Euclidean distances
nk_metric_dense_punned_t sqeuclidean_f64;
nk_metric_dense_punned_t sqeuclidean_f32;
nk_metric_dense_punned_t sqeuclidean_bf16;
nk_metric_dense_punned_t sqeuclidean_f16;
nk_metric_dense_punned_t sqeuclidean_e5m2;
nk_metric_dense_punned_t sqeuclidean_e4m3;
nk_metric_dense_punned_t sqeuclidean_e3m2;
nk_metric_dense_punned_t sqeuclidean_e2m3;
nk_metric_dense_punned_t sqeuclidean_i8;
nk_metric_dense_punned_t sqeuclidean_i4;
nk_metric_dense_punned_t sqeuclidean_u8;
nk_metric_dense_punned_t sqeuclidean_u4;
// Binary distances
nk_metric_dense_punned_t hamming_u8;
nk_metric_dense_punned_t hamming_u1;
nk_metric_dense_punned_t jaccard_u32;
nk_metric_dense_punned_t jaccard_u16;
nk_metric_dense_punned_t jaccard_u1;
// Curved spaces
nk_metric_curved_punned_t bilinear_f64c;
nk_metric_curved_punned_t bilinear_f32c;
nk_metric_curved_punned_t bilinear_bf16c;
nk_metric_curved_punned_t bilinear_f16c;
nk_metric_curved_punned_t bilinear_f64;
nk_metric_curved_punned_t bilinear_f32;
nk_metric_curved_punned_t bilinear_bf16;
nk_metric_curved_punned_t bilinear_f16;
nk_metric_curved_punned_t mahalanobis_f64;
nk_metric_curved_punned_t mahalanobis_f32;
nk_metric_curved_punned_t mahalanobis_bf16;
nk_metric_curved_punned_t mahalanobis_f16;
// Geospatial distances
nk_metric_geospatial_punned_t haversine_f64;
nk_metric_geospatial_punned_t haversine_f32;
nk_metric_geospatial_punned_t vincenty_f64;
nk_metric_geospatial_punned_t vincenty_f32;
// Probability distributions
nk_metric_dense_punned_t kld_f64;
nk_metric_dense_punned_t kld_f32;
nk_metric_dense_punned_t kld_bf16;
nk_metric_dense_punned_t kld_f16;
nk_metric_dense_punned_t jsd_f64;
nk_metric_dense_punned_t jsd_f32;
nk_metric_dense_punned_t jsd_bf16;
nk_metric_dense_punned_t jsd_f16;
// Mesh alignment
nk_metric_mesh_punned_t rmsd_f64;
nk_metric_mesh_punned_t rmsd_f32;
nk_metric_mesh_punned_t rmsd_bf16;
nk_metric_mesh_punned_t rmsd_f16;
nk_metric_mesh_punned_t kabsch_f64;
nk_metric_mesh_punned_t kabsch_f32;
nk_metric_mesh_punned_t kabsch_bf16;
nk_metric_mesh_punned_t kabsch_f16;
nk_metric_mesh_punned_t umeyama_f64;
nk_metric_mesh_punned_t umeyama_f32;
nk_metric_mesh_punned_t umeyama_bf16;
nk_metric_mesh_punned_t umeyama_f16;
// Sparse intersections
nk_sparse_intersect_punned_t sparse_intersect_u64;
nk_sparse_intersect_punned_t sparse_intersect_u32;
nk_sparse_intersect_punned_t sparse_intersect_u16;
// Sparse dot products
nk_sparse_dot_punned_t sparse_dot_u32f32;
nk_sparse_dot_punned_t sparse_dot_u16bf16;
// Element-wise scale
nk_each_scale_punned_t each_scale_f64c;
nk_each_scale_punned_t each_scale_f32c;
nk_each_scale_punned_t each_scale_f64;
nk_each_scale_punned_t each_scale_f32;
nk_each_scale_punned_t each_scale_bf16;
nk_each_scale_punned_t each_scale_f16;
nk_each_scale_punned_t each_scale_e5m2;
nk_each_scale_punned_t each_scale_e4m3;
nk_each_scale_punned_t each_scale_e3m2;
nk_each_scale_punned_t each_scale_e2m3;
nk_each_scale_punned_t each_scale_i64;
nk_each_scale_punned_t each_scale_i32;
nk_each_scale_punned_t each_scale_i16;
nk_each_scale_punned_t each_scale_i8;
nk_each_scale_punned_t each_scale_u64;
nk_each_scale_punned_t each_scale_u32;
nk_each_scale_punned_t each_scale_u16;
nk_each_scale_punned_t each_scale_u8;
// Element-wise sum
nk_each_sum_punned_t each_sum_f64c;
nk_each_sum_punned_t each_sum_f32c;
nk_each_sum_punned_t each_sum_f64;
nk_each_sum_punned_t each_sum_f32;
nk_each_sum_punned_t each_sum_bf16;
nk_each_sum_punned_t each_sum_f16;
nk_each_sum_punned_t each_sum_e5m2;
nk_each_sum_punned_t each_sum_e4m3;
nk_each_sum_punned_t each_sum_e3m2;
nk_each_sum_punned_t each_sum_e2m3;
nk_each_sum_punned_t each_sum_i64;
nk_each_sum_punned_t each_sum_i32;
nk_each_sum_punned_t each_sum_i16;
nk_each_sum_punned_t each_sum_i8;
nk_each_sum_punned_t each_sum_u64;
nk_each_sum_punned_t each_sum_u32;
nk_each_sum_punned_t each_sum_u16;
nk_each_sum_punned_t each_sum_u8;
// Element-wise blend
nk_each_blend_punned_t each_blend_f64c;
nk_each_blend_punned_t each_blend_f32c;
nk_each_blend_punned_t each_blend_f64;
nk_each_blend_punned_t each_blend_f32;
nk_each_blend_punned_t each_blend_bf16;
nk_each_blend_punned_t each_blend_f16;
nk_each_blend_punned_t each_blend_e5m2;
nk_each_blend_punned_t each_blend_e4m3;
nk_each_blend_punned_t each_blend_e3m2;
nk_each_blend_punned_t each_blend_e2m3;
nk_each_blend_punned_t each_blend_i64;
nk_each_blend_punned_t each_blend_i32;
nk_each_blend_punned_t each_blend_i16;
nk_each_blend_punned_t each_blend_i8;
nk_each_blend_punned_t each_blend_u64;
nk_each_blend_punned_t each_blend_u32;
nk_each_blend_punned_t each_blend_u16;
nk_each_blend_punned_t each_blend_u8;
// Element-wise FMA
nk_each_fma_punned_t each_fma_f64c;
nk_each_fma_punned_t each_fma_f32c;
nk_each_fma_punned_t each_fma_f64;
nk_each_fma_punned_t each_fma_f32;
nk_each_fma_punned_t each_fma_bf16;
nk_each_fma_punned_t each_fma_f16;
nk_each_fma_punned_t each_fma_e5m2;
nk_each_fma_punned_t each_fma_e4m3;
nk_each_fma_punned_t each_fma_e3m2;
nk_each_fma_punned_t each_fma_e2m3;
nk_each_fma_punned_t each_fma_i64;
nk_each_fma_punned_t each_fma_i32;
nk_each_fma_punned_t each_fma_i16;
nk_each_fma_punned_t each_fma_i8;
nk_each_fma_punned_t each_fma_u64;
nk_each_fma_punned_t each_fma_u32;
nk_each_fma_punned_t each_fma_u16;
nk_each_fma_punned_t each_fma_u8;
// Trigonometry
nk_kernel_trigonometry_punned_t each_sin_f64;
nk_kernel_trigonometry_punned_t each_sin_f32;
nk_kernel_trigonometry_punned_t each_sin_f16;
nk_kernel_trigonometry_punned_t each_cos_f64;
nk_kernel_trigonometry_punned_t each_cos_f32;
nk_kernel_trigonometry_punned_t each_cos_f16;
nk_kernel_trigonometry_punned_t each_atan_f64;
nk_kernel_trigonometry_punned_t each_atan_f32;
nk_kernel_trigonometry_punned_t each_atan_f16;
// Reduce moments (sum + sum-of-squares)
nk_kernel_reduce_moments_punned_t reduce_moments_f64;
nk_kernel_reduce_moments_punned_t reduce_moments_f32;
nk_kernel_reduce_moments_punned_t reduce_moments_bf16;
nk_kernel_reduce_moments_punned_t reduce_moments_f16;
nk_kernel_reduce_moments_punned_t reduce_moments_e5m2;
nk_kernel_reduce_moments_punned_t reduce_moments_e4m3;
nk_kernel_reduce_moments_punned_t reduce_moments_e3m2;
nk_kernel_reduce_moments_punned_t reduce_moments_e2m3;
nk_kernel_reduce_moments_punned_t reduce_moments_i64;
nk_kernel_reduce_moments_punned_t reduce_moments_i32;
nk_kernel_reduce_moments_punned_t reduce_moments_i16;
nk_kernel_reduce_moments_punned_t reduce_moments_i8;
nk_kernel_reduce_moments_punned_t reduce_moments_i4;
nk_kernel_reduce_moments_punned_t reduce_moments_u64;
nk_kernel_reduce_moments_punned_t reduce_moments_u32;
nk_kernel_reduce_moments_punned_t reduce_moments_u16;
nk_kernel_reduce_moments_punned_t reduce_moments_u8;
nk_kernel_reduce_moments_punned_t reduce_moments_u4;
nk_kernel_reduce_moments_punned_t reduce_moments_u1;
// Reduce minmax (min + argmin + max + argmax)
nk_kernel_reduce_minmax_punned_t reduce_minmax_f64;
nk_kernel_reduce_minmax_punned_t reduce_minmax_f32;
nk_kernel_reduce_minmax_punned_t reduce_minmax_bf16;
nk_kernel_reduce_minmax_punned_t reduce_minmax_f16;
nk_kernel_reduce_minmax_punned_t reduce_minmax_e5m2;
nk_kernel_reduce_minmax_punned_t reduce_minmax_e4m3;
nk_kernel_reduce_minmax_punned_t reduce_minmax_e3m2;
nk_kernel_reduce_minmax_punned_t reduce_minmax_e2m3;
nk_kernel_reduce_minmax_punned_t reduce_minmax_i64;
nk_kernel_reduce_minmax_punned_t reduce_minmax_i32;
nk_kernel_reduce_minmax_punned_t reduce_minmax_i16;
nk_kernel_reduce_minmax_punned_t reduce_minmax_i8;
nk_kernel_reduce_minmax_punned_t reduce_minmax_i4;
nk_kernel_reduce_minmax_punned_t reduce_minmax_u64;
nk_kernel_reduce_minmax_punned_t reduce_minmax_u32;
nk_kernel_reduce_minmax_punned_t reduce_minmax_u16;
nk_kernel_reduce_minmax_punned_t reduce_minmax_u8;
nk_kernel_reduce_minmax_punned_t reduce_minmax_u4;
nk_kernel_reduce_minmax_punned_t reduce_minmax_u1;
// Dots packed size
nk_dots_packed_size_punned_t dots_packed_size_f64;
nk_dots_packed_size_punned_t dots_packed_size_f32;
nk_dots_packed_size_punned_t dots_packed_size_bf16;
nk_dots_packed_size_punned_t dots_packed_size_f16;
nk_dots_packed_size_punned_t dots_packed_size_e5m2;
nk_dots_packed_size_punned_t dots_packed_size_e4m3;
nk_dots_packed_size_punned_t dots_packed_size_e3m2;
nk_dots_packed_size_punned_t dots_packed_size_e2m3;
nk_dots_packed_size_punned_t dots_packed_size_i8;
nk_dots_packed_size_punned_t dots_packed_size_i4;
nk_dots_packed_size_punned_t dots_packed_size_u8;
nk_dots_packed_size_punned_t dots_packed_size_u4;
nk_dots_packed_size_punned_t dots_packed_size_u1;
// Dots pack
nk_dots_pack_punned_t dots_pack_f64;
nk_dots_pack_punned_t dots_pack_f32;
nk_dots_pack_punned_t dots_pack_bf16;
nk_dots_pack_punned_t dots_pack_f16;
nk_dots_pack_punned_t dots_pack_e5m2;
nk_dots_pack_punned_t dots_pack_e4m3;
nk_dots_pack_punned_t dots_pack_e3m2;
nk_dots_pack_punned_t dots_pack_e2m3;
nk_dots_pack_punned_t dots_pack_i8;
nk_dots_pack_punned_t dots_pack_i4;
nk_dots_pack_punned_t dots_pack_u8;
nk_dots_pack_punned_t dots_pack_u4;
nk_dots_pack_punned_t dots_pack_u1;
// Dots packed
nk_dots_packed_punned_t dots_packed_f64;
nk_dots_packed_punned_t dots_packed_f32;
nk_dots_packed_punned_t dots_packed_bf16;
nk_dots_packed_punned_t dots_packed_f16;
nk_dots_packed_punned_t dots_packed_e5m2;
nk_dots_packed_punned_t dots_packed_e4m3;
nk_dots_packed_punned_t dots_packed_e3m2;
nk_dots_packed_punned_t dots_packed_e2m3;
nk_dots_packed_punned_t dots_packed_i8;
nk_dots_packed_punned_t dots_packed_i4;
nk_dots_packed_punned_t dots_packed_u8;
nk_dots_packed_punned_t dots_packed_u4;
nk_dots_packed_punned_t dots_packed_u1;
// Sets packed
nk_hammings_packed_punned_t hammings_packed_u1;
nk_jaccards_packed_punned_t jaccards_packed_u1;
// Dots symmetric
nk_dots_symmetric_punned_t dots_symmetric_f64;
nk_dots_symmetric_punned_t dots_symmetric_f32;
nk_dots_symmetric_punned_t dots_symmetric_bf16;
nk_dots_symmetric_punned_t dots_symmetric_f16;
nk_dots_symmetric_punned_t dots_symmetric_e5m2;
nk_dots_symmetric_punned_t dots_symmetric_e4m3;
nk_dots_symmetric_punned_t dots_symmetric_e3m2;
nk_dots_symmetric_punned_t dots_symmetric_e2m3;
nk_dots_symmetric_punned_t dots_symmetric_i8;
nk_dots_symmetric_punned_t dots_symmetric_i4;
nk_dots_symmetric_punned_t dots_symmetric_u8;
nk_dots_symmetric_punned_t dots_symmetric_u4;
nk_dots_symmetric_punned_t dots_symmetric_u1;
// Sets symmetric
nk_hammings_symmetric_punned_t hammings_symmetric_u1;
nk_jaccards_symmetric_punned_t jaccards_symmetric_u1;
// Angulars packed
nk_angulars_packed_punned_t angulars_packed_f64;
nk_angulars_packed_punned_t angulars_packed_f32;
nk_angulars_packed_punned_t angulars_packed_bf16;
nk_angulars_packed_punned_t angulars_packed_f16;
nk_angulars_packed_punned_t angulars_packed_e5m2;
nk_angulars_packed_punned_t angulars_packed_e4m3;
nk_angulars_packed_punned_t angulars_packed_e3m2;
nk_angulars_packed_punned_t angulars_packed_e2m3;
nk_angulars_packed_punned_t angulars_packed_i8;
nk_angulars_packed_punned_t angulars_packed_i4;
nk_angulars_packed_punned_t angulars_packed_u8;
nk_angulars_packed_punned_t angulars_packed_u4;
// Angulars symmetric
nk_angulars_symmetric_punned_t angulars_symmetric_f64;
nk_angulars_symmetric_punned_t angulars_symmetric_f32;
nk_angulars_symmetric_punned_t angulars_symmetric_bf16;
nk_angulars_symmetric_punned_t angulars_symmetric_f16;
nk_angulars_symmetric_punned_t angulars_symmetric_e5m2;
nk_angulars_symmetric_punned_t angulars_symmetric_e4m3;
nk_angulars_symmetric_punned_t angulars_symmetric_e3m2;
nk_angulars_symmetric_punned_t angulars_symmetric_e2m3;
nk_angulars_symmetric_punned_t angulars_symmetric_i8;
nk_angulars_symmetric_punned_t angulars_symmetric_i4;
nk_angulars_symmetric_punned_t angulars_symmetric_u8;
nk_angulars_symmetric_punned_t angulars_symmetric_u4;
// Euclideans packed
nk_euclideans_packed_punned_t euclideans_packed_f64;
nk_euclideans_packed_punned_t euclideans_packed_f32;
nk_euclideans_packed_punned_t euclideans_packed_bf16;
nk_euclideans_packed_punned_t euclideans_packed_f16;
nk_euclideans_packed_punned_t euclideans_packed_e5m2;
nk_euclideans_packed_punned_t euclideans_packed_e4m3;
nk_euclideans_packed_punned_t euclideans_packed_e3m2;
nk_euclideans_packed_punned_t euclideans_packed_e2m3;
nk_euclideans_packed_punned_t euclideans_packed_i8;
nk_euclideans_packed_punned_t euclideans_packed_i4;
nk_euclideans_packed_punned_t euclideans_packed_u8;
nk_euclideans_packed_punned_t euclideans_packed_u4;
// Euclideans symmetric
nk_euclideans_symmetric_punned_t euclideans_symmetric_f64;
nk_euclideans_symmetric_punned_t euclideans_symmetric_f32;
nk_euclideans_symmetric_punned_t euclideans_symmetric_bf16;
nk_euclideans_symmetric_punned_t euclideans_symmetric_f16;
nk_euclideans_symmetric_punned_t euclideans_symmetric_e5m2;
nk_euclideans_symmetric_punned_t euclideans_symmetric_e4m3;
nk_euclideans_symmetric_punned_t euclideans_symmetric_e3m2;
nk_euclideans_symmetric_punned_t euclideans_symmetric_e2m3;
nk_euclideans_symmetric_punned_t euclideans_symmetric_i8;
nk_euclideans_symmetric_punned_t euclideans_symmetric_i4;
nk_euclideans_symmetric_punned_t euclideans_symmetric_u8;
nk_euclideans_symmetric_punned_t euclideans_symmetric_u4;
// MaxSim packed size
nk_dots_packed_size_punned_t maxsim_packed_size_f32;
nk_dots_packed_size_punned_t maxsim_packed_size_bf16;
nk_dots_packed_size_punned_t maxsim_packed_size_f16;
// MaxSim pack
nk_dots_pack_punned_t maxsim_pack_f32;
nk_dots_pack_punned_t maxsim_pack_bf16;
nk_dots_pack_punned_t maxsim_pack_f16;
// MaxSim packed
nk_maxsim_packed_punned_t maxsim_packed_f32;
nk_maxsim_packed_punned_t maxsim_packed_bf16;
nk_maxsim_packed_punned_t maxsim_packed_f16;
// Type casting
nk_kernel_cast_punned_t cast;
// Scalar conversions
void (*bf16_to_f32)(nk_bf16_t const *, nk_f32_t *);
void (*f32_to_bf16)(nk_f32_t const *, nk_bf16_t *);
void (*f16_to_f32)(nk_f16_t const *, nk_f32_t *);
void (*f32_to_f16)(nk_f32_t const *, nk_f16_t *);
void (*e5m2_to_f32)(nk_e5m2_t const *, nk_f32_t *);
void (*f32_to_e5m2)(nk_f32_t const *, nk_e5m2_t *);
void (*e4m3_to_f32)(nk_e4m3_t const *, nk_f32_t *);
void (*f32_to_e4m3)(nk_f32_t const *, nk_e4m3_t *);
void (*e3m2_to_f32)(nk_e3m2_t const *, nk_f32_t *);
void (*f32_to_e3m2)(nk_f32_t const *, nk_e3m2_t *);
void (*e2m3_to_f32)(nk_e2m3_t const *, nk_f32_t *);
void (*f32_to_e2m3)(nk_f32_t const *, nk_e2m3_t *);
// Scalar math
nk_f64_t (*f64_sqrt)(nk_f64_t);
nk_f64_t (*f64_rsqrt)(nk_f64_t);
nk_f64_t (*f64_fma)(nk_f64_t, nk_f64_t, nk_f64_t);
nk_f32_t (*f32_sqrt)(nk_f32_t);
nk_f32_t (*f32_rsqrt)(nk_f32_t);
nk_f32_t (*f32_fma)(nk_f32_t, nk_f32_t, nk_f32_t);
nk_f16_t (*f16_sqrt)(nk_f16_t);
nk_f16_t (*f16_rsqrt)(nk_f16_t);
nk_f16_t (*f16_fma)(nk_f16_t, nk_f16_t, nk_f16_t);
// Scalar saturating arithmetic
nk_i64_t (*i64_saturating_add)(nk_i64_t, nk_i64_t);
nk_i64_t (*i64_saturating_mul)(nk_i64_t, nk_i64_t);
nk_i32_t (*i32_saturating_add)(nk_i32_t, nk_i32_t);
nk_i32_t (*i32_saturating_mul)(nk_i32_t, nk_i32_t);
nk_i16_t (*i16_saturating_add)(nk_i16_t, nk_i16_t);
nk_i16_t (*i16_saturating_mul)(nk_i16_t, nk_i16_t);
nk_i8_t (*i8_saturating_add)(nk_i8_t, nk_i8_t);
nk_i8_t (*i8_saturating_mul)(nk_i8_t, nk_i8_t);
nk_i4x2_t (*i4x2_saturating_add)(nk_i4x2_t, nk_i4x2_t);
nk_i4x2_t (*i4x2_saturating_mul)(nk_i4x2_t, nk_i4x2_t);
nk_u64_t (*u64_saturating_add)(nk_u64_t, nk_u64_t);
nk_u64_t (*u64_saturating_mul)(nk_u64_t, nk_u64_t);
nk_u32_t (*u32_saturating_add)(nk_u32_t, nk_u32_t);
nk_u32_t (*u32_saturating_mul)(nk_u32_t, nk_u32_t);
nk_u16_t (*u16_saturating_add)(nk_u16_t, nk_u16_t);
nk_u16_t (*u16_saturating_mul)(nk_u16_t, nk_u16_t);
nk_u8_t (*u8_saturating_add)(nk_u8_t, nk_u8_t);
nk_u8_t (*u8_saturating_mul)(nk_u8_t, nk_u8_t);
nk_u4x2_t (*u4x2_saturating_add)(nk_u4x2_t, nk_u4x2_t);
nk_u4x2_t (*u4x2_saturating_mul)(nk_u4x2_t, nk_u4x2_t);
// Scalar ordering
int (*bf16_order)(nk_bf16_t, nk_bf16_t);
int (*f16_order)(nk_f16_t, nk_f16_t);
int (*e5m2_order)(nk_e5m2_t, nk_e5m2_t);
int (*e4m3_order)(nk_e4m3_t, nk_e4m3_t);
int (*e3m2_order)(nk_e3m2_t, nk_e3m2_t);
int (*e2m3_order)(nk_e2m3_t, nk_e2m3_t);
} nk_implementations_t;
// Global dispatch table - defined in numkong.c
extern nk_implementations_t nk_dispatch_table;
// Error handlers - defined in numkong.c
extern void nk_error_dense_(void const *, void const *, nk_size_t, void *);
extern void nk_error_sparse_intersect_(void const *, void const *, nk_size_t, nk_size_t, void *, nk_size_t *);
extern void nk_error_sparse_dot_(void const *, void const *, void const *, void const *, nk_size_t, nk_size_t, void *);
extern void nk_error_curved_(void const *, void const *, void const *, nk_size_t, void *);
extern void nk_error_geospatial_(void const *, void const *, void const *, void const *, nk_size_t, void *);
extern void nk_error_each_fma_(void const *, void const *, void const *, nk_size_t, void const *, void const *, void *);
extern void nk_error_each_blend_(void const *, void const *, nk_size_t, void const *, void const *, void *);
extern void nk_error_each_scale_(void const *, nk_size_t, void const *, void const *, void *);
extern void nk_error_each_sum_(void const *, void const *, nk_size_t, void *);
extern void nk_error_trigonometry_(void const *, nk_size_t, void *);
extern void nk_error_mesh_(void const *, void const *, nk_size_t, void *, void *, void *, void *, void *);
extern void nk_error_reduce_moments_(void const *, nk_size_t, nk_size_t, void *, void *);
extern void nk_error_reduce_minmax_(void const *, nk_size_t, nk_size_t, void *, nk_size_t *, void *, nk_size_t *);
extern nk_size_t nk_error_packed_size_(nk_size_t, nk_size_t);
extern void nk_error_pack_(void const *, nk_size_t, nk_size_t, nk_size_t, void *);
extern void nk_error_dots_(void const *, void const *, void *, nk_size_t, nk_size_t, nk_size_t, nk_size_t, nk_size_t);
extern void nk_error_dots_symmetric_(void const *, nk_size_t, nk_size_t, nk_size_t, void *, nk_size_t, nk_size_t,
nk_size_t);
// Dtype-specific kernel lookup functions
extern void nk_dispatch_f64c_find_(nk_capability_t, nk_kernel_kind_t, nk_kernel_punned_t *, nk_capability_t *);
extern void nk_dispatch_f32c_find_(nk_capability_t, nk_kernel_kind_t, nk_kernel_punned_t *, nk_capability_t *);
extern void nk_dispatch_bf16c_find_(nk_capability_t, nk_kernel_kind_t, nk_kernel_punned_t *, nk_capability_t *);
extern void nk_dispatch_f16c_find_(nk_capability_t, nk_kernel_kind_t, nk_kernel_punned_t *, nk_capability_t *);
extern void nk_dispatch_f64_find_(nk_capability_t, nk_kernel_kind_t, nk_kernel_punned_t *, nk_capability_t *);
extern void nk_dispatch_f32_find_(nk_capability_t, nk_kernel_kind_t, nk_kernel_punned_t *, nk_capability_t *);
extern void nk_dispatch_bf16_find_(nk_capability_t, nk_kernel_kind_t, nk_kernel_punned_t *, nk_capability_t *);
extern void nk_dispatch_f16_find_(nk_capability_t, nk_kernel_kind_t, nk_kernel_punned_t *, nk_capability_t *);
extern void nk_dispatch_e5m2_find_(nk_capability_t, nk_kernel_kind_t, nk_kernel_punned_t *, nk_capability_t *);
extern void nk_dispatch_e4m3_find_(nk_capability_t, nk_kernel_kind_t, nk_kernel_punned_t *, nk_capability_t *);
extern void nk_dispatch_e3m2_find_(nk_capability_t, nk_kernel_kind_t, nk_kernel_punned_t *, nk_capability_t *);
extern void nk_dispatch_e2m3_find_(nk_capability_t, nk_kernel_kind_t, nk_kernel_punned_t *, nk_capability_t *);
extern void nk_dispatch_i64_find_(nk_capability_t, nk_kernel_kind_t, nk_kernel_punned_t *, nk_capability_t *);
extern void nk_dispatch_i32_find_(nk_capability_t, nk_kernel_kind_t, nk_kernel_punned_t *, nk_capability_t *);
extern void nk_dispatch_i16_find_(nk_capability_t, nk_kernel_kind_t, nk_kernel_punned_t *, nk_capability_t *);
extern void nk_dispatch_i8_find_(nk_capability_t, nk_kernel_kind_t, nk_kernel_punned_t *, nk_capability_t *);
extern void nk_dispatch_i4_find_(nk_capability_t, nk_kernel_kind_t, nk_kernel_punned_t *, nk_capability_t *);
extern void nk_dispatch_u64_find_(nk_capability_t, nk_kernel_kind_t, nk_kernel_punned_t *, nk_capability_t *);
extern void nk_dispatch_u32_find_(nk_capability_t, nk_kernel_kind_t, nk_kernel_punned_t *, nk_capability_t *);
extern void nk_dispatch_u16_find_(nk_capability_t, nk_kernel_kind_t, nk_kernel_punned_t *, nk_capability_t *);
extern void nk_dispatch_u8_find_(nk_capability_t, nk_kernel_kind_t, nk_kernel_punned_t *, nk_capability_t *);
extern void nk_dispatch_u4_find_(nk_capability_t, nk_kernel_kind_t, nk_kernel_punned_t *, nk_capability_t *);
extern void nk_dispatch_u1_find_(nk_capability_t, nk_kernel_kind_t, nk_kernel_punned_t *, nk_capability_t *);
extern void nk_dispatch_cast_find_(nk_capability_t, nk_kernel_kind_t, nk_kernel_punned_t *, nk_capability_t *);
#ifdef __cplusplus
}
#endif
#endif // NK_DISPATCH_H