|
1 | 1 | //------------------------------------------------------------------------------ |
2 | | -// This file is contains additional kernels for data conversion. |
| 2 | +// This file contains kernels for data conversion. |
3 | 3 | // These kernels are used when loading the model, so their performance is less |
4 | 4 | // important. |
5 | 5 | //------------------------------------------------------------------------------ |
6 | | -#ifdef cl_khr_fp16 |
7 | 6 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable |
8 | | -#elif defined(cl_amd_fp16) |
9 | | -#pragma OPENCL EXTENSION cl_amd_fp16 : enable |
10 | | -#else |
11 | | -#error "Half precision floating point not supportedby OpenCL implementation on your device." |
12 | | -#endif |
13 | | - |
14 | | -#ifdef cl_khr_subgroups |
15 | | -#pragma OPENCL EXTENSION cl_khr_subgroups : enable |
16 | | -#elif defined(cl_intel_subgroups) |
17 | | -#pragma OPENCL EXTENSION cl_intel_subgroups : enable |
18 | | -#else |
19 | | -#error "Subgroup not supported on your device." |
20 | | -#endif |
21 | 7 |
|
22 | 8 | #ifdef cl_intel_required_subgroup_size |
23 | | -// Always use subgroup size of 32 on Intel. |
24 | 9 | #pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable |
25 | 10 | #define INTEL_GPU 1 |
26 | 11 | #define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16))) |
27 | 12 | #define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32))) |
28 | 13 | #elif defined(cl_qcom_reqd_sub_group_size) |
29 | | -// Always use subgroups size of 64 on Adreno. |
30 | 14 | #pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable |
31 | 15 | #define ADRENO_GPU 1 |
32 | 16 | #define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half"))) |
33 | 17 | #define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full"))) |
34 | | -#else |
35 | | -// TODO: do not know how to choose subgroup size on other GPUs. |
36 | | -#error "Selecting subgroup size is not supported on your device." |
37 | 18 | #endif |
38 | 19 |
|
39 | 20 | #define QK4_0 32 |
|
0 commit comments