|
1 | 1 | //------------------------------------------------------------------------------
|
2 |
| -// This file is contains additional kernels for data conversion. |
| 2 | +// This file contains kernels for data conversion. |
3 | 3 | // These kernels are used when loading the model, so their performance is less
|
4 | 4 | // important.
|
5 | 5 | //------------------------------------------------------------------------------
|
6 |
| -#ifdef cl_khr_fp16 |
7 | 6 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
8 |
| -#elif defined(cl_amd_fp16) |
9 |
| -#pragma OPENCL EXTENSION cl_amd_fp16 : enable |
10 |
| -#else |
11 |
| -#error "Half precision floating point not supportedby OpenCL implementation on your device." |
12 |
| -#endif |
13 |
| - |
14 |
| -#ifdef cl_khr_subgroups |
15 |
| -#pragma OPENCL EXTENSION cl_khr_subgroups : enable |
16 |
| -#elif defined(cl_intel_subgroups) |
17 |
| -#pragma OPENCL EXTENSION cl_intel_subgroups : enable |
18 |
| -#else |
19 |
| -#error "Subgroup not supported on your device." |
20 |
| -#endif |
21 | 7 |
|
22 | 8 | #ifdef cl_intel_required_subgroup_size
|
23 |
| -// Always use subgroup size of 32 on Intel. |
24 | 9 | #pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
|
25 | 10 | #define INTEL_GPU 1
|
26 | 11 | #define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
|
27 | 12 | #define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
|
28 | 13 | #elif defined(cl_qcom_reqd_sub_group_size)
|
29 |
| -// Always use subgroups size of 64 on Adreno. |
30 | 14 | #pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
31 | 15 | #define ADRENO_GPU 1
|
32 | 16 | #define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
|
33 | 17 | #define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
|
34 |
| -#else |
35 |
| -// TODO: do not know how to choose subgroup size on other GPUs. |
36 |
| -#error "Selecting subgroup size is not supported on your device." |
37 | 18 | #endif
|
38 | 19 |
|
39 | 20 | #define QK4_0 32
|
|
0 commit comments