11#define (VARIANTS )
22
33[
4+ {
5+ "REPLS ": {
6+ "TYPE " : "vec4 <f32 >",
7+ "TYPE_SUFFIX ": "f32_vec ",
8+ "DST_TYPE ": "vec4 <f32 >",
9+ "BLOCK_SIZE ": 4
10+ },
11+ "DECLS ": ["F32_VEC "]
12+ },
413 {
514 "REPLS ": {
615 "TYPE " : "f32 ",
16+ "DST_TYPE ": "f32 ",
717 "BLOCK_SIZE ": 1
818 },
9- "DECLS ": ["FLOAT "]
19+ "DECLS ": ["F32 "]
1020 },
1121 {
1222 "REPLS ": {
1323 "TYPE " : "f16 ",
24+ "DST_TYPE ": "f32 ",
1425 "BLOCK_SIZE ": 1
1526 },
16- "DECLS ": ["FLOAT "]
27+ "DECLS ": ["F16 "]
1728 },
1829 {
1930 "REPLS ": {
2031 "TYPE " : "i32 ",
32+ "DST_TYPE ": "i32 ",
2133 "BLOCK_SIZE ": 1
2234 },
23- "DECLS ": ["FLOAT "]
35+ "DECLS ": ["I32 "]
2436 },
2537 {
2638 "REPLS ": {
2739 "TYPE " : "q4_0 ",
40+ "DST_TYPE ": "f32 ",
2841 "BLOCK_SIZE ": 32
2942 },
3043 "DECLS ": ["BYTE_HELPERS ", "Q4_0_T ", "Q4_0 "]
3144 },
3245 {
3346 "REPLS ": {
3447 "TYPE " : "q4_1 ",
48+ "DST_TYPE ": "f32 ",
3549 "BLOCK_SIZE ": 32
3650 },
3751 "DECLS ": ["BYTE_HELPERS ", "Q4_1_T ", "Q4_1 "]
3852 },
3953 {
4054 "REPLS ": {
4155 "TYPE " : "q5_0 ",
56+ "DST_TYPE ": "f32 ",
4257 "BLOCK_SIZE ": 32
4358 },
4459 "DECLS ": ["BYTE_HELPERS ", "Q5_0_T ", "Q5_0 "]
4560 },
4661 {
4762 "REPLS ": {
4863 "TYPE " : "q5_1 ",
64+ "DST_TYPE ": "f32 ",
4965 "BLOCK_SIZE ": 32
5066 },
5167 "DECLS ": ["BYTE_HELPERS ", "Q5_1_T ", "Q5_1 "]
5268 },
5369 {
5470 "REPLS ": {
5571 "TYPE " : "q8_0 ",
72+ "DST_TYPE ": "f32 ",
5673 "BLOCK_SIZE ": 32
5774 },
5875 "DECLS ": ["BYTE_HELPERS ", "Q8_0_T ", "Q8_0 "]
5976 },
6077 {
6178 "REPLS ": {
6279 "TYPE " : "q2_k ",
80+ "DST_TYPE ": "f32 ",
6381 "BLOCK_SIZE ": 256
6482 },
6583 "DECLS ": ["BYTE_HELPERS ", "Q2_K_T ", "Q2_K "]
6684 },
6785 {
6886 "REPLS ": {
6987 "TYPE " : "q3_k ",
88+ "DST_TYPE ": "f32 ",
7089 "BLOCK_SIZE ": 256
7190 },
7291 "DECLS ": ["BYTE_HELPERS ", "Q3_K_T ", "Q3_K "]
7392 },
7493 {
7594 "REPLS ": {
7695 "TYPE " : "q4_k ",
96+ "DST_TYPE ": "f32 ",
7797 "BLOCK_SIZE ": 256
7898 },
7999 "DECLS ": ["Q45_K_SCALE_MIN ", "BYTE_HELPERS ", "Q4_K_T ", "Q4_K "]
80100 },
81101 {
82102 "REPLS ": {
83103 "TYPE " : "q5_k ",
104+ "DST_TYPE ": "f32 ",
84105 "BLOCK_SIZE ": 256
85106 },
86107 "DECLS ": ["Q45_K_SCALE_MIN ", "BYTE_HELPERS ", "Q5_K_T ", "Q5_K "]
87108 },
88109 {
89110 "REPLS ": {
90111 "TYPE " : "q6_k ",
112+ "DST_TYPE ": "f32 ",
91113 "BLOCK_SIZE ": 256
92114 },
93115 "DECLS ": ["BYTE_HELPERS ", "Q6_K_T ", "Q6_K "]
94116 },
95117 {
96118 "REPLS ": {
97119 "TYPE " : "iq2_xxs ",
120+ "DST_TYPE ": "f32 ",
98121 "BLOCK_SIZE ": 256
99122 },
100123 "DECLS ": ["BYTE_HELPERS ", "IQ23_TABLES ", "IQ2_XXS_GRID ", "IQ2_XXS_T ", "IQ2_XXS "]
101124 },
102125 {
103126 "REPLS ": {
104127 "TYPE " : "iq2_xs ",
128+ "DST_TYPE ": "f32 ",
105129 "BLOCK_SIZE ": 256
106130 },
107131 "DECLS ": ["BYTE_HELPERS ", "IQ23_TABLES ", "IQ2_XS_GRID ", "IQ2_XS_T ", "IQ2_XS "]
108132 },
109133 {
110134 "REPLS ": {
111135 "TYPE ": "iq2_s ",
136+ "DST_TYPE ": "f32 ",
112137 "BLOCK_SIZE ": 256
113138 },
114139 "DECLS ": ["BYTE_HELPERS ", "IQ23_TABLES ", "IQ2_S_GRID ", "IQ2_S_T ", "IQ2_S "]
115140 },
116141 {
117142 "REPLS ": {
118143 "TYPE ": "iq3_xxs ",
144+ "DST_TYPE ": "f32 ",
119145 "BLOCK_SIZE ": 256
120146 },
121147 "DECLS ": ["BYTE_HELPERS ", "IQ23_TABLES ", "IQ3_XSS_GRID ", "IQ3_XSS_T ", "IQ3_XSS "]
122148 },
123149 {
124150 "REPLS ": {
125151 "TYPE ": "iq3_s ",
152+ "DST_TYPE ": "f32 ",
126153 "BLOCK_SIZE ": 256
127154 },
128155 "DECLS ": ["BYTE_HELPERS ", "IQ23_TABLES ", "IQ3_S_GRID ", "IQ3_S_T ", "IQ3_S "]
129156 },
130157 {
131158 "REPLS ": {
132159 "TYPE ": "iq1_s ",
160+ "DST_TYPE ": "f32 ",
133161 "BLOCK_SIZE ": 256
134162 },
135163 "DECLS ": ["BYTE_HELPERS ", "IQ1_GRID ", "IQ1_S_T ", "IQ1_S "]
136164 },
137165 {
138166 "REPLS ": {
139167 "TYPE ": "iq1_m ",
168+ "DST_TYPE ": "f32 ",
140169 "BLOCK_SIZE ": 256
141170 },
142171 "DECLS ": ["BYTE_HELPERS ", "IQ1_GRID ", "IQ1_M_T ", "IQ1_M "]
143172 },
144173 {
145174 "REPLS ": {
146175 "TYPE ": "iq4_nl ",
176+ "DST_TYPE ": "f32 ",
147177 "BLOCK_SIZE ": 32 ,
148178 },
149179 "DECLS ": ["BYTE_HELPERS ", "IQ4_GRID ", "IQ4_NL_T ", "IQ4_NL "]
150180 },
151181 {
152182 "REPLS ": {
153183 "TYPE ": "iq4_xs ",
184+ "DST_TYPE ": "f32 ",
154185 "BLOCK_SIZE ": 256 ,
155186 },
156187 "DECLS ": ["BYTE_HELPERS ", "IQ4_GRID ", "IQ4_XS_T ", "IQ4_XS "]
161192
162193#define (DECLS )
163194
164- #decl (FLOAT )
195+ #decl (F32_VEC )
// F32_VEC variant of copy_elements: copies a single element of `src` into `dst`
// with no conversion (this variant sets both TYPE and DST_TYPE to vec4<f32>).
// Both bases are divided by 4 before indexing, while `offset` is added as-is.
// NOTE(review): presumably src_base/dst_base are scalar-f32 element indices and
// `offset` is already in vec4 units — confirm against the caller.
196+ fn copy_elements (src_base : u32 , dst_base : u32 , offset : u32 ) {
197+ dst [(dst_base / 4 ) + offset ] = src [(src_base / 4 ) + offset ];
198+ }
199+ #enddecl (F32_VEC )
200+
201+ #decl (F32 )
// F32 variant of copy_elements: copies src[src_base + offset] to
// dst[dst_base + offset] unchanged (this variant sets both TYPE and DST_TYPE to f32).
202+ fn copy_elements (src_base : u32 , dst_base : u32 , offset : u32 ) {
203+ dst [dst_base + offset ] = src [src_base + offset ];
204+ }
205+ #enddecl (F32 )
206+
207+ #decl (F16 )
// F16 variant of copy_elements: reads src[src_base + offset] and writes it to
// dst[dst_base + offset], converting with f32(...) because this variant's
// DST_TYPE is f32 while its TYPE is f16.
165208fn copy_elements (src_base : u32 , dst_base : u32 , offset : u32 ) {
166209 dst [dst_base + offset ] = f32 (src [src_base + offset ]);
167210}
168- #enddecl (FLOAT )
211+ #enddecl (F16 )
212+
213+ #decl (I32 )
// I32 variant of copy_elements: copies src[src_base + offset] to
// dst[dst_base + offset] unchanged (this variant sets both TYPE and DST_TYPE to i32).
214+ fn copy_elements (src_base : u32 , dst_base : u32 , offset : u32 ) {
215+ dst [dst_base + offset ] = src [src_base + offset ];
216+ }
217+ #enddecl (I32 )
169218
170219#decl (Q4_0 )
171220fn copy_elements (src_base : u32 , dst_base : u32 , offset : u32 ) {
@@ -759,7 +808,7 @@ var<storage, read_write> src: array<{{TYPE}}>;
759808var <storage , read_write > idx : array <i32 >;
760809
761810@group (0 ) @binding (2 )
762- var <storage , read_write > dst : array <f32 >;
811+ var <storage , read_write > dst : array <{{ DST_TYPE }} >;
763812
764813struct Params {
765814 offset_src : u32 , // in elements
@@ -822,4 +871,4 @@ fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
822871 }
823872}
824873
825- #end (SHADER )
874+ #end (SHADER )
0 commit comments