Skip to content

Commit c3efd7d

Browse files
committed
Revert "subgroup iq4_nl, 3% slower than original"
This reverts commit 1d949a6.
1 parent 1d949a6 commit c3efd7d

File tree

4 files changed

+8
-15
lines changed

4 files changed

+8
-15
lines changed

ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp

Lines changed: 2 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,6 @@
11
#if !defined(DATA_A_F32) && !defined(DATA_A_F16)
22
#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
33
#endif
4-
#extension GL_KHR_shader_subgroup_shuffle : require
5-
#extension GL_EXT_shader_subgroup_extended_types_float16 : require
64

75
#include "types.comp"
86

@@ -93,11 +91,11 @@ vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
9391
#if defined(DATA_A_IQ4_NL)
9492
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
9593
const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
96-
return vec2(subgroupShuffle(kvalues_iq4nl, vui & 0xF), subgroupShuffle(kvalues_iq4nl, vui >> 4));
94+
return vec2(kvalues_iq4nl[vui & 0xF], kvalues_iq4nl[vui >> 4]);
9795
}
9896
vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
9997
const uint vui = uint(data_a_packed16[a_offset + ib].qs[iqs/2]);
100-
return vec4(subgroupShuffle(kvalues_iq4nl, vui & 0xF), subgroupShuffle(kvalues_iq4nl, (vui >> 4) & 0xF), subgroupShuffle(kvalues_iq4nl, (vui >> 8) & 0xF), subgroupShuffle(kvalues_iq4nl, vui >> 12));
98+
return vec4(kvalues_iq4nl[vui & 0xF], kvalues_iq4nl[(vui >> 4) & 0xF], kvalues_iq4nl[(vui >> 8) & 0xF], kvalues_iq4nl[vui >> 12]);
10199
}
102100
#endif
103101

ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp

Lines changed: 2 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,6 @@
11
#version 450
22

33
#include "dequant_head.comp"
4-
#extension GL_KHR_shader_subgroup_shuffle : require
5-
#extension GL_EXT_shader_subgroup_extended_types_float16 : require
64

75
layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
86

@@ -28,7 +26,7 @@ void main() {
2826
const float d = float(data_a[ib].d);
2927

3028
[[unroll]] for (uint l = 0; l < 8; ++l) {
31-
data_b[b_idx + l + 0] = D_TYPE(d * subgroupShuffle(kvalues_iq4nl, data_a[ib].qs[q_idx + l] & 0xF));
32-
data_b[b_idx + l + 16] = D_TYPE(d * subgroupShuffle(kvalues_iq4nl, data_a[ib].qs[q_idx + l] >> 4));
29+
data_b[b_idx + l + 0] = D_TYPE(d * kvalues_iq4nl[data_a[ib].qs[q_idx + l] & 0xF]);
30+
data_b[b_idx + l + 16] = D_TYPE(d * kvalues_iq4nl[data_a[ib].qs[q_idx + l] >> 4]);
3331
}
3432
}

ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp

Lines changed: 1 addition & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -2,8 +2,6 @@
22

33
#extension GL_EXT_control_flow_attributes : enable
44
#extension GL_EXT_shader_16bit_storage : require
5-
#extension GL_KHR_shader_subgroup_shuffle : require
6-
#extension GL_EXT_shader_subgroup_extended_types_float16 : require
75

86
#ifdef FLOAT16
97
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
@@ -450,7 +448,7 @@ void main() {
450448

451449
const float d = float(data_a[ib].d);
452450
const uint vui = uint(data_a[ib].qs[iqs]);
453-
const vec2 v = vec2(subgroupShuffle(kvalues_iq4nl, vui & 0xF), subgroupShuffle(kvalues_iq4nl, vui >> 4)) * d;
451+
const vec2 v = vec2(kvalues_iq4nl[vui & 0xF], kvalues_iq4nl[vui >> 4]) * d;
454452

455453
buf_a[buf_idx ] = FLOAT_TYPE(v.x);
456454
buf_a[buf_idx + 16] = FLOAT_TYPE(v.y);

ggml/src/ggml-vulkan/vulkan-shaders/types.comp

Lines changed: 3 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,6 @@
33
#define GGML_TYPES_COMP
44

55
#extension GL_EXT_shader_explicit_arithmetic_types : require
6-
#extension GL_KHR_shader_subgroup_basic : require
76

87
#if defined(DATA_A_F32)
98
#define QUANT_K 1
@@ -306,13 +305,13 @@ const int8_t kvalues_iq4nl_const[16] = {
306305
int8_t(1), int8_t(13), int8_t(25), int8_t(38), int8_t(53), int8_t(69), int8_t(89), int8_t(113)
307306
};
308307

309-
FLOAT_TYPE kvalues_iq4nl = FLOAT_TYPE(0);
308+
shared FLOAT_TYPE kvalues_iq4nl[16];
310309

311310
void init_iq4nl_shmem()
312311
{
313312
// copy the table into shared memory and sync
314-
if (gl_SubgroupInvocationID < 16) {
315-
kvalues_iq4nl = FLOAT_TYPE(kvalues_iq4nl_const[gl_SubgroupInvocationID]);
313+
if (gl_LocalInvocationIndex.x < 16) {
314+
kvalues_iq4nl[gl_LocalInvocationIndex.x] = FLOAT_TYPE(kvalues_iq4nl_const[gl_LocalInvocationIndex.x]);
316315
}
317316
barrier();
318317
}

0 commit comments

Comments
 (0)