Skip to content
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 26 additions & 2 deletions ggml/src/ggml-vulkan/ggml-vulkan.cpp

Large diffs are not rendered by default.

26 changes: 25 additions & 1 deletion ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp
Original file line number Diff line number Diff line change
Expand Up @@ -434,6 +434,30 @@ vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
}
#endif

#if defined(DATA_A_TQ2_0)
// TQ2_0 ternary dequantization: {0,1,2} -> {-1,0,+1} via (q-1) mapping
// Reads byte `iqs` of block `a_offset + ib` and returns its two lowest 2-bit
// codes (bits 0-1 in .x, bits 2-3 in .y), each with 1 subtracted.
// The block's d value is not applied here.
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
    const uint bits = uint(data_a[a_offset + ib].qs[iqs]);
    const float lo = float(bits & 3) - 1.0f;
    const float hi = float((bits >> 2) & 3) - 1.0f;
    return vec2(lo, hi);
}
// Reads byte `iqs` of block `a_offset + ib` and returns all four of its 2-bit
// codes, lowest bits first (.x = bits 0-1 ... .w = bits 6-7), each with 1
// subtracted. The block's d value is not applied here.
vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
    const uint bits = uint(data_a[a_offset + ib].qs[iqs]);
    return vec4(float( bits       & 3) - 1.0f,
                float((bits >> 2) & 3) - 1.0f,
                float((bits >> 4) & 3) - 1.0f,
                float((bits >> 6) & 3) - 1.0f);
}
#endif

#if defined(DATA_A_MXFP4)
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
Expand Down Expand Up @@ -461,7 +485,7 @@ vec2 get_dm(uint ib, uint a_offset) {
}
#endif

#if defined(DATA_A_Q4_0) || defined(DATA_A_Q5_0) || defined(DATA_A_Q8_0) || defined(DATA_A_IQ1_S) || defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_XS) || defined(DATA_A_IQ4_NL)
#if defined(DATA_A_Q4_0) || defined(DATA_A_Q5_0) || defined(DATA_A_Q8_0) || defined(DATA_A_TQ2_0) || defined(DATA_A_IQ1_S) || defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_XS) || defined(DATA_A_IQ4_NL)
// Returns (d, 0) for block `a_offset + ib`: the block's d value converted to
// float in .x and a zero in .y.
vec2 get_dm(uint ib, uint a_offset) {
    const float scale = float(data_a[a_offset + ib].d);
    return vec2(scale, 0);
}
Expand Down
20 changes: 20 additions & 0 deletions ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp
Original file line number Diff line number Diff line change
Expand Up @@ -654,6 +654,24 @@ float16_t dequantFuncIQ4_NL(const in decodeBufIQ4_NL bl, const in uint blockCoor
}
#endif

#if defined(DATA_A_TQ2_0)
// Buffer-reference type wrapping a single block_tq2_0 (std430, 2-byte
// reference alignment). This is the type of the `bl` argument taken by
// dequantFuncTQ2_0.
layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufTQ2_0 {
block_tq2_0 block;
};

// Decodes element coordInBlock[1] of the TQ2_0 block referenced by `bl` and
// returns (code - 1) multiplied by the block's d. blockCoords is not used.
float16_t dequantFuncTQ2_0(const in decodeBufTQ2_0 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
{
    const uint elem = coordInBlock[1];

    // Every run of 128 elements maps onto 32 bytes: the low 5 bits of the
    // element index pick the byte inside the run, and the 32-element
    // sub-run number (0..3) picks which 2-bit field of that byte to read.
    const uint run   = elem / 128u;
    const uint lane  = elem % 32u;
    const uint field = (elem % 128u) / 32u;

    const uint code = (uint(bl.block.qs[run * 32u + lane]) >> (field * 2u)) & 3u;

    const float16_t scale = bl.block.d;
    return scale * float16_t(float(code) - 1.0f);
}
#endif

#if defined(DATA_A_MXFP4)
layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufMXFP4 {
block_mxfp4 block;
Expand Down Expand Up @@ -715,6 +733,8 @@ float16_t dequantFuncMXFP4(const in decodeBufMXFP4 bl, const in uint blockCoords
#define dequantFuncA dequantFuncIQ4_XS
#elif defined(DATA_A_IQ4_NL)
#define dequantFuncA dequantFuncIQ4_NL
#elif defined(DATA_A_TQ2_0)
#define dequantFuncA dequantFuncTQ2_0
#elif defined(DATA_A_MXFP4)
#define dequantFuncA dequantFuncMXFP4
#endif
36 changes: 36 additions & 0 deletions ggml/src/ggml-vulkan/vulkan-shaders/dequant_tq2_0.comp
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#version 450

#extension GL_EXT_shader_16bit_storage : require

#include "types.comp"

layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};

layout (push_constant) uniform parameter {
uint ne;
} p;

layout (local_size_x = 256, local_size_y = 1, local_size_z = 1) in;

// Dequantizes TQ2_0 blocks from data_a into D_TYPE values in data_b.
// Each invocation writes up to four consecutive output elements starting at
// gl_GlobalInvocationID.x * 4; p.ne bounds the number of elements written.
void main() {
    const uint i = gl_GlobalInvocationID.x * 4;

    if (i >= p.ne) {
        return;
    }

    // i is a multiple of 4 and so is QUANT_K, hence the (up to) four elements
    // handled by this invocation always belong to the same block.
    const uint ib = i / QUANT_K; // block index

    const float d = float(data_a[ib].d);

    for (uint j = 0; j < 4 && (i + j) < p.ne; ++j) {
        const uint r = (i + j) % QUANT_K; // element index within the block

        // TQ2_0 codes are not packed sequentially. Each run of 128 elements
        // shares 32 bytes: element r lives in byte (r / 128) * 32 + (r % 32),
        // in the 2-bit field selected by its 32-element sub-run (r % 128) / 32.
        // This is the same element -> (byte, shift) mapping used by the
        // matrix-vector, matrix-matrix and outer-product TQ2_0 shaders; the
        // previous byte = r / 4, shift = (r % 4) * 2 mapping produced the
        // values in a permuted order.
        const uint byte_idx = (r / 128u) * 32u + (r % 32u);
        const uint shift    = ((r % 128u) / 32u) * 2u;

        const uint q = (uint(data_a[ib].qs[byte_idx]) >> shift) & 3u;
        data_b[i + j] = D_TYPE(d * (float(q) - 1.0f));
    }
}
66 changes: 66 additions & 0 deletions ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_tq2_0.comp
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
#version 450
#extension GL_EXT_shader_explicit_arithmetic_types : require

#include "mul_mat_vec_base.comp"

layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;

FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];

// Accumulates, for rows [first_row, first_row + num_rows), the dot products of
// the TQ2_0 rows in data_a with NUM_COLS columns of data_b into `temp`, then
// hands the per-invocation partial sums to reduce_result.
// Invocations stride over the blocks of a row by the workgroup size.
void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
    uint a_offset, b_offset, d_offset;
    get_offsets(a_offset, b_offset, d_offset);

    const uint tid = gl_LocalInvocationID.x;
    const uint num_blocks_per_row = p.ncols / QUANT_K;

    // Clear the accumulators.
    [[unroll]] for (uint col = 0; col < NUM_COLS; ++col) {
        [[unroll]] for (uint row = 0; row < NUM_ROWS; ++row) {
            temp[col][row] = FLOAT_TYPE(0);
        }
    }

    [[unroll]] for (uint blk = tid; blk < num_blocks_per_row; blk += gl_WorkGroupSize.x) {
        // First data_b element covered by this block.
        const uint b_block_base = blk * QUANT_K;

        [[unroll]] for (uint n = 0; n < num_rows; ++n) {
            const uint a_block = a_offset / QUANT_K + (first_row + n) * num_blocks_per_row + blk;
            const float d = float(data_a[a_block].d);

            // The 64 bytes of qs are walked as two 32-byte groups; within a
            // group, 2-bit field l of byte k holds the element at offset
            // qs_base * 4 + l * 32 + k inside the block.
            [[unroll]] for (uint grp = 0; grp < 2; ++grp) {
                const uint qs_base = grp * 32;

                [[unroll]] for (uint l = 0; l < 4; ++l) {
                    [[unroll]] for (uint k = 0; k < 32; ++k) {
                        const uint b_idx = b_block_base + qs_base * 4 + l * 32 + k;
                        if (b_idx >= p.ncols) {
                            continue;
                        }

                        const uint q = (uint(data_a[a_block].qs[qs_base + k]) >> (l * 2)) & 3;
                        const FLOAT_TYPE weight = FLOAT_TYPE(d * (float(q) - 1.0f)); // (q - 1) * d

                        [[unroll]] for (uint jcol = 0; jcol < NUM_COLS; ++jcol) {
                            temp[jcol][n] += weight * FLOAT_TYPE(data_b[jcol * p.batch_stride_b + b_offset + b_idx]);
                        }
                    }
                }
            }
        }
    }

    reduce_result(temp, d_offset, first_row, num_rows, tid);
}

// Entry point: derives the first row handled by this workgroup and calls
// compute_outputs for either a full group of NUM_ROWS rows or the remaining
// rows below p.stride_d; does nothing when first_row is at or past p.stride_d.
void main() {
    const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z);

    // Full group: every one of the NUM_ROWS rows is in range.
    if (first_row + NUM_ROWS <= p.stride_d) {
        compute_outputs(first_row, NUM_ROWS);
        return;
    }

    // Partial group: only the rows below p.stride_d are processed.
    if (first_row < p.stride_d) {
        compute_outputs(first_row, p.stride_d - first_row);
    }
}
16 changes: 16 additions & 0 deletions ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp
Original file line number Diff line number Diff line change
Expand Up @@ -450,6 +450,22 @@ void main() {
buf_a[buf_idx + 1] = FLOAT_TYPE(v.y);
buf_a[buf_idx + 2] = FLOAT_TYPE(v.z);
buf_a[buf_idx + 3] = FLOAT_TYPE(v.w);
#elif defined(DATA_A_TQ2_0)
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;

const uint ib = idx / 128; // 2 values per idx (like Q2_K)
const uint iqs = idx % 128; // 0..127
const uint qsi = (iqs / 64) * 32 + (iqs % 16) * 2; // Q2_K indexing pattern
const uint qsshift = ((iqs % 64) / 16) * 2; // Q2_K shift: 0,2,4,6

const float d = float(data_a[ib].d);

const uvec2 qs = uvec2(data_a[ib].qs[qsi], data_a[ib].qs[qsi + 1]);
const vec2 v = d * (vec2((qs >> qsshift) & 3) - 1.0f); // (q-1)*d

buf_a[buf_idx ] = FLOAT_TYPE(v.x);
buf_a[buf_idx + 1] = FLOAT_TYPE(v.y);
#elif defined(DATA_A_Q2_K)
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
Expand Down
58 changes: 58 additions & 0 deletions ggml/src/ggml-vulkan/vulkan-shaders/out_prod_tq2_0.comp
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#version 450

#include "types.comp"
#include "generic_binary_head.comp"
#include "dequant_funcs.comp"

const uint num_threads = 256;
layout(local_size_x = num_threads, local_size_y = 1, local_size_z = 1) in;

// Splits the flat destination index `idx` into four coordinates
// (i20, i21, i22, i23) using the extents p.ne20, p.ne21 and p.ne22, with i20
// varying fastest.
void get_dst_indices(uint idx, out uint i20, out uint i21, out uint i22, out uint i23) {
    const uint plane_size  = p.ne21 * p.ne20;
    const uint volume_size = p.ne22 * plane_size;

    i23 = fastdiv(idx, volume_size);
    const uint rem_after_3 = idx - i23 * volume_size;

    i22 = fastdiv(rem_after_3, plane_size);
    const uint rem_after_2 = rem_after_3 - i22 * plane_size;

    i21 = rem_after_2 / p.ne20;
    i20 = rem_after_2 - i21 * p.ne20;
}

// Outer-product kernel with a TQ2_0 operand in data_a: for each destination
// element (i0, i1, i2, i3) it sums, over k in [0, p.ne01), the dequantized
// data_a value times data_b at src1_idx(i1, k, i2, i3), and stores the sum in
// data_d. Each invocation handles num_iter destination elements spaced
// num_threads apart.
void main() {
    // num_threads * num_iter must equal 512 to match the wg_denoms and get_idx
    const uint num_iter = 2;

    const uint bcast_dim2 = uint(p.param2);
    const uint bcast_dim3 = p.param3;

    uint idx = get_idx();

    [[unroll]] for (uint it = 0; it < num_iter; ++it, idx += num_threads) {
        if (idx >= p.ne) {
            continue;
        }

        uint i0, i1, i2, i3;
        get_dst_indices(idx, i0, i1, i2, i3);

        // Position of element i0 inside its TQ2_0 block; none of this depends
        // on k, so it is computed once per destination element.
        const uint r     = i0 % QUANT_K;
        const uint iqs   = (r % 32u) + 32u * (r / 128u); // byte inside the block
        const uint field = (r % 128u) / 32u;             // which 2-bit field of that byte

        // Block index for k == 0; each step of k advances it by p.nb01.
        const uint ib_base = get_aoffset() + (i3 / bcast_dim3) * p.nb03 + (i2 / bcast_dim2) * p.nb02 + (i0 / QUANT_K);

        float acc = 0.0f;

        for (uint k = 0; k < p.ne01; k += 1) {
            const uint ib = ib_base + k * p.nb01;

            const vec4 codes = dequantize4(ib, iqs, 0);
            const vec2 dm    = get_dm(ib, 0);

            // field is always in 0..3, so this selects x/y/z/w.
            const float qv    = codes[field];
            const float a_val = qv * dm.x + dm.y;

            const float b_val = data_b[get_boffset() + src1_idx(i1, k, i2, i3)];
            acc += a_val * b_val;
        }

        data_d[get_doffset() + dst_idx(i0, i1, i2, i3)] = acc;
    }
}
16 changes: 16 additions & 0 deletions ggml/src/ggml-vulkan/vulkan-shaders/types.comp
Original file line number Diff line number Diff line change
Expand Up @@ -1355,6 +1355,22 @@ struct block_iq4_nl_packed16
#define A_TYPE_PACKED16 block_iq4_nl_packed16
#endif

// TQ2_0
#define QUANT_K_TQ2_0 256
#define QUANT_R_TQ2_0 4

// One TQ2_0 block: QUANT_K_TQ2_0 2-bit codes packed QUANT_R_TQ2_0 per byte in
// qs, followed by one half-precision value d for the whole block.
struct block_tq2_0
{
uint8_t qs[QUANT_K_TQ2_0/QUANT_R_TQ2_0]; // 256/4 = 64 bytes
float16_t d; // per-block factor; the TQ2_0 decode paths compute (code - 1) * d
};

#if defined(DATA_A_TQ2_0)
#define QUANT_K QUANT_K_TQ2_0
#define QUANT_R QUANT_R_TQ2_0
#define A_TYPE block_tq2_0
#endif

#define QUANT_K_MXFP4 32
#define QUANT_R_MXFP4 2

Expand Down
5 changes: 5 additions & 0 deletions ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ const std::vector<std::string> type_names = {
"q5_0",
"q5_1",
"q8_0",
"tq2_0",
"q2_k",
"q3_k",
"q4_k",
Expand Down Expand Up @@ -504,6 +505,9 @@ void process_shaders() {
// mul mat vec
std::string data_a_key = "DATA_A_" + to_uppercase(tname);
std::string shader = (string_ends_with(tname, "_k") || string_starts_with(tname, "iq1_") || string_starts_with(tname, "iq2_") || string_starts_with(tname, "iq3_")) ? "mul_mat_vec_" + tname + ".comp" : "mul_mat_vec.comp";
if (tname == "tq2_0") {
shader = "mul_mat_vec_tq2_0.comp";
}

string_to_spv("mul_mat_vec_" + tname + "_f32_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPE_VEC2", "vec2"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}}));
string_to_spv("mul_mat_vec_" + tname + "_f16_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float16_t"}, {"B_TYPE_VEC2", "f16vec2"}, {"B_TYPE_VEC4", "f16vec4"}, {"D_TYPE", "float"}}));
Expand Down Expand Up @@ -714,6 +718,7 @@ void process_shaders() {
string_to_spv("out_prod_f16_f32", "out_prod.comp", merge_maps(base_dict, {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
string_to_spv("out_prod_q4_0", "out_prod_q4_0.comp", merge_maps(base_dict, {{"DATA_A_Q4_0", "1"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
string_to_spv("out_prod_q8_0", "out_prod_q8_0.comp", merge_maps(base_dict, {{"DATA_A_Q8_0", "1"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
string_to_spv("out_prod_tq2_0", "out_prod_tq2_0.comp", merge_maps(base_dict, {{"DATA_A_TQ2_0", "1"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));

string_to_spv("argsort_f32", "argsort.comp", {{"A_TYPE", "float"}});

Expand Down
4 changes: 4 additions & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -53,4 +53,8 @@ if (BUILD_SHARED_LIBS)
set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(llama PRIVATE LLAMA_BUILD)
target_compile_definitions(llama PUBLIC LLAMA_SHARED)
# libc++_shared is the Android NDK's shared C++ runtime, so it is only linked
# when targeting Android. The previous condition also matched every UNIX build
# using a Clang-family compiler (e.g. AppleClang on macOS, Clang on desktop
# Linux), where no c++_shared library normally exists and the shared-library
# link step would fail.
# NOTE(review): native Termux builds are expected to report ANDROID as well -
# confirm with the Termux CMake package before merging.
if (ANDROID)
    message(STATUS "Linking llama with c++_shared for Android/Termux compatibility")
    target_link_libraries(llama PUBLIC c++_shared)
endif()
endif()
3 changes: 2 additions & 1 deletion tests/test-backend-ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5555,7 +5555,8 @@ static const ggml_type all_types[] = {
GGML_TYPE_Q2_K, GGML_TYPE_Q3_K,
GGML_TYPE_Q4_K, GGML_TYPE_Q5_K,
GGML_TYPE_Q6_K,
// GGML_TYPE_TQ1_0, GGML_TYPE_TQ2_0, // TODO: implement for all backends
// GGML_TYPE_TQ1_0,
GGML_TYPE_TQ2_0, // TODO: implement for all backends
GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S,
GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M,
GGML_TYPE_IQ4_NL, GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS,
Expand Down
Loading