2 changes: 1 addition & 1 deletion .github/workflows/build.yml
@@ -390,7 +390,7 @@ jobs:
           cd build
           export GGML_VK_VISIBLE_DEVICES=0
           # This is using llvmpipe and runs slower than other backends
-          ctest -L main --verbose --timeout 4200
+          ctest -L main --verbose --timeout 6200
 
   ubuntu-22-cmake-webgpu:
     runs-on: ubuntu-22.04
1 change: 1 addition & 0 deletions ggml/CMakeLists.txt
@@ -180,6 +180,7 @@ option(GGML_HIP_EXPORT_METRICS "ggml: enable kernel perf metrics ou
 option(GGML_MUSA_GRAPHS "ggml: use MUSA graph, experimental, unstable" OFF)
 option(GGML_MUSA_MUDNN_COPY "ggml: enable muDNN for accelerated copy" OFF)
 option(GGML_VULKAN "ggml: use Vulkan" OFF)
+option(GGML_VULKAN_BUILD_ADRENO_SHADERS "ggml: build Adreno-supported shader variants" ON)
 option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
 option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF)
 option(GGML_VULKAN_MEMORY_DEBUG "ggml: enable Vulkan memory debug output" OFF)
5 changes: 5 additions & 0 deletions ggml/src/ggml-vulkan/CMakeLists.txt
@@ -106,6 +106,11 @@ if (Vulkan_FOUND)
     add_compile_definitions(GGML_VULKAN_DEBUG)
 endif()
 
+if (GGML_VULKAN_BUILD_ADRENO_SHADERS)
+    add_compile_definitions(GGML_VULKAN_BUILD_ADRENO_SHADERS)
+    list(APPEND VULKAN_SHADER_GEN_CMAKE_ARGS -DGGML_VULKAN_BUILD_ADRENO_SHADERS=ON)
+endif()
+
 if (GGML_VULKAN_MEMORY_DEBUG)
     add_compile_definitions(GGML_VULKAN_MEMORY_DEBUG)
 endif()
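
Note: GGML_VULKAN_BUILD_ADRENO_SHADERS defaults to ON and is forwarded to the vulkan-shaders-gen subproject through VULKAN_SHADER_GEN_CMAKE_ARGS, so the host library and the shader generator see a consistent setting. Builds that do not target Adreno GPUs should be able to pass -DGGML_VULKAN_BUILD_ADRENO_SHADERS=OFF at configure time to skip the extra shader variants.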
598 changes: 564 additions & 34 deletions ggml/src/ggml-vulkan/ggml-vulkan.cpp

Large diffs are not rendered by default.

6 changes: 6 additions & 0 deletions ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt
@@ -1,6 +1,8 @@
 cmake_minimum_required(VERSION 3.19)
 project("vulkan-shaders-gen" C CXX)
 
+option(GGML_VULKAN_BUILD_ADRENO_SHADERS "Build Adreno-specific shader variants" ON)
+
 find_package (Threads REQUIRED)
 
 if (GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
@@ -23,6 +25,10 @@ if (GGML_VULKAN_SHADER_DEBUG_INFO)
     add_compile_definitions(GGML_VULKAN_SHADER_DEBUG_INFO)
     message(STATUS "Enabling shader debug info")
 endif()
+if (GGML_VULKAN_BUILD_ADRENO_SHADERS)
+    add_compile_definitions(GGML_VULKAN_BUILD_ADRENO_SHADERS)
+    message(STATUS "Building Adreno-specific shaders")
+endif()
 
 set(TARGET vulkan-shaders-gen)
 add_executable(${TARGET} vulkan-shaders-gen.cpp)
5 changes: 5 additions & 0 deletions ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp
@@ -88,8 +88,13 @@ vec2 dequantize(uint ib, uint iqs, uint a_offset) {
     return vec2(int(data_a[a_offset + ib].qs[iqs]), int(data_a[a_offset + ib].qs[iqs + 1]));
 }
 vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
+#if defined(ADRENO)
+    const vec2 v0 = dequantize(ib, iqs, a_offset);
+    const vec2 v1 = dequantize(ib, iqs + 2, a_offset);
+#else
     const i8vec2 v0 = unpack8(int32_t(data_a_packed16[a_offset + ib].qs[iqs/2])).xy; // vec4 used due to #12147
     const i8vec2 v1 = unpack8(int32_t(data_a_packed16[a_offset + ib].qs[iqs/2 + 1])).xy;
+#endif
     return vec4(v0.x, v0.y, v1.x, v1.y);
 }
 #endif
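
The ADRENO branch above re-reads the two int8 pairs through the scalar dequantize() helper instead of reinterpreting packed 16-bit words with unpack8, presumably because the unpack8 path is slow or problematic on some Adreno drivers. A minimal C sketch (illustrative names, not the shader's buffers) of why the two reads agree:

#include <assert.h>
#include <stdint.h>

int main(void) {
    const int8_t qs[4] = { -7, 127, -128, 3 };  /* four int8 quants */
    uint16_t packed16[2];                       /* data_a_packed16-style view of the same bytes */
    packed16[0] = (uint16_t)((uint8_t)qs[0] | ((uint8_t)qs[1] << 8));
    packed16[1] = (uint16_t)((uint8_t)qs[2] | ((uint8_t)qs[3] << 8));

    for (int i = 0; i < 4; i++) {
        /* unpack8-style read: byte (i % 2) of packed word (i / 2), sign-extended */
        const int8_t from_packed = (int8_t)(packed16[i / 2] >> (8 * (i % 2)));
        assert(from_packed == qs[i]);           /* agrees with the byte-wise ADRENO read */
    }
    return 0;
}

Both paths yield the same four signed values; only the load pattern differs.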
28 changes: 28 additions & 0 deletions ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp
@@ -24,8 +24,25 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint v_im,

     const uint32_t scale_0_4_l = (scale4_u32 << 16) | scale0_u32;
     const uint32_t scale_0_4_h = (scale_0_4_l & 0xC0C0C0C0) >> 2;
 
+#if defined(ADRENO)
+    const vec4 scale_0_4_l_f = vec4(
+        float((scale_0_4_l >> 0) & 0x3Fu),
+        float((scale_0_4_l >> 8) & 0x3Fu),
+        float((scale_0_4_l >> 16) & 0x3Fu),
+        float((scale_0_4_l >> 24) & 0x3Fu)
+    );
+
+    const vec4 scale8_f = vec4(
+        float(((((scale8_u32 << 12) | scale8_u32) & 0x0F0F0F0Fu) | scale_0_4_h) >> 0 & 0xFFu),
+        float(((((scale8_u32 << 12) | scale8_u32) & 0x0F0F0F0Fu) | scale_0_4_h) >> 8 & 0xFFu),
+        float(((((scale8_u32 << 12) | scale8_u32) & 0x0F0F0F0Fu) | scale_0_4_h) >> 16 & 0xFFu),
+        float(((((scale8_u32 << 12) | scale8_u32) & 0x0F0F0F0Fu) | scale_0_4_h) >> 24 & 0xFFu)
+    );
+#else
     const vec4 scale_0_4_l_f = vec4(unpack8(scale_0_4_l & 0x3F3F3F3F));
     const vec4 scale8_f = vec4(unpack8((((scale8_u32 << 12) | scale8_u32) & 0x0F0F0F0F) | scale_0_4_h));
+#endif
+
     const FLOAT_TYPE sc0 = scale_0_4_l_f.x;
     const FLOAT_TYPE sc1 = scale_0_4_l_f.y;
@@ -44,10 +61,17 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint v_im,
     const uint32_t qs64_u32_lo4 = qs64_u32 & 0x0F0F0F0F;
     const uint32_t qs64_u32_hi4 = (qs64_u32 >> 4) & 0x0F0F0F0F;
 
+#if defined(ADRENO)
+    const vec4 qs0_lo4 = vec4(float(qs0_u32_lo4 & 0xFFu), float((qs0_u32_lo4 >> 8) & 0xFFu), float((qs0_u32_lo4 >> 16) & 0xFFu), float((qs0_u32_lo4 >> 24) & 0xFFu));
+    const vec4 qs64_lo4 = vec4(float(qs64_u32_lo4 & 0xFFu), float((qs64_u32_lo4 >> 8) & 0xFFu), float((qs64_u32_lo4 >> 16) & 0xFFu), float((qs64_u32_lo4 >> 24) & 0xFFu));
+    const vec4 qs0_hi4 = vec4(float(qs0_u32_hi4 & 0xFFu), float((qs0_u32_hi4 >> 8) & 0xFFu), float((qs0_u32_hi4 >> 16) & 0xFFu), float((qs0_u32_hi4 >> 24) & 0xFFu));
+    const vec4 qs64_hi4 = vec4(float(qs64_u32_hi4 & 0xFFu), float((qs64_u32_hi4 >> 8) & 0xFFu), float((qs64_u32_hi4 >> 16) & 0xFFu), float((qs64_u32_hi4 >> 24) & 0xFFu));
+#else
     const vec4 qs0_lo4 = vec4(unpack8(qs0_u32_lo4));
     const vec4 qs64_lo4 = vec4(unpack8(qs64_u32_lo4));
     const vec4 qs0_hi4 = vec4(unpack8(qs0_u32_hi4));
     const vec4 qs64_hi4 = vec4(unpack8(qs64_u32_hi4));
+#endif
 
     const FLOAT_TYPE q4_0 = qs0_lo4.x;
     const FLOAT_TYPE q4_1 = qs0_lo4.y;
@@ -66,7 +90,11 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint v_im,
     const FLOAT_TYPE q4_14 = qs64_hi4.z;
     const FLOAT_TYPE q4_15 = qs64_hi4.w;
 
+#if defined(ADRENO)
+    for (uint j = 0; j < NUM_COLS; ++j) {
+#else
     [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
+#endif
         vec4 by10 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y1_idx) / 4 ]);
         vec4 by132 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y1_idx) / 4 + 8]);
         vec4 by20 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y2_idx) / 4 ]);
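
In both hunks above, the ADRENO branch replaces unpack8 byte reinterpretation with explicit shift-and-mask extraction and drops the [[unroll]] hint on the column loop; mul_mat_vec_q6_k.comp below applies the same byte extraction with a -32 offset. A small C check (an assumption-level sketch, not code from the PR) that per-byte shifts match the unpack8-style view on little-endian targets:

#include <assert.h>
#include <stdint.h>
#include <string.h>

int main(void) {
    const uint32_t scales = 0xDEADBEEFu & 0x3F3F3F3Fu;  /* four 6-bit scales, one per byte */

    uint8_t lanes[4];                    /* unpack8-style view of the word */
    memcpy(lanes, &scales, sizeof lanes);

    for (int i = 0; i < 4; i++) {
        const uint32_t shifted = (scales >> (8 * i)) & 0x3Fu;  /* ADRENO-style extraction */
        assert(shifted == lanes[i]);     /* same lane value on little-endian targets */
    }
    return 0;
}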
7 changes: 7 additions & 0 deletions ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp
@@ -46,10 +46,17 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint itid,
     const uint32_t q2_u32 = ql0_u32_hi4 | qh4_u32;
     const uint32_t q3_u32 = ql32_u32_hi4 | qh6_u32;
 
+#if defined(ADRENO)
+    const vec4 q0 = vec4(float(q0_u32 & 0xFF), float((q0_u32 >> 8) & 0xFF), float((q0_u32 >> 16) & 0xFF), float(q0_u32 >> 24)) - 32;
+    const vec4 q1 = vec4(float(q1_u32 & 0xFF), float((q1_u32 >> 8) & 0xFF), float((q1_u32 >> 16) & 0xFF), float(q1_u32 >> 24)) - 32;
+    const vec4 q2 = vec4(float(q2_u32 & 0xFF), float((q2_u32 >> 8) & 0xFF), float((q2_u32 >> 16) & 0xFF), float(q2_u32 >> 24)) - 32;
+    const vec4 q3 = vec4(float(q3_u32 & 0xFF), float((q3_u32 >> 8) & 0xFF), float((q3_u32 >> 16) & 0xFF), float(q3_u32 >> 24)) - 32;
+#else
     const vec4 q0 = vec4(unpack8(q0_u32)) - 32;
     const vec4 q1 = vec4(unpack8(q1_u32)) - 32;
     const vec4 q2 = vec4(unpack8(q2_u32)) - 32;
     const vec4 q3 = vec4(unpack8(q3_u32)) - 32;
+#endif
 
     if (all_threads) {
         sccache[csel][ix][itid] = FLOAT_TYPE(data_a[ib0 + i].scales[itid]);
43 changes: 43 additions & 0 deletions ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp
@@ -363,8 +363,25 @@ void main() {

     const float d = float(data_a_packed16[ib].d);
     const uint vui = uint(data_a_packed16[ib].qs[2*iqs]) | (uint(data_a_packed16[ib].qs[2*iqs + 1]) << 16);
 
+#if defined(ADRENO)
+    vec4 v0 = (vec4(
+        float((vui >> 0) & 0xF),
+        float((vui >> 8) & 0xF),
+        float((vui >> 16) & 0xF),
+        float((vui >> 24) & 0xF)
+    ) - 8.0) * d;
+
+    vec4 v1 = (vec4(
+        float((vui >> 4) & 0xF),
+        float((vui >> 12) & 0xF),
+        float((vui >> 20) & 0xF),
+        float((vui >> 28) & 0xF)
+    ) - 8.0) * d;
+#else
     const vec4 v0 = (vec4(unpack8(vui & 0x0F0F0F0F)) - 8.0f) * d;
     const vec4 v1 = (vec4(unpack8((vui >> 4) & 0x0F0F0F0F)) - 8.0f) * d;
+#endif
+
     buf_a[buf_idx ] = FLOAT_TYPE(v0.x);
     buf_a[buf_idx + 1 ] = FLOAT_TYPE(v0.y);
@@ -384,8 +401,24 @@ void main() {
     const float d = float(data_a_packed16[ib].d);
     const float m = float(data_a_packed16[ib].m);
     const uint vui = uint(data_a_packed16[ib].qs[2*iqs]) | (uint(data_a_packed16[ib].qs[2*iqs + 1]) << 16);
 
+#if defined(ADRENO)
+    vec4 v0 = vec4(
+        float((vui >> 0) & 0xF),
+        float((vui >> 8) & 0xF),
+        float((vui >> 16) & 0xF),
+        float((vui >> 24) & 0xF)
+    ) * d + m;
+    vec4 v1 = vec4(
+        float((vui >> 4) & 0xF),
+        float((vui >> 12) & 0xF),
+        float((vui >> 20) & 0xF),
+        float((vui >> 28) & 0xF)
+    ) * d + m;
+#else
     const vec4 v0 = vec4(unpack8(vui & 0x0F0F0F0F)) * d + m;
     const vec4 v1 = vec4(unpack8((vui >> 4) & 0x0F0F0F0F)) * d + m;
+#endif
+
     buf_a[buf_idx ] = FLOAT_TYPE(v0.x);
     buf_a[buf_idx + 1 ] = FLOAT_TYPE(v0.y);
@@ -441,10 +474,20 @@ void main() {
     const uint ib = idx / 8;
     const uint iqs = idx & 0x07;
 
+#if defined(ADRENO)
+    const float d = float(data_a[ib].d);
+    const vec4 v = vec4(
+        int(data_a[ib].qs[4*iqs]),
+        int(data_a[ib].qs[4*iqs + 1]),
+        int(data_a[ib].qs[4*iqs + 2]),
+        int(data_a[ib].qs[4*iqs + 3])
+    ) * d;
+#else
     const float d = float(data_a_packed16[ib].d);
     const i8vec2 v0 = unpack8(int32_t(data_a_packed16[ib].qs[2*iqs])).xy; // vec4 used due to #12147
     const i8vec2 v1 = unpack8(int32_t(data_a_packed16[ib].qs[2*iqs + 1])).xy;
     const vec4 v = vec4(v0.x, v0.y, v1.x, v1.y) * d;
+#endif
 
     buf_a[buf_idx ] = FLOAT_TYPE(v.x);
     buf_a[buf_idx + 1] = FLOAT_TYPE(v.y);
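
mul_mm.comp extends the same pattern to the 4-bit formats: the ADRENO branch pulls each nibble out with a shift and an 0xF mask instead of masking the whole word with 0x0F0F0F0F and calling unpack8. A C sketch of the lane-by-lane equivalence (mirrors the shader math by assumption, not code from the PR):

#include <assert.h>
#include <stdint.h>

int main(void) {
    const uint32_t vui = 0x84C2A6E1u;              /* four packed q4 byte lanes */
    const uint32_t lo  = vui & 0x0F0F0F0Fu;        /* low nibbles, unpack8-style mask */
    const uint32_t hi  = (vui >> 4) & 0x0F0F0F0Fu; /* high nibbles */

    for (int i = 0; i < 4; i++) {
        /* ADRENO-style shifts select the same nibble from each byte lane */
        assert(((vui >> (8 * i))     & 0xFu) == ((lo >> (8 * i)) & 0xFFu));
        assert(((vui >> (8 * i + 4)) & 0xFu) == ((hi >> (8 * i)) & 0xFFu));
    }
    return 0;
}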
20 changes: 20 additions & 0 deletions ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp
@@ -112,6 +112,20 @@ shared ACC_TYPE coopmat_stage[TM * TN * NUM_WARPS];

#include "mul_mmq_funcs.comp"

int dotPacked4x8(uint a, uint b) {
int a0 = int(a << 24) >> 24;
int a1 = int(a << 16) >> 24;
int a2 = int(a << 8) >> 24;
int a3 = int(a) >> 24;

int b0 = int(b << 24) >> 24;
int b1 = int(b << 16) >> 24;
int b2 = int(b << 8) >> 24;
int b3 = int(b) >> 24;

return a0 * b0 + a1 * b1 + a2 * b2 + a3 * b3;
}

void main() {
#ifdef NEEDS_INIT_IQ_SHMEM
init_iq_shmem(gl_WorkGroupSize);
@@ -352,8 +366,14 @@ void main() {
     const uint sums_idx = (wsic * TN + cc) * (WMITER * TM) + wsir * TM + cr;
     int32_t q_sum = 0;
     [[unroll]] for (uint idx_k = 0; idx_k < BK / 4; idx_k++) {
+
+#if defined(ADRENO)
+        q_sum += dotPacked4x8(cache_a_qs[cache_a_idx * (BK / 4) + idx_k],
+                              cache_b_qs[cc * (BK / 4) + idx_k]);
+#else
         q_sum += dotPacked4x8EXT(cache_a_qs[cache_a_idx * (BK / 4) + idx_k],
                                  cache_b_qs[cc * (BK / 4) + idx_k]);
+#endif
     }
 
     sums[sums_idx] += mul_q8_1(q_sum, cache_a_dm[cache_a_idx], cache_b_ds[cc], 1);
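
The hand-written dotPacked4x8 added above emulates the dotPacked4x8EXT built-in: each int(x << s) >> 24 step sign-extends one byte lane before the multiply-accumulate. A self-contained C model with a worked check (assumes arithmetic right shift of signed values, which mainstream compilers provide):

#include <assert.h>
#include <stdint.h>

/* C port of the shader's fallback dotPacked4x8 */
static int32_t dot_packed_4x8(uint32_t a, uint32_t b) {
    int32_t sum = 0;
    for (int i = 0; i < 4; i++) {
        /* (int)(x << (24 - 8*i)) >> 24 sign-extends byte lane i */
        const int32_t ai = (int32_t)(a << (24 - 8 * i)) >> 24;
        const int32_t bi = (int32_t)(b << (24 - 8 * i)) >> 24;
        sum += ai * bi;
    }
    return sum;
}

int main(void) {
    /* lanes a = {-1, 2, -3, 4}, b = {5, -6, 7, -8}: -5 - 12 - 21 - 32 = -70 */
    assert(dot_packed_4x8(0x04FD02FFu, 0xF807FA05u) == -70);
    return 0;
}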
56 changes: 33 additions & 23 deletions ggml/src/ggml-vulkan/vulkan-shaders/out_prod_q8_0.comp
@@ -4,7 +4,8 @@
#include "generic_binary_head.comp"
#include "dequant_funcs.comp"

const uint num_threads = 256;
const uint quant_group_sz = 2;
const uint num_threads = 512 / quant_group_sz;
layout(local_size_x = num_threads, local_size_y = 1, local_size_z = 1) in;

void get_dst_indices(uint idx, out uint i20, out uint i21, out uint i22, out uint i23) {
@@ -17,38 +18,47 @@ void get_dst_indices(uint idx, out uint i20, out uint i21, out uint i22, out uint i23) {
 }
 
 void main() {
-    // num_threads * num_iter must equal 512 to match the wg_denoms and get_idx
-    const uint num_iter = 2;
-
     const uint broadcast2 = uint(p.param2);
     const uint broadcast3 = p.param3;
 
-    uint idx = get_idx();
+    uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x * quant_group_sz;
 
+    uint aoffset = get_aoffset();
+    uint boffset = get_boffset();
+    uint doffset = get_doffset();
+
-    [[unroll]] for (uint it = 0; it < num_iter; ++it) {
-        if (idx < p.ne) {
-            uint i0, i1, i2, i3;
-            get_dst_indices(idx, i0, i1, i2, i3);
+    if (idx < p.ne) {
+        uint i0, i1, i2, i3;
+        get_dst_indices(idx, i0, i1, i2, i3);
 
-            float acc = 0.0f;
+        vec2 acc = vec2(0.0);
 
-            for (uint k = 0; k < p.ne01; k++) {
+        if (i0 + 1 >= p.ne20) {
+            continue;
+        }
+
+        for (uint k = 0; k < p.ne01; k += 1) {
-            const uint a_block_base = get_aoffset() + (i3 / broadcast3) * p.nb03 + (i2 / broadcast2) * p.nb02 + k * p.nb01;
-            const uint ib = a_block_base + (i0 / QUANT_K);
-            const uint iqs = (i0 % QUANT_K) / QUANT_R;
+            const uint a_block_base = aoffset + (i3 / broadcast3) * p.nb03 + (i2 / broadcast2) * p.nb02 + k * p.nb01;
+            const uint ib = a_block_base + ((i0) / QUANT_K) * p.nb00;
+            const uint iqs = ((i0) % QUANT_K) / QUANT_R;
 
-            const vec2 v = dequantize(ib, iqs, 0);
-            const vec2 dm = get_dm(ib, 0);
-            const float a_val = v.x * dm.x + dm.y;
+            const vec2 v = dequantize(ib, iqs, 0);
+            const vec2 dm = get_dm(ib, 0);
+            const vec2 a_vals = v * dm.x + dm.y;
 
+            const uint b_idx = src1_idx(i1, k, i2, i3);
+            const float b = data_b[boffset + b_idx];
+
+            acc += a_vals * b;
+        }
 
-            const uint b_idx = src1_idx(i1, k, i2, i3);
-            const float b = data_b[get_boffset() + b_idx];
-            acc += a_val * b;
+        uint d_idx = dst_idx(i0, i1, i2, i3);
+        for (uint q = 0; q < quant_group_sz; q++) {
+            if (d_idx + q >= p.ne) {
+                continue;
+            }
 
-            uint d_idx = dst_idx(i0, i1, i2, i3);
-            data_d[get_doffset() + d_idx] = acc;
+            data_d[doffset + d_idx + q] = acc[q];
         }
-        idx += num_threads;
     }
 }
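
The rework drops the old two-iteration per-thread loop: each invocation now owns quant_group_sz = 2 consecutive destination elements and accumulates into a vec2, with the tail guarded by the d_idx + q >= p.ne check. The constants 512 and 262144 = 512 * 512 presumably mirror the per-axis dispatch extents that get_idx() previously encoded (the removed comment ties 512 to wg_denoms). A C sketch of the index mapping under that assumption:

#include <assert.h>
#include <stdint.h>

enum { QUANT_GROUP_SZ = 2 };  /* matches the shader's quant_group_sz */

/* first destination element handled by global invocation (gx, gy, gz) */
static uint32_t first_dst_elem(uint32_t gx, uint32_t gy, uint32_t gz) {
    return gz * 262144u + gy * 512u + gx * QUANT_GROUP_SZ;
}

int main(void) {
    /* consecutive x invocations cover disjoint, adjacent pairs of outputs */
    assert(first_dst_elem(0,   0, 0) == 0);
    assert(first_dst_elem(1,   0, 0) == 2);
    assert(first_dst_elem(255, 0, 0) == 510);  /* last pair of a 512-wide row */
    assert(first_dst_elem(0,   1, 0) == 512);  /* next y row */
    return 0;
}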
18 changes: 18 additions & 0 deletions ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp
@@ -86,10 +86,28 @@ void quantize() {
     vals = round(vals * d_inv);
 
 #ifndef QBLOCK_X4
+#if defined(ADRENO)
+    i8vec4 q = i8vec4(round(vals));
+    data_b[ib].qs[iqs] =
+        int((uint(q.x) & 0xFFu) |
+            ((uint(q.y) & 0xFFu) << 8) |
+            ((uint(q.z) & 0xFFu) << 16) |
+            ((uint(q.w) & 0xFFu) << 24));
+#else
     data_b[ib].qs[iqs] = pack32(i8vec4(round(vals)));
+#endif
 #else
+#if defined(ADRENO)
+    i8vec4 q = i8vec4(round(vals));
+    data_b[ibx4_outer].qs[ibx4_inner * 8 + iqs] =
+        int((uint(q.x) & 0xFFu) |
+            ((uint(q.y) & 0xFFu) << 8) |
+            ((uint(q.z) & 0xFFu) << 16) |
+            ((uint(q.w) & 0xFFu) << 24));
+#else
     data_b[ibx4_outer].qs[ibx4_inner * 8 + iqs] = pack32(i8vec4(round(vals)));
+#endif
 #endif
 
 #ifndef USE_SUBGROUPS
     barrier();
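
Both branches replace pack32(i8vec4(...)) with explicit mask-and-shift byte packing on Adreno. A C sketch (little-endian assumed) showing the packed word comes out the same either way:

#include <assert.h>
#include <stdint.h>
#include <string.h>

int main(void) {
    const int8_t q[4] = { -1, 2, -3, 4 };  /* rounded quants, like i8vec4 q */

    /* ADRENO-style: mask each byte and shift it into place */
    const uint32_t manual = ((uint32_t)(uint8_t)q[0])
                          | ((uint32_t)(uint8_t)q[1] << 8)
                          | ((uint32_t)(uint8_t)q[2] << 16)
                          | ((uint32_t)(uint8_t)q[3] << 24);

    /* pack32-style: reinterpret the four bytes as one 32-bit word */
    uint32_t packed;
    memcpy(&packed, q, sizeof packed);

    assert(manual == packed);  /* 0x04FD02FF on little-endian targets */
    return 0;
}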