Skip to content

Commit 3db975e

Browse files
committed
Refactor and optimize various components in the ggml library
- Removed duplicate enum entries in ggml-metal and ggml.c files. - Enhanced kernel functions in ggml-metal.metal for better performance and clarity. - Streamlined SYCL element-wise operations in element_wise.cpp, consolidating redundant code. - Cleaned up Vulkan CMake configuration and source files, eliminating unnecessary lines. - Improved llama batch processing logic in llama-batch.cpp and llama-batch.h for better efficiency. - Simplified memory management in llama-memory.h and llama-kv-cache-unified.cpp. - Removed outdated comments and redundant code across multiple files for clarity. - Adjusted server task handling in server.cpp to improve batch processing and error handling.
2 parents 2f01628 + a0374a6 commit 3db975e

27 files changed

+167
-422
lines changed

common/CMakeLists.txt

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,6 @@ llama_add_compile_flags()
77
# Build info header
88
#
99

10-
if(EXISTS "${PROJECT_SOURCE_DIR}/.git")
11-
set(GIT_DIR "${PROJECT_SOURCE_DIR}/.git")
1210
if(EXISTS "${PROJECT_SOURCE_DIR}/.git")
1311
set(GIT_DIR "${PROJECT_SOURCE_DIR}/.git")
1412

ggml/src/ggml-cann/common.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,6 @@
3838
#include <unistd.h>
3939
#include <functional>
4040
#include <optional>
41-
#include <optional>
4241

4342
#include "../include/ggml-cann.h"
4443
#include "../include/ggml.h"

ggml/src/ggml-cpu/CMakeLists.txt

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -465,7 +465,6 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
465465
endif()
466466

467467
if (GGML_VXE)
468-
message(STATUS "VX/VXE/VXE2 enabled")
469468
message(STATUS "VX/VXE/VXE2 enabled")
470469
list(APPEND ARCH_FLAGS -mvx -mzvector)
471470
list(APPEND ARCH_DEFINITIONS GGML_VXE)
@@ -481,8 +480,6 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
481480
else()
482481
message(WARNING "Unknown CPU architecture. Falling back to generic implementations.")
483482
list(APPEND ARCH_FLAGS -DGGML_CPU_GENERIC)
484-
message(WARNING "Unknown CPU architecture. Falling back to generic implementations.")
485-
list(APPEND ARCH_FLAGS -DGGML_CPU_GENERIC)
486483
endif()
487484

488485
if (GGML_CPU_REPACK)

ggml/src/ggml-cpu/llamafile/sgemm.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,6 @@
6363
#define NOINLINE __attribute__((__noinline__))
6464
#endif
6565

66-
#if defined(__ARM_NEON) || defined(__AVX512F__) || defined(__VXE__) || defined(__VXE2__)
6766
#if defined(__ARM_NEON) || defined(__AVX512F__) || defined(__VXE__) || defined(__VXE2__)
6867
#define VECTOR_REGISTERS 32
6968
#else

ggml/src/ggml-cuda/common.cuh

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -255,7 +255,8 @@ static bool fp16_mma_available(const int cc) {
255255
return false;
256256
#else
257257
if ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) ||
258-
GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc)) {
258+
GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) ||
259+
GGML_CUDA_CC_IS_MTHREADS(cc)) {
259260
return true;
260261
} else if (GGML_CUDA_CC_IS_RDNA4(cc)) {
261262
#if defined(GGML_HIP_ROCWMMA_FATTN) && defined(GGML_HIP_ROCWMMA_FATTN_GFX12)
@@ -276,6 +277,14 @@ static bool fp16_mma_hardware_available(const int cc) {
276277
(GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_QY2);
277278
}
278279

280+
static bool bf16_mma_hardware_available(const int cc) {
281+
return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_AMPERE) || GGML_CUDA_CC_IS_CDNA(cc) || cc >= GGML_CUDA_CC_RDNA3;
282+
}
283+
284+
static bool fp32_mma_hardware_available(const int cc) {
285+
return GGML_CUDA_CC_IS_CDNA(cc);
286+
}
287+
279288
// Volta technically had FP16 tensor cores but they work very differently compared to Turing and later.
280289
static bool new_mma_available(const int cc) {
281290
return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_TURING;

ggml/src/ggml-cuda/mmv.cu

Lines changed: 0 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
#include "common.cuh"
33
#include "mmv.cuh"
44

5-
template <typename T, typename type_acc, int ncols_dst, int block_size>
65
template <typename T, typename type_acc, int ncols_dst, int block_size>
76
static __global__ void mul_mat_vec(
87
const T * __restrict__ x, const float * __restrict__ y, const int32_t * __restrict__ ids, float * __restrict__ dst,
@@ -16,25 +15,10 @@ static __global__ void mul_mat_vec(
1615
const int sample_dst = blockIdx.z;
1716
const int sample_x = sample_dst / sample_ratio;
1817
const int sample_y = sample_dst;
19-
const int tid = threadIdx.x;
20-
21-
const int ncols2, const int nchannels_y, const int stride_row, const int stride_col_y2, const int stride_col_dst,
22-
const int channel_ratio, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
23-
const int sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst) {
24-
const int row = blockIdx.x;
25-
const int channel_dst = blockIdx.y;
26-
const int channel_x = ids ? ids[channel_dst] : channel_dst / channel_ratio;
27-
const int channel_y = ids ? channel_dst % nchannels_y : channel_dst;
28-
const int sample_dst = blockIdx.z;
29-
const int sample_x = sample_dst / sample_ratio;
30-
const int sample_y = sample_dst;
3118
const int tid = threadIdx.x;
3219

3320
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
3421

35-
x += int64_t(sample_x) *stride_sample_x + channel_x *stride_channel_x + row*stride_row;
36-
y += int64_t(sample_y) *stride_sample_y + channel_y *stride_channel_y;
37-
dst += int64_t(sample_dst)*stride_sample_dst + channel_dst*stride_channel_dst;
3822
x += int64_t(sample_x) *stride_sample_x + channel_x *stride_channel_x + row*stride_row;
3923
y += int64_t(sample_y) *stride_sample_y + channel_y *stride_channel_y;
4024
dst += int64_t(sample_dst)*stride_sample_dst + channel_dst*stride_channel_dst;

ggml/src/ggml-metal/ggml-metal.m

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -534,7 +534,6 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
534534
GGML_METAL_KERNEL_TYPE_GEGLU_QUICK,
535535
GGML_METAL_KERNEL_TYPE_SUM_ROWS,
536536
GGML_METAL_KERNEL_TYPE_MEAN,
537-
GGML_METAL_KERNEL_TYPE_MEAN,
538537
GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32,
539538
GGML_METAL_KERNEL_TYPE_POOL_2D_MAX_F32,
540539
GGML_METAL_KERNEL_TYPE_ARGMAX,

ggml/src/ggml-metal/ggml-metal.metal

Lines changed: 21 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1345,7 +1345,14 @@ kernel void kernel_sum_rows(
13451345
shmem_f32[sgitg] = sumf;
13461346
}
13471347

1348-
dst_row[0] = row_sum;
1348+
threadgroup_barrier(mem_flags::mem_threadgroup);
1349+
1350+
sumf = shmem_f32[tiisg];
1351+
sumf = simd_sum(sumf);
1352+
1353+
if (tpitg.x == 0) {
1354+
dst_row[0] = norm ? sumf / args.ne00 : sumf;
1355+
}
13491356
}
13501357

13511358
typedef decltype(kernel_sum_rows<false>) kernel_sum_rows_t;
@@ -1464,18 +1471,14 @@ kernel void kernel_soft_max_4(
14641471
uint3 tpitg[[thread_position_in_threadgroup]],
14651472
uint sgitg[[simdgroup_index_in_threadgroup]],
14661473
uint tiisg[[thread_index_in_simdgroup]],
1467-
uint3 tptg[[threads_per_threadgroup]]) {
1468-
const int32_t i03 = tgpig.z;
1469-
const int32_t i02 = tgpig.y;
1470-
const int32_t i01 = tgpig.x;
1471-
1472-
const int32_t i13 = i03%args.ne13;
1473-
const int32_t i12 = i02%args.ne12;
1474-
const int32_t i11 = i01;
1474+
uint ntg[[threads_per_threadgroup]]) {
1475+
const int64_t i03 = (tgpig) / (args.ne02*args.ne01);
1476+
const int64_t i02 = (tgpig - i03*args.ne02*args.ne01) / args.ne01;
1477+
const int64_t i01 = (tgpig - i03*args.ne02*args.ne01 - i02*args.ne01);
14751478

1476-
device const float4 * psrc4 = (device const float4 *) (src0 + i01*args.nb01 + i02*args.nb02 + i03*args.nb03);
1477-
device const T * pmask = src1 != src0 ? (device const T * ) (src1 + i11*args.nb11 + i12*args.nb12 + i13*args.nb13) : nullptr;
1478-
device float4 * pdst4 = (device float4 *) (dst + i01*args.nb1 + i02*args.nb2 + i03*args.nb3);
1479+
device const float4 * psrc4 = (device const float4 *) src0 + (i03*args.ne02*args.ne01*args.ne00 + i02*args.ne01*args.ne00 + i01*args.ne00)/4;
1480+
device const T * pmask = src1 != src0 ? (device const T *) src1 + i01*args.ne00/4 : nullptr;
1481+
device float4 * pdst4 = (device float4 *) dst + (i03*args.ne02*args.ne01*args.ne00 + i02*args.ne01*args.ne00 + i01*args.ne00)/4;
14791482

14801483
float slope = 1.0f;
14811484

@@ -4145,8 +4148,9 @@ kernel void kernel_flash_attn_ext(
41454148
ss[j*TS + 0] = S;
41464149
ss[j*TS + 1] = M;
41474150

4148-
ss[j*TS + 2*C + j - 1*SH] = ms0;
4149-
ss[j*TS + 2*C + j ] = ms1;
4151+
ss[j*TS + 2*C + j ] = ms0;
4152+
ss[j*TS + 2*C + j + sg*SH] = ms1;
4153+
}
41504154
}
41514155

41524156
//simdgroup_barrier(mem_flags::mem_threadgroup);
@@ -4175,7 +4179,9 @@ kernel void kernel_flash_attn_ext(
41754179
threadgroup_barrier(mem_flags::mem_threadgroup);
41764180
}
41774181

4178-
threadgroup s_t * sf = (threadgroup s_t *) (shmem_f16 + 2*(nsg-1)*SH + 2*Q*DK);
4182+
threadgroup_barrier(mem_flags::mem_threadgroup);
4183+
4184+
threadgroup s_t * sf = (threadgroup s_t *) (shmem_f16 + 2*Q*DK);
41794185

41804186
// final rescale with 1/S and store to global memory
41814187
for (short j = sgitg; j < Q && iq1 + j < args.ne01; j += nsg) {

ggml/src/ggml-sycl/element_wise.cpp

Lines changed: 37 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -937,70 +937,45 @@ inline void ggml_sycl_op_step(ggml_backend_sycl_context & ctx, ggml_tensor * dst
937937
}
938938
}
939939

940-
static inline void ggml_sycl_op_step(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
941-
ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
942-
[](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
943-
const int num_blocks = ceil_div(k_elements, SYCL_NEG_BLOCK_SIZE); // Using NEG block size
944-
sycl_parallel_for(stream,
945-
sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_NEG_BLOCK_SIZE),
946-
sycl::range<1>(SYCL_NEG_BLOCK_SIZE)),
947-
[=](sycl::nd_item<1> item_ct1) {
948-
unary_op_step_kernel(src, dst_ptr, k_elements, item_ct1);
949-
});
950-
});
951-
}
952-
953-
static inline void ggml_sycl_op_sigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
954-
ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
955-
[](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
956-
const int num_blocks = ceil_div(k_elements, SYCL_SIGMOID_BLOCK_SIZE);
957-
sycl_parallel_for(stream,
958-
sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SIGMOID_BLOCK_SIZE),
959-
sycl::range<1>(SYCL_SIGMOID_BLOCK_SIZE)),
960-
[=](sycl::nd_item<1> item_ct1) {
961-
unary_op_sigmoid_kernel(src, dst_ptr, k_elements, item_ct1);
962-
});
963-
});
964-
}
965-
966-
static inline void ggml_sycl_op_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
967-
ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
968-
[](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
969-
const int num_blocks = ceil_div(k_elements, SYCL_SQRT_BLOCK_SIZE);
970-
sycl_parallel_for(stream,
971-
sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SQRT_BLOCK_SIZE),
972-
sycl::range<1>(SYCL_SQRT_BLOCK_SIZE)),
973-
[=](sycl::nd_item<1> item_ct1) {
974-
unary_op_sqrt_kernel(src, dst_ptr, k_elements, item_ct1);
975-
});
976-
});
977-
}
978-
979-
static inline void ggml_sycl_op_sin(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
980-
ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
981-
[](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
982-
const int num_blocks = ceil_div(k_elements, SYCL_SIN_BLOCK_SIZE);
983-
sycl_parallel_for(stream,
984-
sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SIN_BLOCK_SIZE),
985-
sycl::range<1>(SYCL_SIN_BLOCK_SIZE)),
986-
[=](sycl::nd_item<1> item_ct1) {
987-
unary_op_sin_kernel(src, dst_ptr, k_elements, item_ct1);
988-
});
989-
});
940+
inline void ggml_sycl_op_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
941+
#if defined (GGML_SYCL_F16)
942+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
943+
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
944+
#else
945+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
946+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
947+
#endif
948+
GGML_ASSERT(dst->src[0]->type == dst->type);
949+
dpct::queue_ptr main_stream = ctx.stream();
950+
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
951+
switch (dst->type) {
952+
#if defined (GGML_SYCL_F16)
953+
case GGML_TYPE_F16:
954+
{
955+
auto data_pts = cast_data<sycl::half>(dst);
956+
neg_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
957+
break;
958+
}
959+
#endif
960+
case GGML_TYPE_F32:
961+
{
962+
auto data_pts = cast_data<float>(dst);
963+
neg_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
964+
break;
965+
}
966+
default:
967+
GGML_ABORT("GGML tensor type not supported!\n");
968+
}
990969
}
991970

992-
static inline void ggml_sycl_op_cos(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
993-
ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
994-
[](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
995-
const int num_blocks = ceil_div(k_elements, SYCL_SIN_BLOCK_SIZE); // Using SIN block size
996-
sycl_parallel_for(stream,
997-
sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SIN_BLOCK_SIZE),
998-
sycl::range<1>(SYCL_SIN_BLOCK_SIZE)),
999-
[=](sycl::nd_item<1> item_ct1) {
1000-
unary_op_cos_kernel(src, dst_ptr, k_elements, item_ct1);
1001-
});
1002-
});
1003-
}
971+
inline void ggml_sycl_op_leaky_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
972+
#if defined (GGML_SYCL_F16)
973+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
974+
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
975+
#else
976+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
977+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
978+
#endif
1004979

1005980
static inline void ggml_sycl_op_leaky_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
1006981
float negative_slope;

ggml/src/ggml-vulkan/CMakeLists.txt

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,6 @@ if (Vulkan_FOUND)
5050
)
5151

5252
set(VULKAN_SHADER_GEN_CMAKE_ARGS "")
53-
set(VULKAN_SHADER_GEN_CMAKE_ARGS "")
5453

5554
# Test all shader extensions
5655
test_shader_extension_support(
@@ -178,7 +177,6 @@ if (Vulkan_FOUND)
178177
add_custom_command(
179178
OUTPUT ${_ggml_vk_header}
180179
${_ggml_vk_source}
181-
${_ggml_vk_source}
182180

183181
COMMAND ${_ggml_vk_genshaders_cmd}
184182
--glslc ${Vulkan_GLSLC_EXECUTABLE}

0 commit comments

Comments
 (0)