Skip to content

Commit 3a56148

Browse files
committed
Merge branch 'ggml-org-master'
2 parents 985b5fc + 7be0eec commit 3a56148

File tree

8 files changed

+272
-226
lines changed

8 files changed

+272
-226
lines changed

ggml/CMakeLists.txt

Lines changed: 37 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,41 @@
11
cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
22
project("ggml" C CXX ASM)
3+
4+
### GGML Version
# The version is assembled from the MAJOR/MINOR/PATCH components below plus an
# optional development suffix; GGML_VERSION_BASE is "MAJOR.MINOR.PATCH".
5+
set(GGML_VERSION_MAJOR 0)
6+
set(GGML_VERSION_MINOR 9)
7+
set(GGML_VERSION_PATCH 0)
8+
set(GGML_VERSION_DEV "-dev") # "-dev" for development, "" for releases
9+
set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")
10+
11+
# git is looked up without REQUIRED: when it is not found, the git queries
# below are skipped and GGML_BUILD_COMMIT falls back to "unknown".
find_program(GIT_EXE NAMES git git.exe NO_CMAKE_FIND_ROOT_PATH)
12+
if(GIT_EXE)
13+
# Get current git commit hash
# (ERROR_QUIET suppresses git's stderr; on failure the variable stays empty.)
14+
execute_process(COMMAND ${GIT_EXE} rev-parse --short HEAD
15+
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
16+
OUTPUT_VARIABLE GGML_BUILD_COMMIT
17+
OUTPUT_STRIP_TRAILING_WHITESPACE
18+
ERROR_QUIET
19+
)
20+
21+
# Check if the working directory is dirty (i.e., has uncommitted changes)
# GGML_GIT_DIRTY receives the command's exit status, not its output.
# NOTE(review): any non-zero status sets the "-dirty" suffix below, which
# presumably includes git failing outside a repository (e.g. a source
# tarball) - confirm that is intended.
22+
execute_process(COMMAND ${GIT_EXE} diff-index --quiet HEAD -- .
23+
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
24+
RESULT_VARIABLE GGML_GIT_DIRTY
25+
ERROR_QUIET
26+
)
27+
endif()
28+
29+
# Build the version string with optional -dev suffix and dirty flag
30+
set(GGML_VERSION "${GGML_VERSION_BASE}${GGML_VERSION_DEV}")
31+
if(GGML_GIT_DIRTY AND NOT GGML_GIT_DIRTY EQUAL 0)
32+
set(GGML_VERSION "${GGML_VERSION}-dirty")
33+
endif()
34+
35+
# Fallback when git was not found or rev-parse produced no output.
if(NOT GGML_BUILD_COMMIT)
36+
set(GGML_BUILD_COMMIT "unknown")
37+
endif()
38+
339
include(CheckIncludeFileCXX)
440

541
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
@@ -300,26 +336,6 @@ endif()
300336
# Create CMake package
301337
#
302338

303-
# Generate version info based on git commit.
304-
305-
if(NOT DEFINED GGML_BUILD_NUMBER)
306-
find_program(GIT_EXE NAMES git git.exe REQUIRED NO_CMAKE_FIND_ROOT_PATH)
307-
execute_process(COMMAND ${GIT_EXE} rev-list --count HEAD
308-
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
309-
OUTPUT_VARIABLE GGML_BUILD_NUMBER
310-
OUTPUT_STRIP_TRAILING_WHITESPACE
311-
)
312-
313-
if(GGML_BUILD_NUMBER EQUAL 1)
314-
message(WARNING "GGML build version fixed at 1 likely due to a shallow clone.")
315-
endif()
316-
317-
execute_process(COMMAND ${GIT_EXE} rev-parse --short HEAD
318-
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
319-
OUTPUT_VARIABLE GGML_BUILD_COMMIT
320-
OUTPUT_STRIP_TRAILING_WHITESPACE
321-
)
322-
endif()
323339

324340

325341
# Capture variables prefixed with GGML_.
@@ -348,7 +364,7 @@ set(GGML_VARIABLES_EXPANDED ${variable_set_statements})
348364

349365
# Create the CMake package and set install location.
350366

351-
set(GGML_INSTALL_VERSION 0.0.${GGML_BUILD_NUMBER})
367+
set(GGML_INSTALL_VERSION ${GGML_VERSION})
352368
set(GGML_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location of header files")
353369
set(GGML_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files")
354370
set(GGML_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files")

ggml/src/ggml-cuda/CMakeLists.txt

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -33,13 +33,14 @@ if (CUDAToolkit_FOUND)
3333
if (GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
3434
# FP16 support (Pascal and newer)
3535
list(APPEND ARCH_LIST "60-virtual")
36-
else()
37-
# Maxwell and newer
38-
list(APPEND ARCH_LIST "50-virtual")
36+
endif()
37+
38+
if (CUDAToolkit_VERSION VERSION_LESS "13")
39+
list(APPEND ARCH_LIST 50-virtual 61-virtual 70-virtual)
3940
endif()
4041

4142
# Always included after base architecture assuming CUDA toolkit version is 11.1 or higher
42-
list(APPEND ARCH_LIST "61-virtual" "70-virtual" "75-virtual" "80-virtual" "86-real")
43+
list(APPEND ARCH_LIST "75-virtual" "80-virtual" "86-real")
4344

4445
# Version-dependent architectures for newer GPUs
4546
if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.8")

ggml/src/ggml-vulkan/ggml-vulkan.cpp

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1185,6 +1185,14 @@ struct vk_staging_memcpy {
11851185
size_t n;
11861186
};
11871187

1188+
// Record of a host-side memset that has been queued instead of executed
// immediately: it stores the arguments (destination, value, byte count) that
// are later passed to memset() when the owning context's `memsets` list is
// replayed.
struct vk_staging_memset {
1189+
vk_staging_memset(void * _dst, uint32_t _val, size_t _n) : dst(_dst), val(_val), n(_n) {}
1190+
1191+
void * dst;   // destination address handed to memset()
1192+
uint32_t val; // fill value; memset() converts it to unsigned char when replayed
1193+
size_t n;     // number of bytes to set
1194+
};
1195+
11881196
struct vk_context_struct {
11891197
vk_submission * s;
11901198
std::vector<vk_sequence> seqs;
@@ -1193,6 +1201,7 @@ struct vk_context_struct {
11931201

11941202
std::vector<vk_staging_memcpy> in_memcpys;
11951203
std::vector<vk_staging_memcpy> out_memcpys;
1204+
std::vector<vk_staging_memset> memsets;
11961205

11971206
vk_command_pool * p {};
11981207
};
@@ -1584,7 +1593,9 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
15841593
}
15851594

15861595
vk::ComputePipelineCreateInfo compute_pipeline_create_info(
1587-
vk::PipelineCreateFlags{},
1596+
device->pipeline_executable_properties_support ?
1597+
vk::PipelineCreateFlagBits::eCaptureStatisticsKHR :
1598+
vk::PipelineCreateFlags{},
15881599
pipeline_shader_create_info,
15891600
pipeline->layout);
15901601

@@ -5194,6 +5205,14 @@ static void deferred_memcpy(void * dst, const void * src, size_t size, std::vect
51945205
}
51955206
}
51965207

5208+
// Sets `size` bytes at `dst` to `val`, either immediately or deferred.
//
// dst     - destination address
// val     - fill value
// size    - number of bytes to set
// memsets - when nullptr (the default) the memset is performed right away;
//           otherwise a vk_staging_memset record is appended to this vector
//           and nothing is written yet.
//
// NOTE(review): memset() converts `val` to unsigned char, so only its low
// 8 bits are used, whereas the fillBuffer fallback in
// ggml_vk_buffer_memset_async is passed the full 32-bit value - confirm
// callers always pass a byte-replicated pattern.
static void deferred_memset(void * dst, uint32_t val, size_t size, std::vector<vk_staging_memset>* memsets = nullptr) {
5209+
if (memsets == nullptr) {
5210+
memset(dst, val, size);
5211+
} else {
5212+
memsets->emplace_back(dst, val, size);
5213+
}
5214+
}
5215+
51975216
static void ggml_vk_ensure_sync_staging_buffer(vk_device& device, size_t size) {
51985217
if (device->sync_staging == nullptr || device->sync_staging->size < size) {
51995218
VK_LOG_MEMORY("ggml_vk_ensure_sync_staging_buffer(" << size << ")");
@@ -5389,6 +5408,10 @@ static void ggml_vk_buffer_write_2d(vk_buffer& dst, size_t offset, const void *
53895408
memcpy(cpy.dst, cpy.src, cpy.n);
53905409
}
53915410

5411+
for (auto& mset : subctx->memsets) {
5412+
memset(mset.dst, mset.val, mset.n);
5413+
}
5414+
53925415
ggml_vk_submit(subctx, dst->device->fence);
53935416
VK_CHECK(dst->device->device.waitForFences({ dst->device->fence }, true, UINT64_MAX), "vk_buffer_write_2d waitForFences");
53945417
dst->device->device.resetFences({ dst->device->fence });
@@ -5528,12 +5551,25 @@ static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& sr
55285551
// Schedules a fill of `size` bytes starting at `offset` in `dst` with value
// `c`, using context `ctx`.
//
// For a host-visible buffer on a device flagged `uma`, the fill is queued as
// a host-side memset in ctx->memsets (via deferred_memset) and no GPU command
// is recorded. Otherwise a fillBuffer command is recorded into the context's
// command buffer.
static void ggml_vk_buffer_memset_async(vk_context& ctx, vk_buffer& dst, size_t offset, uint32_t c, size_t size) {
55295552
VK_LOG_DEBUG("ggml_vk_buffer_memset_async(" << offset << ", " << c << ", " << size << ")");
55305553

5554+
if (dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible &&
5555+
dst->device->uma) {
5556+
// dst->ptr is presumably the buffer's mapped host address - TODO confirm
deferred_memset((uint8_t*)dst->ptr + offset, c, size, &ctx->memsets);
5557+
return;
5558+
}
5559+
5560+
// Fall back to GPU fillBuffer for non-UMA or non-host-visible buffers
55315561
ctx->s->buffer.fillBuffer(dst->buffer, offset, size, c);
55325562
}
55335563

55345564
static void ggml_vk_buffer_memset(vk_buffer& dst, size_t offset, uint32_t c, size_t size) {
55355565
VK_LOG_DEBUG("ggml_vk_buffer_memset(" << offset << ", " << c << ", " << size << ")");
55365566

5567+
if (dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible &&
5568+
dst->device->uma) {
5569+
memset((uint8_t*)dst->ptr + offset, c, size);
5570+
return;
5571+
}
5572+
55375573
std::lock_guard<std::recursive_mutex> guard(dst->device->mutex);
55385574
vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue.cmd_pool);
55395575
ggml_vk_ctx_begin(dst->device, subctx);
@@ -11168,6 +11204,10 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_cgraph *
1116811204
memcpy(cpy.dst, cpy.src, cpy.n);
1116911205
}
1117011206

11207+
for (auto& mset : subctx->memsets) {
11208+
memset(mset.dst, mset.val, mset.n);
11209+
}
11210+
1117111211
if (almost_ready && !ctx->almost_ready_fence_pending && !use_fence) {
1117211212
ggml_vk_submit(subctx, ctx->almost_ready_fence);
1117311213
ctx->almost_ready_fence_pending = true;
@@ -11190,6 +11230,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_cgraph *
1119011230
}
1119111231
subctx->in_memcpys.clear();
1119211232
subctx->out_memcpys.clear();
11233+
subctx->memsets.clear();
1119311234
}
1119411235

1119511236
return true;

ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -31,10 +31,10 @@
3131
#include "types.comp"
3232

3333
#ifndef LOAD_VEC_A
34-
#define LOAD_VEC_A 1
34+
#define LOAD_VEC_A 2
3535
#endif
3636
#ifndef LOAD_VEC_B
37-
#define LOAD_VEC_B 1
37+
#define LOAD_VEC_B 2
3838
#endif
3939

4040
#if !defined(TO_FLOAT_TYPE)
@@ -98,13 +98,13 @@ layout (constant_id = 9) const uint TK = 1; // Only needed for coopmat
9898
layout (constant_id = 10) const uint WARP = 32;
9999

100100
#ifdef COOPMAT
101-
#define SHMEM_STRIDE (BK + 8)
101+
#define SHMEM_STRIDE (BK / 2 + 4)
102102
#else
103-
#define SHMEM_STRIDE (BK + 1)
103+
#define SHMEM_STRIDE (BK / 2 + 1)
104104
#endif
105105

106-
shared FLOAT_TYPE buf_a[BM * SHMEM_STRIDE];
107-
shared FLOAT_TYPE buf_b[BN * SHMEM_STRIDE];
106+
shared FLOAT_TYPE_VEC2 buf_a[BM * SHMEM_STRIDE];
107+
shared FLOAT_TYPE_VEC2 buf_b[BN * SHMEM_STRIDE];
108108

109109
#define NUM_WARPS (BLOCK_SIZE / WARP)
110110

@@ -302,8 +302,8 @@ void main() {
302302
}
303303
#else
304304
ACC_TYPE sums[WMITER * TM * WNITER * TN];
305-
FLOAT_TYPE cache_a[WMITER * TM];
306-
FLOAT_TYPE cache_b[TN];
305+
FLOAT_TYPE_VEC2 cache_a[WMITER * TM];
306+
FLOAT_TYPE_VEC2 cache_b[TN];
307307

308308
[[unroll]] for (uint i = 0; i < WMITER*TM*WNITER*TN; i++) {
309309
sums[i] = ACC_TYPE(0.0f);
@@ -312,13 +312,13 @@ void main() {
312312

313313
for (uint block = start_k; block < end_k; block += BK) {
314314
[[unroll]] for (uint l = 0; l < BM; l += loadstride_a) {
315-
load_a_to_shmem(pos_a, loadr_a, loadc_a + l, ir * BM + loadc_a + l, block + loadr_a, end_k);
315+
load_a_to_shmem(pos_a, loadr_a, loadc_a + l, ir * BM + loadc_a + l, block, end_k);
316316
}
317317
[[unroll]] for (uint l = 0; l < BN; l += loadstride_b) {
318318
#if !defined(MUL_MAT_ID)
319-
load_b_to_shmem(pos_b, loadr_b, loadc_b + l, ic * BN + loadc_b + l, block + loadr_b, end_k);
319+
load_b_to_shmem(pos_b, loadr_b, loadc_b + l, ic * BN + loadc_b + l, block, end_k);
320320
#else
321-
load_b_to_shmem(pos_b, loadr_b, loadc_b + l, ic, _ne1, block + loadr_b, end_k);
321+
load_b_to_shmem(pos_b, loadr_b, loadc_b + l, ic, _ne1, block, end_k);
322322
#endif
323323
}
324324

@@ -331,17 +331,17 @@ void main() {
331331
[[unroll]] for (uint i = 0; i < BK; i += TK) {
332332
[[unroll]] for (uint cm_row = 0; cm_row < cms_per_row; cm_row++) {
333333
// Load from shared into cache
334-
coopMatLoad(cache_a, buf_a, (warp_r * WM + cm_row * TM) * SHMEM_STRIDE + i, SHMEM_STRIDE, gl_CooperativeMatrixLayoutRowMajor);
334+
coopMatLoad(cache_a, buf_a, (warp_r * WM + cm_row * TM) * SHMEM_STRIDE + i / 2, SHMEM_STRIDE, gl_CooperativeMatrixLayoutRowMajor);
335335

336336
[[unroll]] for (uint cm_col = 0; cm_col < cms_per_col; cm_col++) {
337-
coopMatLoad(cache_b, buf_b, (warp_c * WN + cm_col * TN) * SHMEM_STRIDE + i, SHMEM_STRIDE, gl_CooperativeMatrixLayoutColumnMajor);
337+
coopMatLoad(cache_b, buf_b, (warp_c * WN + cm_col * TN) * SHMEM_STRIDE + i / 2, SHMEM_STRIDE, gl_CooperativeMatrixLayoutColumnMajor);
338338

339339
sums[cm_col * cms_per_row + cm_row] = coopMatMulAdd(cache_a, cache_b, sums[cm_col * cms_per_row + cm_row]);
340340
}
341341
}
342342
}
343343
#else
344-
[[unroll]] for (uint i = 0; i < BK; i++) {
344+
[[unroll]] for (uint i = 0; i < BK / 2; i++) {
345345
// Load from shared into cache
346346
[[unroll]] for (uint wsir = 0; wsir < WMITER; wsir++) {
347347
[[unroll]] for (uint j = 0; j < TM; j++) {
@@ -357,7 +357,7 @@ void main() {
357357
[[unroll]] for (uint cc = 0; cc < TN; cc++) {
358358
[[unroll]] for (uint cr = 0; cr < TM; cr++) {
359359
const uint sums_idx = (wsic * TN + cc) * (WMITER * TM) + wsir * TM + cr;
360-
sums[sums_idx] = fma(ACC_TYPE(cache_a[wsir * TM + cr]), ACC_TYPE(cache_b[cc]), sums[sums_idx]);
360+
sums[sums_idx] = fma(ACC_TYPE(cache_a[wsir * TM + cr].x), ACC_TYPE(cache_b[cc].x), fma(ACC_TYPE(cache_a[wsir * TM + cr].y), ACC_TYPE(cache_b[cc].y), sums[sums_idx]));
361361
}
362362
}
363363
}

0 commit comments

Comments
 (0)