Commit ed4ba31

Merge branch 'master' into imatrix
2 parents: a4166a8 + ce82bd0

File tree: 20 files changed (+651, -230 lines)

.github/workflows/build-cmake-pkg.yml

Lines changed: 51 additions & 0 deletions (new file)
@@ -0,0 +1,51 @@
+name: Build relocatable cmake package
+on:
+    workflow_dispatch:
+    workflow_call:
+
+jobs:
+  linux:
+    runs-on: ubuntu-24.04
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Install dependencies
+        run: |
+          sudo apt update
+          sudo apt install -y build-essential tcl
+
+      - name: Build
+        run: |
+          PREFIX="$(pwd)"/inst
+          cmake -S . -B build -DCMAKE_PREFIX_PATH="$PREFIX" \
+            -DLLAMA_CURL=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=OFF \
+            -DLLAMA_BUILD_EXAMPLES=OFF -DCMAKE_BUILD_TYPE=Release
+          cmake --build build --config Release
+          cmake --install build --prefix "$PREFIX" --config Release
+
+          export LLAMA_CONFIG="$PREFIX"/lib/cmake/llama/llama-config.cmake
+          tclsh <<'EOF'
+            set build(commit) [string trim [exec git rev-parse --short HEAD]]
+            set build(number) [string trim [exec git rev-list --count HEAD]]
+            set build(version) "0.0.$build(number)"
+
+            set llamaconfig [read [open "$env(LLAMA_CONFIG)" r]]
+            set checks [list "set\\(LLAMA_VERSION \\s+$build(version)\\)" \
+                             "set\\(LLAMA_BUILD_COMMIT\\s+$build(commit)\\)" \
+                             "set\\(LLAMA_BUILD_NUMBER\\s+$build(number)\\)"]
+
+            puts -nonewline "Checking llama-config.cmake version... "
+            foreach check $checks {
+                if {![regexp -expanded -- $check $llamaconfig]} {
+                    puts "\"$check\" failed!"
+                    exit 1
+                }
+            }
+            puts "success."
+          EOF
+
+          cd examples/simple-cmake-pkg
+          cmake -S . -B build -DCMAKE_PREFIX_PATH="$PREFIX"/lib/cmake
+          cmake --build build
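
The final step is the end-to-end check: examples/simple-cmake-pkg is configured against the freshly installed prefix, so the job fails if the relocatable package cannot actually be consumed. For illustration only (not part of this commit), a downstream program built that way can be as small as the following sketch, assuming the standard llama.h entry points:

    // Hypothetical downstream smoke test, in the spirit of examples/simple-cmake-pkg:
    // it only verifies that the installed headers resolve and the library links and runs.
    #include "llama.h"
    #include <cstdio>

    int main() {
        llama_backend_init();                            // load ggml backends
        std::printf("%s\n", llama_print_system_info());  // report build capabilities
        llama_backend_free();
        return 0;
    }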

.github/workflows/build.yml

Lines changed: 40 additions & 4 deletions
@@ -5,10 +5,43 @@ on:
   push:
     branches:
       - master
-    paths: ['.github/workflows/build.yml', '.github/workflows/build-linux-cross.yml', '**/CMakeLists.txt', '**/.cmake', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp']
+    paths: [
+      '.github/workflows/build.yml',
+      '.github/workflows/build-linux-cross.yml',
+      '.github/workflows/build-cmake-pkg.yml',
+      '**/CMakeLists.txt',
+      '**/.cmake',
+      '**/*.h',
+      '**/*.hpp',
+      '**/*.c',
+      '**/*.cpp',
+      '**/*.cu',
+      '**/*.cuh',
+      '**/*.swift',
+      '**/*.m',
+      '**/*.metal',
+      '**/*.comp'
+    ]
+
   pull_request:
     types: [opened, synchronize, reopened]
-    paths: ['.github/workflows/build.yml', '.github/workflows/build-linux-cross.yml', '**/CMakeLists.txt', '**/.cmake', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp']
+    paths: [
+      '.github/workflows/build.yml',
+      '.github/workflows/build-linux-cross.yml',
+      '.github/workflows/build-cmake-pkg.yml',
+      '**/CMakeLists.txt',
+      '**/.cmake',
+      '**/*.h',
+      '**/*.hpp',
+      '**/*.c',
+      '**/*.cpp',
+      '**/*.cu',
+      '**/*.cuh',
+      '**/*.swift',
+      '**/*.m',
+      '**/*.metal',
+      '**/*.comp'
+    ]

 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}

@@ -478,6 +511,9 @@ jobs:
   build-linux-cross:
     uses: ./.github/workflows/build-linux-cross.yml

+  build-cmake-pkg:
+    uses: ./.github/workflows/build-cmake-pkg.yml
+
   macOS-latest-cmake-ios:
     runs-on: macos-latest

@@ -683,7 +719,7 @@ jobs:
     env:
       OPENBLAS_VERSION: 0.3.23
       SDE_VERSION: 9.33.0-2024-01-07
-      VULKAN_VERSION: 1.4.309.0
+      VULKAN_VERSION: 1.4.313.2

     strategy:
       matrix:

@@ -736,7 +772,7 @@ jobs:
         id: get_vulkan
         if: ${{ matrix.build == 'kompute-x64' || matrix.build == 'vulkan-x64' }}
         run: |
-          curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
+          curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/vulkansdk-windows-X64-${env:VULKAN_VERSION}.exe"
           & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
           Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
           Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"

.github/workflows/release.yml

Lines changed: 2 additions & 2 deletions
@@ -302,7 +302,7 @@ jobs:

     env:
       OPENBLAS_VERSION: 0.3.23
-      VULKAN_VERSION: 1.4.309.0
+      VULKAN_VERSION: 1.4.313.2

     strategy:
       matrix:

@@ -332,7 +332,7 @@ jobs:
         id: get_vulkan
         if: ${{ matrix.backend == 'vulkan' }}
         run: |
-          curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
+          curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/vulkansdk-windows-X64-${env:VULKAN_VERSION}.exe"
           & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
           Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
           Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"

examples/simple-chat/simple-chat.cpp

Lines changed: 1 addition & 1 deletion
@@ -98,7 +98,7 @@ int main(int argc, char ** argv) {
     auto generate = [&](const std::string & prompt) {
         std::string response;

-        const bool is_first = llama_memory_seq_pos_max(llama_get_memory(ctx), 0) == 0;
+        const bool is_first = llama_memory_seq_pos_max(llama_get_memory(ctx), 0) == -1;

         // tokenize the prompt
         const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true);
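
The one-line change relies on the sentinel semantics of llama_memory_seq_pos_max(): it returns the highest occupied position in the sequence, and -1 when the sequence holds no tokens, so comparing against 0 misclassified a context that already contained a token at position 0. A minimal sketch of the corrected check, wrapped in a hypothetical helper for clarity:

    // Sketch only: -1 (not 0) marks an empty sequence, because position 0 is a
    // valid, occupied slot once the first token has been decoded.
    static bool context_is_empty(llama_context * ctx) {
        return llama_memory_seq_pos_max(llama_get_memory(ctx), 0) == -1;
    }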

ggml/src/ggml-cuda/common.cuh

Lines changed: 16 additions & 2 deletions
@@ -241,8 +241,18 @@ static bool fp16_mma_available(const int cc) {
 #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN)
     return false;
 #else
-    return (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) ||
-        GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc);
+    if ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) ||
+        GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc)) {
+        return true;
+    } else if (GGML_CUDA_CC_IS_RDNA4(cc)) {
+#if defined(GGML_HIP_ROCWMMA_FATTN) && defined(GGML_HIP_ROCWMMA_FATTN_GFX12)
+        return true;
+#else
+        return false;
+#endif // defined(GGML_HIP_ROCWMMA_FATTN) && defined(GGML_HIP_ROCWMMA_FATTN_GFX12)
+    } else {
+        return false;
+    }
 #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN)
 }

@@ -252,6 +262,10 @@ static bool fp16_mma_hardware_available(const int cc) {
         GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc);
 }

+static bool bf16_mma_hardware_available(const int cc) {
+    return GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_AMPERE;
+}
+
 // Volta technically had FP16 tensor cores but they work very differently compared to Turing and later.
 static bool new_mma_available(const int cc) {
     return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_TURING;
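
With these changes, fp16_mma_available() admits RDNA4 only when rocWMMA FlashAttention support is compiled for gfx12, and the new bf16_mma_hardware_available() reports BF16 tensor cores on NVIDIA Ampere or newer. As an illustration of how such capability predicates are typically consumed (not code from this commit), a caller might branch like this:

    // Illustrative dispatcher sketch using the helpers defined above.
    static const char * pick_mma_path(const int cc) {
        if (fp16_mma_available(cc)) {
            return "fp16-mma";   // Volta+ NVIDIA, CDNA, RDNA3, and gated RDNA4
        }
        if (bf16_mma_hardware_available(cc)) {
            return "bf16-mma";   // Ampere or newer hardware
        }
        return "fallback";       // scalar / dp4a kernels
    }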

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 13 additions & 18 deletions
@@ -100,8 +100,7 @@ int ggml_cuda_get_device() {
 static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
     ggml_cuda_set_device(device);
     cudaError_t err;
-    if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr)
-    {
+    if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr) {
         err = cudaMallocManaged(ptr, size);
 #if defined(GGML_USE_HIP)
         if (err == hipSuccess) {

@@ -119,9 +118,7 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
             err = cudaMalloc(ptr, size);
         }
 #endif // defined(GGML_USE_HIP)
-    }
-    else
-    {
+    } else {
         err = cudaMalloc(ptr, size);
     }
     return err;

@@ -1946,16 +1943,14 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
         && ggml_nbytes(src0) != ggml_backend_buffer_get_alloc_size(src0->buffer, src0) && src0->view_src;

     bool use_mul_mat_vec = (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16)
-        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
-        && src0->ne[0] % 2 == 0 && src1->ne[1] == 1;
+        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
     bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) && !bad_padding_clear
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
         && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
     bool use_mul_mat_q = ggml_is_quantized(src0->type) && !bad_padding_clear
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;

-    bool any_gpus_with_slow_fp16   = false;
-    bool any_gpus_without_fp16_mma = false;
+    bool any_gpus_with_slow_fp16 = false;

     if (split) {
         ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;

@@ -1966,16 +1961,16 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
                 continue;
             }

-            const int cc              = ggml_cuda_info().devices[id].cc;
-            use_mul_mat_q             = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
-            any_gpus_with_slow_fp16   = any_gpus_with_slow_fp16   || !fast_fp16_hardware_available(cc);
-            any_gpus_without_fp16_mma = any_gpus_without_fp16_mma || !fp16_mma_hardware_available(cc);
+            const int cc            = ggml_cuda_info().devices[id].cc;
+            use_mul_mat_q           = use_mul_mat_q   && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
+            use_mul_mat_vec         = use_mul_mat_vec && ggml_cuda_should_use_mmv(src0->type, cc, src0->ne, src1->ne[1]);
+            any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
         }
     } else {
-        const int cc              = ggml_cuda_info().devices[ctx.device].cc;
-        use_mul_mat_q             = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
-        any_gpus_with_slow_fp16   = any_gpus_with_slow_fp16   || !fast_fp16_hardware_available(cc);
-        any_gpus_without_fp16_mma = any_gpus_without_fp16_mma || !fp16_mma_hardware_available(cc);
+        const int cc            = ggml_cuda_info().devices[ctx.device].cc;
+        use_mul_mat_q           = use_mul_mat_q   && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
+        use_mul_mat_vec         = use_mul_mat_vec && ggml_cuda_should_use_mmv(src0->type, cc, src0->ne, src1->ne[1]);
+        any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
     }

     // debug helpers

@@ -1986,7 +1981,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
     //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
     //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);

-    if (!split && use_mul_mat_vec && (src0->ne[1] <= MMV_MAX_ROWS || any_gpus_without_fp16_mma)) {
+    if (!split && use_mul_mat_vec) {
         // the custom F16 vector kernel can be used over batched cuBLAS GEMM
         // but this is only faster for GPUs without tensor cores or with a thin src0 matrix (particularly KQV in attention)
         ggml_cuda_mul_mat_vec(ctx, src0, src1, nullptr, dst);
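
The per-shape heuristics that previously lived inline at the dispatch site (src0->ne[0] % 2 == 0, src1->ne[1] == 1, and the MMV_MAX_ROWS / tensor-core check) are now delegated per device to ggml_cuda_should_use_mmv(), whose implementation is not part of this hunk. Under that assumption, a rough, hypothetical reconstruction of what such a helper would have to cover (values and structure assumed, not taken from this commit):

    // Hedged sketch, not the actual implementation: folds the removed inline
    // checks into one per-device decision.
    static bool should_use_mmv_sketch(ggml_type type, int cc, const int64_t * ne, int64_t ne11) {
        (void) type;               // the real helper also filters by tensor type
        if (ne[0] % 2 != 0) {      // kernel consumes two values per thread
            return false;
        }
        if (ne11 != 1) {           // only a single src1 column (matrix-vector)
            return false;
        }
        // thin matrices, or GPUs without FP16 tensor cores, favor the custom kernel
        return ne[1] <= 512 /* former MMV_MAX_ROWS, value assumed */
            || !fp16_mma_hardware_available(cc);
    }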
