Commit fb83394

Merge branch 'ggml-org:master' into master

2 parents: ccfbaec + 6609507

19 files changed: +704 -236 lines


.github/workflows/build.yml (13 additions, 5 deletions)

@@ -664,7 +664,7 @@ jobs:
           ./build-xcframework.sh
 
   windows-msys2:
-    runs-on: windows-latest
+    runs-on: windows-2025
 
     strategy:
       fail-fast: false
@@ -714,7 +714,7 @@
           cmake --build build --config ${{ matrix.build }} -j $(nproc)
 
   windows-latest-cmake:
-    runs-on: windows-latest
+    runs-on: windows-2025
 
     env:
       OPENBLAS_VERSION: 0.3.23
@@ -725,16 +725,22 @@
       matrix:
         include:
           - build: 'cpu-x64 (static)'
+            arch: 'x64'
            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF'
           - build: 'openblas-x64'
+            arch: 'x64'
            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
           - build: 'vulkan-x64'
+            arch: 'x64'
            defines: '-DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON'
           - build: 'llvm-arm64'
+            arch: 'arm64'
            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON'
           - build: 'llvm-arm64-opencl-adreno'
+            arch: 'arm64'
            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON'
           # - build: 'kompute-x64'
+          #   arch: 'x64'
           #   defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON'
 
     steps:
@@ -805,6 +811,8 @@
       - name: libCURL
         id: get_libcurl
         uses: ./.github/actions/windows-setup-curl
+        with:
+          architecture: ${{ matrix.arch == 'x64' && 'win64' || 'win64a' }}
 
       - name: Build
         id: cmake_build
@@ -825,7 +833,7 @@
 
       - name: Test
         id: cmake_test
-        if: ${{ matrix.build != 'llvm-arm64' && matrix.build != 'llvm-arm64-opencl-adreno' }}
+        if: ${{ matrix.arch == 'x64' }}
         run: |
           cd build
           ctest -L main -C Release --verbose --timeout 900
@@ -930,7 +938,7 @@
           cmake --build build --config Release
 
   windows-latest-cmake-sycl:
-    runs-on: windows-latest
+    runs-on: windows-2022
 
     defaults:
       run:
@@ -964,7 +972,7 @@
 
   windows-latest-cmake-hip:
     if: ${{ github.event.inputs.create_release != 'true' }}
-    runs-on: windows-latest
+    runs-on: windows-2022
 
     steps:
       - name: Clone

.github/workflows/release.yml (6 additions, 6 deletions)

@@ -235,7 +235,7 @@ jobs:
           name: llama-bin-ubuntu-vulkan-x64.zip
 
   windows-cpu:
-    runs-on: windows-latest
+    runs-on: windows-2025
 
     strategy:
       matrix:
@@ -271,7 +271,7 @@
        env:
          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
        run: |
-          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" ${{ matrix.arch }}
+          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" ${{ matrix.arch == 'x64' && 'x64' || 'amd64_arm64' }}
          cmake -S . -B build -G "Ninja Multi-Config" ^
            -D CMAKE_TOOLCHAIN_FILE=cmake/${{ matrix.arch }}-windows-llvm.cmake ^
            -DGGML_NATIVE=OFF ^
@@ -288,7 +288,7 @@
          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
        run: |
          Copy-Item $env:CURL_PATH\bin\libcurl-${{ matrix.arch }}.dll .\build\bin\Release\
-          Copy-Item "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC\14.42.34433\debug_nonredist\${{ matrix.arch }}\Microsoft.VC143.OpenMP.LLVM\libomp140.${{ matrix.arch == 'x64' && 'x86_64' || 'aarch64' }}.dll" .\build\bin\Release\
+          Copy-Item "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC\14.44.35112\debug_nonredist\${{ matrix.arch }}\Microsoft.VC143.OpenMP.LLVM\libomp140.${{ matrix.arch == 'x64' && 'x86_64' || 'aarch64' }}.dll" .\build\bin\Release\
          7z a llama-bin-win-cpu-${{ matrix.arch }}.zip .\build\bin\Release\*
 
      - name: Upload artifacts
@@ -298,7 +298,7 @@
          name: llama-bin-win-cpu-${{ matrix.arch }}.zip
 
   windows:
-    runs-on: windows-latest
+    runs-on: windows-2025
 
     env:
       OPENBLAS_VERSION: 0.3.23
@@ -448,7 +448,7 @@
          name: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip
 
   windows-sycl:
-    runs-on: windows-latest
+    runs-on: windows-2022
 
     defaults:
       run:
@@ -520,7 +520,7 @@
          name: llama-bin-win-sycl-x64.zip
 
   windows-hip:
-    runs-on: windows-latest
+    runs-on: windows-2022
 
     strategy:
       matrix:

convert_hf_to_gguf.py (5 additions, 5 deletions)

@@ -936,7 +936,11 @@ def _create_vocab_sentencepiece(self):
         scores: list[float] = [-10000.0] * vocab_size
         toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
 
-        for token_id in range(vocab_size):
+        for token_id in range(tokenizer.vocab_size()):
+            if token_id >= vocab_size:
+                logger.warning(f'ignore tokens from {token_id}: id is out of range, max={vocab_size - 1}')
+                break
+
             piece = tokenizer.IdToPiece(token_id)
             text = piece.encode("utf-8")
             score = tokenizer.GetScore(token_id)
@@ -951,10 +955,6 @@ def _create_vocab_sentencepiece(self):
             elif tokenizer.IsByte(token_id):
                 toktype = SentencePieceTokenTypes.BYTE
 
-            if token_id >= vocab_size:
-                logger.warning(f'ignore tokens from {token_id}: id is out of range, max={vocab_size - 1}')
-                break
-
             tokens[token_id] = text
             scores[token_id] = score
             toktypes[token_id] = toktype

examples/eval-callback/eval-callback.cpp (2 additions, 0 deletions)

@@ -55,6 +55,8 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
             v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]);
         } else if (type == GGML_TYPE_F32) {
             v = *(float *) &data[i];
+        } else if (type == GGML_TYPE_I64) {
+            v = (float) *(int64_t *) &data[i];
         } else if (type == GGML_TYPE_I32) {
             v = (float) *(int32_t *) &data[i];
         } else if (type == GGML_TYPE_I16) {

ggml/include/ggml-cpu.h (1 addition, 0 deletions)

@@ -134,6 +134,7 @@ extern "C" {
 
     GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
 
+    GGML_BACKEND_API void ggml_cpu_fp32_to_fp32(const float *, float *, int64_t);
     GGML_BACKEND_API void ggml_cpu_fp32_to_fp16(const float *, ggml_fp16_t *, int64_t);
     GGML_BACKEND_API void ggml_cpu_fp16_to_fp32(const ggml_fp16_t *, float *, int64_t);
     GGML_BACKEND_API void ggml_cpu_fp32_to_bf16(const float *, ggml_bf16_t *, int64_t);

ggml/include/ggml.h (21 additions, 0 deletions)

@@ -470,6 +470,7 @@ extern "C" {
         GGML_OP_TRANSPOSE,
         GGML_OP_GET_ROWS,
         GGML_OP_GET_ROWS_BACK,
+        GGML_OP_SET_ROWS,
         GGML_OP_DIAG,
         GGML_OP_DIAG_MASK_INF,
         GGML_OP_DIAG_MASK_ZERO,
@@ -687,6 +688,9 @@ extern "C" {
     // true for tensor that is stored in memory as CxWxHxN and has been permuted to WxHxCxN
     GGML_API bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor);
 
+    // true if the elements in dimension 0 are contiguous, or there is just 1 block of elements
+    GGML_API bool ggml_is_contiguous_rows(const struct ggml_tensor * tensor);
+
     GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
     GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
 
@@ -1375,6 +1379,23 @@ extern "C" {
             struct ggml_tensor  * b,  // row indices
             struct ggml_tensor  * c); // data for ggml_get_rows, only used for its shape
 
+    // a   TD  [n_embd, ne1,    ne2,  ne3]
+    // b   TS  [n_embd, n_rows, ne02, ne03] | ne02 == ne2, ne03 == ne3
+    // c   I64 [n_rows, ne11,   ne12, 1]    | c[i] in [0, ne1)
+    //
+    // undefined behavior if destination rows overlap
+    //
+    // broadcast:
+    //   ne2 % ne11 == 0
+    //   ne3 % ne12 == 0
+    //
+    // return view(a)
+    GGML_API struct ggml_tensor * ggml_set_rows(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,  // destination
+            struct ggml_tensor  * b,  // source
+            struct ggml_tensor  * c); // row indices
+
 
     GGML_API struct ggml_tensor * ggml_diag(
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
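
For orientation, here is a minimal usage sketch of the new ggml_set_rows API. It is not part of this commit; the tensor sizes, index values, and thread count are arbitrary, and it assumes the standard ggml context/graph workflow:

#include "ggml.h"
#include "ggml-cpu.h"

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // destination a: 16 rows of 8 floats; source b: 4 rows to scatter into a
    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 8, 16);
    struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 8,  4);
    struct ggml_tensor * c = ggml_new_tensor_1d(ctx, GGML_TYPE_I64, 4); // row indices

    ggml_set_f32(b, 1.0f);
    ((int64_t *) c->data)[0] =  0; // each index must lie in [0, ne1)
    ((int64_t *) c->data)[1] =  3;
    ((int64_t *) c->data)[2] =  7;
    ((int64_t *) c->data)[3] = 15;

    // returns a view of a with the selected rows overwritten by b
    struct ggml_tensor * out = ggml_set_rows(ctx, a, b, c);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, out);
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads=*/4);

    ggml_free(ctx);
    return 0;
}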

ggml/src/ggml-cpu/ggml-cpu.c (10 additions, 0 deletions)

@@ -195,6 +195,7 @@ typedef pthread_t ggml_thread_t;
 
 static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
     [GGML_TYPE_F32] = {
+        .from_float = (ggml_from_float_t) ggml_cpu_fp32_to_fp32,
         .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
         .vec_dot_type = GGML_TYPE_F32,
         .nrows = 1,
@@ -1817,6 +1818,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_get_rows_back(params, tensor);
             } break;
+        case GGML_OP_SET_ROWS:
+            {
+                ggml_compute_forward_set_rows(params, tensor);
+            } break;
         case GGML_OP_DIAG:
             {
                 ggml_compute_forward_diag(params, tensor);
@@ -2170,6 +2175,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
                 n_tasks = n_threads;
             } break;
         case GGML_OP_GET_ROWS:
+        case GGML_OP_SET_ROWS:
            {
                // FIXME: get_rows can use additional threads, but the cost of launching additional threads
                //        decreases performance with GPU offloading
@@ -3124,6 +3130,10 @@ enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct g
    return ggml_graph_compute(cgraph, &cplan);
 }
 
+void ggml_cpu_fp32_to_fp32(const float * x, float * y, int64_t n) {
+    memcpy(y, x, n * sizeof(float));
+}
+
 void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) {
     int64_t i = 0;
 #if defined(__F16C__)
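
With the new F32 entry in the traits table, every destination type reachable through from_float can be handled by a single conversion path; for F32 to F32 the conversion degenerates to a plain memcpy. A small illustrative sketch of how that lookup is used (not code from this commit):

#include <stdio.h>

#include "ggml.h"
#include "ggml-cpu.h"

int main(void) {
    const float src[4] = { 1.0f, 2.0f, 3.0f, 4.0f };
    float       dst[4] = { 0 };

    // look up the row-conversion function for the destination type;
    // for GGML_TYPE_F32 this now dispatches to ggml_cpu_fp32_to_fp32
    ggml_from_float_t const from_float = ggml_get_type_traits_cpu(GGML_TYPE_F32)->from_float;
    from_float(src, dst, 4);

    printf("%g %g %g %g\n", dst[0], dst[1], dst[2], dst[3]); // 1 2 3 4
    return 0;
}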

ggml/src/ggml-cpu/ggml-cpu.cpp (1 addition, 0 deletions)

@@ -416,6 +416,7 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
 
     switch (op->op) {
         case GGML_OP_CPY:
+        case GGML_OP_SET_ROWS:
            return
                op->type != GGML_TYPE_IQ3_XXS &&
                op->type != GGML_TYPE_IQ3_S  &&

ggml/src/ggml-cpu/ops.cpp (77 additions, 19 deletions)

@@ -696,24 +696,8 @@ static void ggml_compute_forward_dup_f32(
     if (ggml_is_contiguous(dst)) {
         // TODO: simplify
         if (nb00 == sizeof(float)) {
-            if (dst->type == GGML_TYPE_F32) {
-                size_t id = 0;
-                const size_t rs = ne00 * nb00;
-                char * dst_ptr = (char *) dst->data;
-
-                for (int i03 = 0; i03 < ne03; i03++) {
-                    for (int i02 = 0; i02 < ne02; i02++) {
-                        id += rs * ir0;
-                        for (int i01 = ir0; i01 < ir1; i01++) {
-                            const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03;
-                            memcpy(dst_ptr + id, src0_ptr, rs);
-                            id += rs;
-                        }
-                        id += rs * (ne01 - ir1);
-                    }
-                }
-            } else if (ggml_get_type_traits_cpu(dst->type)->from_float) {
-                ggml_from_float_t const quantize_row_q = ggml_get_type_traits_cpu(dst->type)->from_float;
+            if (ggml_get_type_traits_cpu(dst->type)->from_float) {
+                ggml_from_float_t const from_float = ggml_get_type_traits_cpu(dst->type)->from_float;
 
                 size_t id = 0;
                 size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type));
@@ -724,7 +708,7 @@ static void ggml_compute_forward_dup_f32(
                         id += rs * ir0;
                         for (int i01 = ir0; i01 < ir1; i01++) {
                             const float * src0_ptr = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
-                            quantize_row_q(src0_ptr, dst_ptr + id, ne00);
+                            from_float(src0_ptr, dst_ptr + id, ne00);
                             id += rs;
                         }
                         id += rs * (ne01 - ir1);
@@ -2300,6 +2284,12 @@ void ggml_compute_forward_repeat(
            {
                ggml_compute_forward_repeat_f32(params, dst);
            } break;
+        // TODO: templateify the implementation and add support for I64
+        //       ref https://github.com/ggml-org/llama.cpp/pull/14274#discussion_r2169492225
+        //case GGML_TYPE_I64:
+        //    {
+        //        ggml_compute_forward_repeat_i64(params, dst);
+        //    } break;
        default:
            {
                GGML_ABORT("fatal error");
@@ -4470,6 +4460,74 @@ void ggml_compute_forward_get_rows(
     //}
 }
 
+static void ggml_compute_forward_set_rows_f32(
+        const ggml_compute_params * params,
+              ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const int64_t nc = ne00;
+    const int64_t nr = ne01;
+
+    assert(ne0 == nc);
+    assert(ne2 == ne02);
+    assert(ne3 == ne03);
+    assert(src0->type == GGML_TYPE_F32);
+    assert(ne02 % ne11 == 0);
+    assert(ne03 % ne12 == 0);
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    // rows per thread
+    const int64_t dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int64_t ir0 = dr*ith;
+    const int64_t ir1 = std::min(ir0 + dr, nr);
+
+    ggml_from_float_t const from_float = ggml_get_type_traits_cpu(dst->type)->from_float;
+
+    for (int64_t i03 = 0; i03 < ne03; ++i03) {
+        for (int64_t i02 = 0; i02 < ne02; ++i02) {
+            for (int64_t i = ir0; i < ir1; ++i) {
+                const int64_t i12 = i03%ne12;
+                const int64_t i11 = i02%ne11;
+                const int64_t i10 = i;
+
+                const int64_t i1 = *(int64_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
+
+                GGML_ASSERT(i1 >= 0 && i1 < ne1);
+
+                from_float(
+                    (const float *) ((char *) src0->data + i*nb01 + i02*nb02 + i03*nb03),
+                    ((char *) dst->data + i1*nb1 + i02*nb2 + i03*nb3), nc);
+            }
+        }
+    }
+}
+
+void ggml_compute_forward_set_rows(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_set_rows_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("src0->type = %d (%s) not supported", src0->type, ggml_type_name(src0->type));
+            }
+    }
+}
+
 // ggml_compute_forward_get_rows_back
 
 static void ggml_compute_forward_get_rows_back_f32_f16(
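
A side note on the threading in ggml_compute_forward_set_rows_f32 above: each worker converts a contiguous chunk of ceil(nr/nth) source rows. A standalone sketch of that partitioning arithmetic, illustrative only, not code from the commit:

#include <stdint.h>
#include <stdio.h>

// same row split as the set_rows kernel above:
// thread ith of nth gets rows [ir0, ir1)
static void row_range(int64_t nr, int ith, int nth, int64_t * ir0, int64_t * ir1) {
    const int64_t dr = (nr + nth - 1)/nth;  // ceil(nr/nth) rows per thread
    *ir0 = dr*ith;
    *ir1 = *ir0 + dr < nr ? *ir0 + dr : nr; // clamp the last chunk
}

int main(void) {
    int64_t ir0, ir1;
    for (int ith = 0; ith < 4; ith++) {
        row_range(10, ith, 4, &ir0, &ir1);  // 10 rows over 4 threads: 3+3+3+1
        printf("thread %d: rows [%lld, %lld)\n", ith, (long long) ir0, (long long) ir1);
    }
    return 0;
}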

ggml/src/ggml-cpu/ops.h (1 addition, 0 deletions)

@@ -53,6 +53,7 @@ void ggml_compute_forward_permute(const struct ggml_compute_params * params, str
 void ggml_compute_forward_transpose(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_get_rows(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_get_rows_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_set_rows(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_diag(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_diag_mask_inf(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_diag_mask_zero(const struct ggml_compute_params * params, struct ggml_tensor * dst);
