
Commit f763866

Merge branch 'master' into feature/gfx120X_targets
2 parents: d768080 + 02082f1


152 files changed: +20601 −13646 lines


.github/workflows/build.yml

Lines changed: 30 additions & 1 deletion
@@ -676,6 +676,35 @@ jobs:
             -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
           cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
 
+  macOS-latest-cmake-visionos:
+    runs-on: macos-latest
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: Dependencies
+        id: depends
+        continue-on-error: true
+        run: |
+          brew update
+
+      - name: Build
+        id: cmake_build
+        run: |
+          sysctl -a
+          cmake -B build -G Xcode \
+            -DGGML_METAL_USE_BF16=ON \
+            -DGGML_METAL_EMBED_LIBRARY=ON \
+            -DLLAMA_BUILD_EXAMPLES=OFF \
+            -DLLAMA_BUILD_TESTS=OFF \
+            -DLLAMA_BUILD_SERVER=OFF \
+            -DCMAKE_SYSTEM_NAME=visionOS \
+            -DCMAKE_OSX_DEPLOYMENT_TARGET=1.0 \
+            -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
+          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
+
   macOS-latest-swift:
     runs-on: macos-latest

@@ -1379,7 +1408,7 @@ jobs:
         id: pack_artifacts
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
         run: |
-          zip -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework
+          zip --symlinks -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework
 
       - name: Upload artifacts
         if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
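The second hunk switches the release packaging step to `zip --symlinks`, so symbolic links inside the xcframework are archived as links instead of being expanded into copies of their targets. A minimal sketch of the difference, assuming a locally built `build-apple/llama.xcframework`; the inspection step is illustrative and not part of the workflow:

```bash
# Store symlinks as symlinks, matching the updated workflow step; without
# --symlinks, zip follows each link and stores a duplicate of the target file,
# which can break framework layouts that rely on relative links.
zip --symlinks -r llama-xcframework.zip build-apple/llama.xcframework

# Inspect the archive; zipinfo marks preserved symlinks with an "l" type.
unzip -Z llama-xcframework.zip | head
```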

CMakeLists.txt

Lines changed: 9 additions & 1 deletion
@@ -29,6 +29,8 @@ else()
     set(LLAMA_STANDALONE OFF)
 endif()
 
+option(LLAMA_USE_SYSTEM_GGML "Use system libggml" OFF)
+
 if (EMSCRIPTEN)
     set(BUILD_SHARED_LIBS_DEFAULT OFF)

@@ -145,7 +147,13 @@ endif()
 # 3rd-party
 #
 
-if (NOT TARGET ggml)
+if (LLAMA_USE_SYSTEM_GGML)
+    message(STATUS "Using system-provided libggml, skipping ggml build")
+    find_package(ggml REQUIRED)
+    add_library(ggml ALIAS ggml::ggml)
+endif()
+
+if (NOT TARGET ggml AND NOT LLAMA_USE_SYSTEM_GGML)
     add_subdirectory(ggml)
     # ... otherwise assume ggml is added by a parent CMakeLists.txt
 endif()
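The new `LLAMA_USE_SYSTEM_GGML` option lets the build link against an already installed libggml instead of the vendored `ggml/` subdirectory. A hedged configuration sketch, assuming ggml was previously installed to a prefix such as `$HOME/ggml-install`; the prefix path and build type are illustrative:

```bash
# Configure llama.cpp against a system-provided ggml; find_package(ggml) will
# search CMAKE_PREFIX_PATH for the installed ggml CMake package.
cmake -B build \
    -DLLAMA_USE_SYSTEM_GGML=ON \
    -DCMAKE_PREFIX_PATH="$HOME/ggml-install" \
    -DCMAKE_BUILD_TYPE=Release
cmake --build build --config Release
```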

build-xcframework.sh

Lines changed: 4 additions & 4 deletions
@@ -432,8 +432,8 @@ cmake -B build-visionos -G Xcode \
     -DCMAKE_SYSTEM_NAME=visionOS \
     -DCMAKE_OSX_SYSROOT=xros \
     -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xros \
-    -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 -Du_int=unsigned\ int -Du_char=unsigned\ char -Du_short=unsigned\ short ${COMMON_C_FLAGS}" \
-    -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 -Du_int=unsigned\ int -Du_char=unsigned\ char -Du_short=unsigned\ short ${COMMON_CXX_FLAGS}" \
+    -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \
+    -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \
     -S .
 cmake --build build-visionos --config Release -- -quiet
 
@@ -445,8 +445,8 @@ cmake -B build-visionos-sim -G Xcode \
     -DCMAKE_SYSTEM_NAME=visionOS \
     -DCMAKE_OSX_SYSROOT=xrsimulator \
     -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xrsimulator \
-    -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 -Du_int=unsigned\ int -Du_char=unsigned\ char -Du_short=unsigned\ short ${COMMON_C_FLAGS}" \
-    -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 -Du_int=unsigned\ int -Du_char=unsigned\ char -Du_short=unsigned\ short ${COMMON_CXX_FLAGS}" \
+    -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \
+    -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \
     -S .
 cmake --build build-visionos-sim --config Release -- -quiet
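With the `u_int`/`u_char`/`u_short` defines dropped, the visionOS slices are configured with only `-D_XOPEN_SOURCE=700` plus the common flags. A hedged usage sketch; it assumes the script is run from the repository root with Xcode and the visionOS SDKs installed:

```bash
# Build all Apple slices, including the simplified visionOS configurations.
./build-xcframework.sh

# The visionOS device and simulator build trees named in the cmake calls above.
ls build-visionos build-visionos-sim
```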

ci/README.md

Lines changed: 39 additions & 0 deletions
@@ -26,4 +26,43 @@ GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 # with SYCL support
 source /opt/intel/oneapi/setvars.sh
 GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
+
+# with MUSA support
+GG_BUILD_MUSA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
+```
+
+## Running MUSA CI in a Docker Container
+
+Assuming `$PWD` is the root of the `llama.cpp` repository, follow these steps to set up and run MUSA CI in a Docker container:
+
+### 1. Create a local directory to store cached models, configuration files and venv:
+
+```bash
+mkdir -p $HOME/llama.cpp/ci-cache
+```
+
+### 2. Create a local directory to store CI run results:
+
+```bash
+mkdir -p $HOME/llama.cpp/ci-results
+```
+
+### 3. Start a Docker container and run the CI:
+
+```bash
+docker run --privileged -it \
+    -v $HOME/llama.cpp/ci-cache:/ci-cache \
+    -v $HOME/llama.cpp/ci-results:/ci-results \
+    -v $PWD:/ws -w /ws \
+    mthreads/musa:rc3.1.1-devel-ubuntu22.04
 ```
+
+Inside the container, execute the following commands:
+
+```bash
+apt update -y && apt install -y bc cmake git python3.10-venv time unzip wget
+git config --global --add safe.directory /ws
+GG_BUILD_MUSA=1 bash ./ci/run.sh /ci-results /ci-cache
+```
+
+This setup ensures that the CI runs within an isolated Docker environment while maintaining cached files and results across runs.

ci/run.sh

Lines changed: 24 additions & 6 deletions
@@ -16,6 +16,9 @@
 # # with VULKAN support
 # GG_BUILD_VULKAN=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 #
+# # with MUSA support
+# GG_BUILD_MUSA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
+#
 
 if [ -z "$2" ]; then
     echo "usage: $0 <output-dir> <mnt-dir>"

@@ -52,13 +55,22 @@ if [ ! -z ${GG_BUILD_SYCL} ]; then
         echo "source /opt/intel/oneapi/setvars.sh"
         exit 1
     fi
-
+    # Use only main GPU
+    export ONEAPI_DEVICE_SELECTOR="level_zero:0"
+    # Enable sysman for correct memory reporting
+    export ZES_ENABLE_SYSMAN=1
     CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=1 -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON"
 fi
 
 if [ ! -z ${GG_BUILD_VULKAN} ]; then
     CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_VULKAN=1"
 fi
+
+if [ ! -z ${GG_BUILD_MUSA} ]; then
+    # Use qy1 by default (MTT S80)
+    MUSA_ARCH=${MUSA_ARCH:-21}
+    CMAKE_EXTRA="-DGGML_MUSA=ON -DMUSA_ARCHITECTURES=${MUSA_ARCH}"
+fi
 
 ## helpers
 
 # download a file if it does not exist or if it is outdated

@@ -808,7 +820,7 @@ export LLAMA_LOG_PREFIX=1
 export LLAMA_LOG_TIMESTAMPS=1
 
 if [ -z ${GG_BUILD_LOW_PERF} ]; then
-    # Create symlink: ./llama.cpp/models-mnt -> $MNT/models/models-mnt
+    # Create symlink: ./llama.cpp/models-mnt -> $MNT/models
     rm -rf ${SRC}/models-mnt
     mnt_models=${MNT}/models
     mkdir -p ${mnt_models}

@@ -826,16 +838,20 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
 fi
 
 ret=0
-
-test $ret -eq 0 && gg_run ctest_debug
+if [ -z ${GG_BUILD_SYCL} ]; then
+    # SYCL build breaks with debug build flags
+    test $ret -eq 0 && gg_run ctest_debug
+fi
 test $ret -eq 0 && gg_run ctest_release
 
 if [ -z ${GG_BUILD_LOW_PERF} ]; then
     test $ret -eq 0 && gg_run embd_bge_small
     test $ret -eq 0 && gg_run rerank_tiny
 
     if [ -z ${GG_BUILD_CLOUD} ] || [ ${GG_BUILD_EXTRA_TESTS_0} ]; then
-        test $ret -eq 0 && gg_run test_scripts_debug
+        if [ -z ${GG_BUILD_SYCL} ]; then
+            test $ret -eq 0 && gg_run test_scripts_debug
+        fi
         test $ret -eq 0 && gg_run test_scripts_release
     fi
 

@@ -846,7 +862,9 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
         test $ret -eq 0 && gg_run pythia_2_8b
         #test $ret -eq 0 && gg_run open_llama_7b_v2
     fi
-    test $ret -eq 0 && gg_run ctest_with_model_debug
+    if [ -z ${GG_BUILD_SYCL} ]; then
+        test $ret -eq 0 && gg_run ctest_with_model_debug
+    fi
     test $ret -eq 0 && gg_run ctest_with_model_release
 fi
 fi
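The MUSA branch defaults `MUSA_ARCH` to 21 (qy1, MTT S80) and forwards it to `-DMUSA_ARCHITECTURES`. Because the default is set with `${MUSA_ARCH:-21}`, it can be overridden from the environment; in the sketch below, the value `22` is only an illustrative alternative, not something prescribed by this commit:

```bash
# Default: build and run the CI for MUSA_ARCH=21 (qy1 / MTT S80)
GG_BUILD_MUSA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt

# Override the target architecture via the environment (illustrative value)
MUSA_ARCH=22 GG_BUILD_MUSA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
```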

cmake/common.cmake

Lines changed: 2 additions & 0 deletions
@@ -1,3 +1,5 @@
+include("ggml/cmake/common.cmake")
+
 function(llama_add_compile_flags)
     if (LLAMA_FATAL_WARNINGS)
         if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")

common/arg.cpp

Lines changed: 20 additions & 2 deletions
@@ -764,7 +764,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_env("LLAMA_ARG_CTX_SIZE"));
     add_opt(common_arg(
         {"-n", "--predict", "--n-predict"}, "N",
-        string_format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict),
+        string_format(
+            ex == LLAMA_EXAMPLE_MAIN || ex == LLAMA_EXAMPLE_INFILL
+                ? "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)"
+                : "number of tokens to predict (default: %d, -1 = infinity)",
+            params.n_predict),
         [](common_params & params, int value) {
             params.n_predict = value;
         }

@@ -849,6 +853,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             }
         }
     ).set_excludes({LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"-sysf", "--system-prompt-file"}, "FNAME",
+        "a file containing the system prompt (default: none)",
+        [](common_params & params, const std::string & value) {
+            std::ifstream file(value);
+            if (!file) {
+                throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
+            }
+            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.system_prompt));
+            if (!params.system_prompt.empty() && params.system_prompt.back() == '\n') {
+                params.system_prompt.pop_back();
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(common_arg(
         {"--in-file"}, "FNAME",
         "an input file (repeat to specify multiple files)",

@@ -1871,7 +1889,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.out_file = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA}));
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS}));
     add_opt(common_arg(
         {"-ofreq", "--output-frequency"}, "N",
         string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
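The new `-sysf` / `--system-prompt-file` argument reads the system prompt from a file, strips a single trailing newline, and is registered only for `LLAMA_EXAMPLE_MAIN`. A hedged usage sketch; the `llama-cli` binary name and the model path are assumptions for illustration, not part of this diff:

```bash
# Put the system prompt in a file and pass it with the new flag.
printf 'You are a concise assistant.\n' > system.txt

# Assumed invocation of the main example; the -m path is illustrative.
./llama-cli -m models/model.gguf -sysf system.txt -p "Summarize the KV cache in one sentence."
```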

common/common.cpp

Lines changed: 6 additions & 3 deletions
@@ -955,8 +955,8 @@ struct common_init_result common_init_from_params(common_params & params) {
         return iparams;
     }
 
-    if (params.ctx_shift && !llama_kv_cache_can_shift(lctx)) {
-        LOG_WRN("%s: KV cache shifting is not supported for this model, disabling KV cache shifting\n", __func__);
+    if (params.ctx_shift && !llama_kv_self_can_shift(lctx)) {
+        LOG_WRN("%s: KV cache shifting is not supported for this context, disabling KV cache shifting\n", __func__);
         params.ctx_shift = false;
     }
 

@@ -1033,6 +1033,8 @@ struct common_init_result common_init_from_params(common_params & params) {
     if (params.warmup) {
         LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
 
+        llama_set_warmup(lctx, true);
+
         std::vector<llama_token> tmp;
         llama_token bos = llama_vocab_bos(vocab);
         llama_token eos = llama_vocab_eos(vocab);

@@ -1060,9 +1062,10 @@ struct common_init_result common_init_from_params(common_params & params) {
         if (llama_model_has_decoder(model)) {
             llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
         }
-        llama_kv_cache_clear(lctx);
+        llama_kv_self_clear(lctx);
         llama_synchronize(lctx);
         llama_perf_context_reset(lctx);
+        llama_set_warmup(lctx, false);
     }
 
     iparams.model.reset(model);

common/speculative.cpp

Lines changed: 4 additions & 4 deletions
@@ -173,7 +173,7 @@ llama_tokens common_speculative_gen_draft(
     result.reserve(params.n_draft);
 
     if (reuse_n == 0) {
-        llama_kv_cache_clear(ctx);
+        llama_kv_self_clear(ctx);
 
         prompt.clear();
     } else {

@@ -192,14 +192,14 @@ llama_tokens common_speculative_gen_draft(
     }
 
     if (reuse_i > 0) {
-        llama_kv_cache_seq_rm (ctx, 0, 0, reuse_i);
-        llama_kv_cache_seq_add(ctx, 0, reuse_i, -1, -reuse_i);
+        llama_kv_self_seq_rm (ctx, 0, 0, reuse_i);
+        llama_kv_self_seq_add(ctx, 0, reuse_i, -1, -reuse_i);
 
         prompt.erase(prompt.begin(), prompt.begin() + reuse_i);
     }
 
     if (reuse_n < (int) prompt.size()) {
-        llama_kv_cache_seq_rm (ctx, 0, reuse_n, -1);
+        llama_kv_self_seq_rm (ctx, 0, reuse_n, -1);
 
         prompt.erase(prompt.begin() + reuse_n, prompt.end());
     }
