
Commit 08e90ac

Merge branch 'ggml-org:master' into master
2 parents: 52bead0 + 19e5943


71 files changed: 3779 additions, 587 deletions

.github/workflows/build.yml

Lines changed: 129 additions & 0 deletions
@@ -135,6 +135,69 @@ jobs:
           cd build
           ctest -L main --verbose --timeout 900
 
+  macOS-latest-cmake-arm64-webgpu:
+    runs-on: macos-14
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: ccache
+        uses: hendrikmuhs/[email protected]
+        with:
+          key: macOS-latest-cmake-arm64-webgpu
+          evict-old-files: 1d
+
+      - name: Dependencies
+        id: depends
+        continue-on-error: true
+        run: |
+          brew update
+          brew install curl
+
+      - name: Dawn Dependency
+        id: dawn-depends
+        run: |
+          ARTIFACTS_JSON=$(curl -s -L \
+            -H "Accept: application/vnd.github+json" \
+            -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
+            -H "X-GitHub-Api-Version: 2022-11-28" \
+            "https://api.github.com/repos/google/dawn/actions/artifacts")
+          echo "Finding latest macos-latest-Release artifact..."
+          DOWNLOAD_URL=$(echo "$ARTIFACTS_JSON" | jq -r '.artifacts
+            | sort_by(.created_at)
+            | reverse
+            | map(select(.name | test("macos-latest-Release$")))
+            | .[0].archive_download_url')
+          if [ "$DOWNLOAD_URL" = "null" ] || [ -z "$DOWNLOAD_URL" ]; then
+            echo "No suitable Dawn artifact found!"
+            exit 1
+          fi
+          echo "Downloading from: $DOWNLOAD_URL"
+          curl -L \
+            -H "Accept: application/vnd.github+json" \
+            -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
+            -o artifact.zip "$DOWNLOAD_URL"
+          unzip artifact.zip
+          mkdir dawn
+          tar_file=$(find . -name '*.tar.gz' | head -n 1)
+          echo "Extracting: $tar_file"
+          tar -xvf "$tar_file" -C dawn --strip-components=1
+
+      - name: Build
+        id: cmake_build
+        run: |
+          export CMAKE_PREFIX_PATH=dawn
+          cmake -B build -DGGML_WEBGPU=ON -DGGML_METAL=OFF -DGGML_BLAS=OFF
+          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
+
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          ctest -L main --verbose --timeout 900
+
   ubuntu-cpu-cmake:
     strategy:
       matrix:
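The Dawn Dependency step's jq filter does the heavy lifting: sort all artifacts by created_at, newest first, keep only names ending in macos-latest-Release, and take the first match's download URL. A standalone sketch of that logic against made-up sample data (the names and URLs here are illustrative only, not real Dawn artifacts):

```bash
echo '{"artifacts":[
  {"name":"old-macos-latest-Release","created_at":"2024-01-01T00:00:00Z","archive_download_url":"https://example.com/old"},
  {"name":"macos-latest-Release","created_at":"2024-06-01T00:00:00Z","archive_download_url":"https://example.com/new"},
  {"name":"ubuntu-latest-Release","created_at":"2024-06-02T00:00:00Z","archive_download_url":"https://example.com/linux"}
]}' | jq -r '.artifacts
  | sort_by(.created_at)
  | reverse
  | map(select(.name | test("macos-latest-Release$")))
  | .[0].archive_download_url'
# prints: https://example.com/new  (the newest artifact whose name ends in macos-latest-Release)
```

ISO-8601 timestamps sort correctly as plain strings, which is why sort_by(.created_at) needs no date parsing.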
@@ -344,6 +407,72 @@ jobs:
           # This is using llvmpipe and runs slower than other backends
           ctest -L main --verbose --timeout 4200
 
+  ubuntu-22-cmake-webgpu:
+    runs-on: ubuntu-22.04
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: ccache
+        uses: hendrikmuhs/[email protected]
+        with:
+          key: ubuntu-22-cmake-webgpu
+          evict-old-files: 1d
+
+      - name: Vulkan SDK Dependencies
+        id: vulkan-depends
+        run: |
+          wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add -
+          sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
+          sudo apt-get update -y
+          sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk libcurl4-openssl-dev
+
+      - name: Dawn Dependency
+        id: dawn-depends
+        run: |
+          sudo apt-get install -y libxrandr-dev libxinerama-dev libxcursor-dev mesa-common-dev libx11-xcb-dev libxi-dev
+          ARTIFACTS_JSON=$(curl -s -L \
+            -H "Accept: application/vnd.github+json" \
+            -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
+            -H "X-GitHub-Api-Version: 2022-11-28" \
+            "https://api.github.com/repos/google/dawn/actions/artifacts")
+          echo "Finding latest ubuntu-latest-Release artifact..."
+          DOWNLOAD_URL=$(echo "$ARTIFACTS_JSON" | jq -r '.artifacts
+            | sort_by(.created_at)
+            | reverse
+            | map(select(.name | test("ubuntu-latest-Release$")))
+            | .[0].archive_download_url')
+          if [ "$DOWNLOAD_URL" = "null" ] || [ -z "$DOWNLOAD_URL" ]; then
+            echo "No suitable Dawn artifact found!"
+            exit 1
+          fi
+          echo "Downloading from: $DOWNLOAD_URL"
+          curl -L \
+            -H "Accept: application/vnd.github+json" \
+            -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
+            -o artifact.zip "$DOWNLOAD_URL"
+          unzip artifact.zip
+          mkdir dawn
+          tar_file=$(find . -name '*.tar.gz' | head -n 1)
+          echo "Extracting: $tar_file"
+          tar -xvf "$tar_file" -C dawn --strip-components=1
+
+      - name: Build
+        id: cmake_build
+        run: |
+          export Dawn_DIR=dawn/lib64/cmake/Dawn
+          cmake -B build -DGGML_WEBGPU=ON
+          cmake --build build --config Release -j $(nproc)
+
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          # This is using llvmpipe and runs slower than other backends
+          ctest -L main --verbose --timeout 3600
+
   ubuntu-22-cmake-hip:
     runs-on: ubuntu-22.04
     container: rocm/dev-ubuntu-22.04:6.0.2
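For local experimentation outside CI, the two new jobs condense to a few commands. This is a sketch, not part of the commit: it assumes a Dawn build tree has already been unpacked into ./dawn exactly as the Dawn Dependency steps above do, and it reuses the workflow's own CMake invocations.

```bash
# Minimal local rebuild of the WebGPU backend, mirroring the jobs above.
# Prerequisite: Dawn unpacked into ./dawn (see the Dawn Dependency step).
export Dawn_DIR=dawn/lib64/cmake/Dawn   # Linux artifact layout
# export CMAKE_PREFIX_PATH=dawn         # macOS variant used by the first job
cmake -B build -DGGML_WEBGPU=ON
cmake --build build --config Release -j "$(nproc)"
cd build && ctest -L main --verbose --timeout 3600
```

Note the two jobs locate Dawn differently: the Linux job points Dawn_DIR at the package's CMake config directory, while the macOS job adds the unpacked tree to CMAKE_PREFIX_PATH.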

README.md

Lines changed: 2 additions & 0 deletions
@@ -269,6 +269,8 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 | [Vulkan](docs/build.md#vulkan) | GPU |
 | [CANN](docs/build.md#cann) | Ascend NPU |
 | [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |
+| [WebGPU [In Progress]](docs/build.md#webgpu) | All |
+
 | [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All |
 
 ## Obtaining and quantizing models

ci/run.sh

Lines changed: 7 additions & 0 deletions
@@ -16,6 +16,9 @@
 # # with VULKAN support
 # GG_BUILD_VULKAN=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 #
+# # with WebGPU support
+# GG_BUILD_WEBGPU=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
+#
 # # with MUSA support
 # GG_BUILD_MUSA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 #
@@ -81,6 +84,10 @@ if [ ! -z ${GG_BUILD_VULKAN} ]; then
     CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_VULKAN=1"
 fi
 
+if [ ! -z ${GG_BUILD_WEBGPU} ]; then
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_WEBGPU=1"
+fi
+
 if [ ! -z ${GG_BUILD_MUSA} ]; then
     # Use qy1 by default (MTT S80)
     MUSA_ARCH=${MUSA_ARCH:-21}
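Because the new block only appends -DGGML_WEBGPU=1 to CMAKE_EXTRA, GG_BUILD_WEBGPU composes with the script's other GG_BUILD_* toggles. A usage sketch, using the output and mount directories from the script's own header comment:

```bash
# Run the llama.cpp CI script locally with the WebGPU backend enabled.
mkdir -p tmp/results tmp/mnt
GG_BUILD_WEBGPU=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
```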

common/arg.cpp

Lines changed: 37 additions & 0 deletions
@@ -1464,6 +1464,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.swa_full = true;
         }
     ).set_env("LLAMA_ARG_SWA_FULL"));
+    add_opt(common_arg(
+        {"--kv-unified", "-kvu"},
+        string_format("use single unified KV buffer for the KV cache of all sequences (default: %s)\n"
+            "[(more info)](https://github.com/ggml-org/llama.cpp/pull/14363)", params.kv_unified ? "true" : "false"),
+        [](common_params & params) {
+            params.kv_unified = true;
+        }
+    ).set_env("LLAMA_ARG_KV_SPLIT"));
    add_opt(common_arg(
        {"--no-context-shift"},
        string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
@@ -3423,5 +3431,34 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}));
 
+    // diffusion parameters
+    add_opt(common_arg(
+        { "--diffusion-steps" }, "N",
+        string_format("number of diffusion steps (default: %d)", params.diffusion.steps),
+        [](common_params & params, int value) { params.diffusion.steps = value; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        { "--diffusion-eps" }, "F",
+        string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion.eps),
+        [](common_params & params, const std::string & value) { params.diffusion.eps = std::stof(value); }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        { "--diffusion-algorithm" }, "N",
+        string_format("diffusion algorithm: 0=ORIGIN, 1=MASKGIT_PLUS, 2=TOPK_MARGIN, 3=ENTROPY (default: %d)",
+                      params.diffusion.algorithm),
+        [](common_params & params, int value) { params.diffusion.algorithm = value; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        { "--diffusion-alg-temp" }, "F",
+        string_format("algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
+        [](common_params & params, const std::string & value) { params.diffusion.alg_temp = std::stof(value); }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        { "--diffusion-visual" },
+        string_format("enable visual diffusion mode (show progressive generation) (default: %s)",
+                      params.diffusion.visual_mode ? "true" : "false"),
+        [](common_params & params) { params.diffusion.visual_mode = true; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+
     return ctx_arg;
 }
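All five options are gated to LLAMA_EXAMPLE_DIFFUSION, i.e. the diffusion example added elsewhere in this commit (not shown in this excerpt). A hypothetical invocation follows; the binary name llama-diffusion-cli and the model file are assumptions made for illustration, not names confirmed by this diff:

```bash
# Hypothetical: binary and model names are placeholders.
llama-diffusion-cli -m ./models/diffusion-model.gguf \
    -p "Write a haiku about autumn" \
    --diffusion-steps 128 \
    --diffusion-algorithm 3 \
    --diffusion-alg-temp 0.2 \
    --diffusion-visual
```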

common/common.cpp

Lines changed: 13 additions & 6 deletions
@@ -1005,15 +1005,21 @@ struct common_init_result common_init_from_params(common_params & params) {
         params.sampling.ignore_eos = false;
     }
 
-    if (params.sampling.ignore_eos) {
-        for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
-            if (llama_vocab_is_eog(vocab, i)) {
-                LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
-                params.sampling.logit_bias.push_back({i, -INFINITY});
-            }
+    // initialize once
+    for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
+        if (llama_vocab_is_eog(vocab, i)) {
+            LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
+            params.sampling.logit_bias_eog.push_back({i, -INFINITY});
         }
     }
 
+    if (params.sampling.ignore_eos) {
+        // add EOG biases to the active set of logit biases
+        params.sampling.logit_bias.insert(
+                params.sampling.logit_bias.end(),
+                params.sampling.logit_bias_eog.begin(), params.sampling.logit_bias_eog.end());
+    }
+
     if (params.sampling.penalty_last_n == -1) {
         LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
         params.sampling.penalty_last_n = llama_n_ctx(lctx);
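The EOG logit biases are now collected once into logit_bias_eog unconditionally; ignore_eos merely appends that pre-computed list to the active logit_bias set. From the command line the path is exercised by the existing --ignore-eos flag, e.g.:

```bash
# --ignore-eos biases all end-of-generation tokens to -inf, so generation
# runs until the -n/--predict budget is spent; the model path is a placeholder.
llama-cli -m ./models/model.gguf -p "Once upon a time" -n 128 --ignore-eos
```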
@@ -1157,6 +1163,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.no_perf = params.no_perf;
     cparams.op_offload = !params.no_op_offload;
     cparams.swa_full = params.swa_full;
+    cparams.kv_unified = params.kv_unified;
 
     cparams.type_k = params.cache_type_k;
     cparams.type_v = params.cache_type_v;

common/common.h

Lines changed: 13 additions & 1 deletion
@@ -81,6 +81,7 @@ enum llama_example {
     LLAMA_EXAMPLE_LOOKUP,
     LLAMA_EXAMPLE_PARALLEL,
     LLAMA_EXAMPLE_TTS,
+    LLAMA_EXAMPLE_DIFFUSION,
 
     LLAMA_EXAMPLE_COUNT,
 };
@@ -177,7 +178,8 @@ struct common_params_sampling {
     std::vector<common_grammar_trigger> grammar_triggers; // optional triggers (for lazy grammars)
     std::set<llama_token> preserved_tokens;
 
-    std::vector<llama_logit_bias> logit_bias; // logit biases to apply
+    std::vector<llama_logit_bias> logit_bias;     // logit biases to apply
+    std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens
 
     // print the parameters into a string
     std::string print() const;
@@ -217,6 +219,14 @@ struct common_params_vocoder {
     bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy // NOLINT
 };
 
+struct common_params_diffusion {
+    int32_t steps = 64;       // number of diffusion steps
+    float eps = 1e-3f;        // epsilon for timesteps
+    int32_t algorithm = 0;    // diffusion algorithm (0=ORIGIN, 1=MASKGIT_PLUS, 2=TOPK_MARGIN, 3=ENTROPY)
+    float alg_temp = 0.0f;    // algorithm temperature
+    bool visual_mode = false; // show progressive diffusion on screen
+};
+
 enum common_reasoning_format {
     COMMON_REASONING_FORMAT_NONE,
     COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
@@ -268,6 +278,7 @@ struct common_params {
     struct common_params_sampling sampling;
     struct common_params_speculative speculative;
     struct common_params_vocoder vocoder;
+    struct common_params_diffusion diffusion;
 
     struct common_params_model model;
 
@@ -330,6 +341,7 @@ struct common_params {
     bool no_perf = false; // disable performance metrics
     bool ctx_shift = true; // context shift on inifinite text generation
     bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
+    bool kv_unified = false; // enable unified KV cache
 
     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
     bool use_mmap = true; // use mmap for faster loads
