
Commit ee6326a

Merge branch 'master' into dev-refactoring

2 parents 379bdeb + cd6983d

File tree: 148 files changed (+7110 / -1418 lines)


.github/workflows/build.yml (16 additions, 48 deletions)
```diff
@@ -159,31 +159,15 @@ jobs:
       - name: Dawn Dependency
         id: dawn-depends
         run: |
-          ARTIFACTS_JSON=$(curl -s -L \
-            -H "Accept: application/vnd.github+json" \
-            -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
-            -H "X-GitHub-Api-Version: 2022-11-28" \
-            "https://api.github.com/repos/google/dawn/actions/artifacts")
-          echo "Finding latest macos-latest-Release artifact..."
-          DOWNLOAD_URL=$(echo "$ARTIFACTS_JSON" | jq -r '.artifacts
-            | sort_by(.created_at)
-            | reverse
-            | map(select(.name | test("macos-latest-Release$")))
-            | .[0].archive_download_url')
-          if [ "$DOWNLOAD_URL" = "null" ] || [ -z "$DOWNLOAD_URL" ]; then
-            echo "No suitable Dawn artifact found!"
-            exit 1
-          fi
-          echo "Downloading from: $DOWNLOAD_URL"
-          curl -L \
-            -H "Accept: application/vnd.github+json" \
-            -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
-            -o artifact.zip "$DOWNLOAD_URL"
-          unzip artifact.zip
+          DAWN_VERSION="v1.0.0"
+          DAWN_OWNER="reeselevine"
+          DAWN_REPO="dawn"
+          DAWN_ASSET_NAME="Dawn-a1a6b45cced25a3b7f4fb491e0ae70796cc7f22b-macos-latest-Release.tar.gz"
+          echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
+          curl -L -o artifact.tar.gz \
+            "https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
           mkdir dawn
-          tar_file=$(find . -name '*.tar.gz' | head -n 1)
-          echo "Extracting: $tar_file"
-          tar -xvf "$tar_file" -C dawn --strip-components=1
+          tar -xvf artifact.tar.gz -C dawn --strip-components=1

       - name: Build
         id: cmake_build
@@ -433,31 +417,15 @@ jobs:
         id: dawn-depends
         run: |
           sudo apt-get install -y libxrandr-dev libxinerama-dev libxcursor-dev mesa-common-dev libx11-xcb-dev libxi-dev
-          ARTIFACTS_JSON=$(curl -s -L \
-            -H "Accept: application/vnd.github+json" \
-            -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
-            -H "X-GitHub-Api-Version: 2022-11-28" \
-            "https://api.github.com/repos/google/dawn/actions/artifacts")
-          echo "Finding latest ubuntu-latest-Release artifact..."
-          DOWNLOAD_URL=$(echo "$ARTIFACTS_JSON" | jq -r '.artifacts
-            | sort_by(.created_at)
-            | reverse
-            | map(select(.name | test("ubuntu-latest-Release$")))
-            | .[0].archive_download_url')
-          if [ "$DOWNLOAD_URL" = "null" ] || [ -z "$DOWNLOAD_URL" ]; then
-            echo "No suitable Dawn artifact found!"
-            exit 1
-          fi
-          echo "Downloading from: $DOWNLOAD_URL"
-          curl -L \
-            -H "Accept: application/vnd.github+json" \
-            -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
-            -o artifact.zip "$DOWNLOAD_URL"
-          unzip artifact.zip
+          DAWN_VERSION="v1.0.0"
+          DAWN_OWNER="reeselevine"
+          DAWN_REPO="dawn"
+          DAWN_ASSET_NAME="Dawn-a1a6b45cced25a3b7f4fb491e0ae70796cc7f22b-ubuntu-latest-Release.tar.gz"
+          echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
+          curl -L -o artifact.tar.gz \
+            "https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
           mkdir dawn
-          tar_file=$(find . -name '*.tar.gz' | head -n 1)
-          echo "Extracting: $tar_file"
-          tar -xvf "$tar_file" -C dawn --strip-components=1
+          tar -xvf artifact.tar.gz -C dawn --strip-components=1

       - name: Build
         id: cmake_build
```
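Both jobs now download a release asset pinned by owner, tag, and file name instead of querying the GitHub artifacts API for the newest matching build, which drops the `jq` parsing, the `GITHUB_TOKEN`, and the not-found fallback, and makes the CI input reproducible. For illustration only, a minimal C++/libcurl sketch of the same pinned-URL download; the workflow itself uses plain `curl`, and the URL components are copied from the diff:

```cpp
#include <cstdio>
#include <string>
#include <curl/curl.h>

// Compose the pinned release-asset URL the same way the workflow does.
static std::string dawn_asset_url() {
    const std::string owner   = "reeselevine";
    const std::string repo    = "dawn";
    const std::string version = "v1.0.0";
    const std::string asset   =
        "Dawn-a1a6b45cced25a3b7f4fb491e0ae70796cc7f22b-macos-latest-Release.tar.gz";
    return "https://github.com/" + owner + "/" + repo +
           "/releases/download/" + version + "/" + asset;
}

// Stream the response body straight into a file.
static size_t write_cb(char * data, size_t size, size_t nmemb, void * userp) {
    return fwrite(data, size, nmemb, static_cast<FILE *>(userp));
}

int main() {
    FILE * out  = fopen("artifact.tar.gz", "wb");
    CURL * curl = curl_easy_init();                      // error handling elided
    curl_easy_setopt(curl, CURLOPT_URL, dawn_asset_url().c_str());
    curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);  // GitHub redirects to a CDN
    curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_cb);
    curl_easy_setopt(curl, CURLOPT_WRITEDATA, out);
    const CURLcode res = curl_easy_perform(curl);
    curl_easy_cleanup(curl);
    fclose(out);
    return res == CURLE_OK ? 0 : 1;
}
```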
New workflow file (45 additions, 0 deletions)

```diff
@@ -0,0 +1,45 @@
+name: Check Pre-Tokenizer Hashes
+
+on:
+  push:
+    paths:
+      - 'convert_hf_to_gguf.py'
+      - 'convert_hf_to_gguf_update.py'
+  pull_request:
+    paths:
+      - 'convert_hf_to_gguf.py'
+      - 'convert_hf_to_gguf_update.py'
+
+jobs:
+  pre-tokenizer-hashes:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      - name: Install Python dependencies
+        run: |
+          python3 -m venv .venv
+          .venv/bin/pip install -r requirements/requirements-convert_hf_to_gguf_update.txt
+
+      - name: Update pre-tokenizer hashes
+        run: |
+          cp convert_hf_to_gguf.py /tmp
+          .venv/bin/python convert_hf_to_gguf_update.py --check-missing
+
+      - name: Check if committed pre-tokenizer hashes matches generated version
+        run: |
+          if ! diff -q convert_hf_to_gguf.py /tmp/convert_hf_to_gguf.py; then
+            echo "Model pre-tokenizer hashes (in convert_hf_to_gguf.py) do not match generated hashes (from convert_hf_to_gguf_update.py)."
+            echo "To fix: run ./convert_hf_to_gguf_update.py and commit the updated convert_hf_to_gguf.py along with your changes"
+            echo "Differences found:"
+            diff convert_hf_to_gguf.py /tmp/convert_hf_to_gguf.py || true
+            exit 1
+          fi
+          echo "Model pre-tokenizer hashes are up to date."
```

README.md (1 addition, 0 deletions)
```diff
@@ -17,6 +17,7 @@ LLM inference in C/C++
 
 ## Hot topics
 
+- Support for the `gpt-oss` model with native MXFP4 format has been added | [PR](https://github.com/ggml-org/llama.cpp/pull/15091) | [Collaboration with NVIDIA](https://blogs.nvidia.com/blog/rtx-ai-garage-openai-oss) | [Comment](https://github.com/ggml-org/llama.cpp/discussions/15095)
 - Hot PRs: [All](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Apr+label%3Ahot+) | [Open](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Apr+label%3Ahot+is%3Aopen)
 - Multimodal support arrived in `llama-server`: [#12898](https://github.com/ggml-org/llama.cpp/pull/12898) | [documentation](./docs/multimodal.md)
 - VS Code extension for FIM completions: https://github.com/ggml-org/llama.vscode
```

common/arg.cpp (34 additions, 8 deletions)
```diff
@@ -24,6 +24,7 @@
 #include <cstdarg>
 #include <filesystem>
 #include <fstream>
+#include <list>
 #include <regex>
 #include <set>
 #include <string>
```
```diff
@@ -2375,20 +2376,35 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                 }
                 throw std::invalid_argument("unknown buffer type");
             }
-            // FIXME: this leaks memory
-            params.tensor_buft_overrides.push_back({strdup(tensor_name.c_str()), buft_list.at(buffer_type)});
+            // keep strings alive and avoid leaking memory by storing them in a static vector
+            static std::list<std::string> buft_overrides;
+            buft_overrides.push_back(tensor_name);
+            params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), buft_list.at(buffer_type)});
         }
     }
 ));
 add_opt(common_arg(
-    {"--cpu-moe"},
-    "use CPU for Mixture of Experts (MoE) weights",
+    {"--cpu-moe", "-cmoe"},
+    "keep all Mixture of Experts (MoE) weights in the CPU",
     [](common_params & params) {
-        params.tensor_buft_overrides.push_back({"\\.ffn_up_exps\\.weight$", ggml_backend_cpu_buffer_type()});
-        params.tensor_buft_overrides.push_back({"\\.ffn_down_exps\\.weight$", ggml_backend_cpu_buffer_type()});
-        params.tensor_buft_overrides.push_back({"\\.ffn_gate_exps\\.weight$", ggml_backend_cpu_buffer_type()});
+        params.tensor_buft_overrides.push_back({"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()});
     }
 ).set_env("LLAMA_ARG_CPU_MOE"));
+add_opt(common_arg(
+    {"--n-cpu-moe", "-ncmoe"}, "N",
+    "keep the Mixture of Experts (MoE) weights of the first N layers in the CPU",
+    [](common_params & params, int value) {
+        if (value < 0) {
+            throw std::invalid_argument("invalid value");
+        }
+        for (int i = 0; i < value; ++i) {
+            // keep strings alive and avoid leaking memory by storing them in a static vector
+            static std::list<std::string> buft_overrides;
+            buft_overrides.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i));
+            params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), ggml_backend_cpu_buffer_type()});
+        }
+    }
+).set_env("LLAMA_ARG_N_CPU_MOE"));
 add_opt(common_arg(
     {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
     "number of layers to store in VRAM",
```
```diff
@@ -2647,6 +2663,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.n_out_freq = value;
         }
     ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
+    add_opt(common_arg(
+        {"--output-format"}, "{gguf,dat}",
+        string_format("output format for imatrix file (default: %s)", params.imat_dat > 0 ? "dat" : "gguf"),
+        [](common_params & params, const std::string & value) {
+            /**/ if (value == "gguf") { params.imat_dat = -1; }
+            else if (value == "dat") { params.imat_dat = 1; }
+            else { throw std::invalid_argument("invalid output format"); }
+        }
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
     add_opt(common_arg(
         {"--save-frequency"}, "N",
         string_format("save an imatrix copy every N iterations (default: %d)", params.n_save_freq),
```
```diff
@@ -2922,11 +2947,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
         "- none: leaves thoughts unparsed in `message.content`\n"
         "- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)\n"
-        "(default: deepseek)",
+        "(default: auto)",
         [](common_params & params, const std::string & value) {
             /**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; }
             else if (value == "deepseek-legacy") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY; }
             else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; }
+            else if (value == "auto") { params.reasoning_format = COMMON_REASONING_FORMAT_AUTO; }
             else { throw std::invalid_argument("invalid value"); }
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
```

common/chat-parser.cpp (9 additions, 1 deletion)
```diff
@@ -55,7 +55,15 @@ bool common_chat_msg_parser::add_tool_call(const std::string & name, const std::
 bool common_chat_msg_parser::add_tool_call(const json & tool_call) {
     std::string name = tool_call.contains("name") ? tool_call.at("name") : "";
     std::string id = tool_call.contains("id") ? tool_call.at("id") : "";
-    std::string arguments = tool_call.contains("arguments") ? tool_call.at("arguments") : "";
+    std::string arguments = "";
+    if (tool_call.contains("arguments")) {
+        if (tool_call.at("arguments").is_object()) {
+            arguments = tool_call.at("arguments").dump();
+        } else {
+            arguments = tool_call.at("arguments");
+        }
+    }
+
     return add_tool_call(name, id, arguments);
 }
```
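The chat-parser change handles tool calls whose `arguments` field arrives either as a JSON object or as an already-serialized string; previously the object case would hit nlohmann's string conversion and throw a type error. A standalone sketch of the fixed extraction using nlohmann/json (which llama.cpp bundles); the `get_weather` payloads are made-up examples:

```cpp
#include <iostream>
#include <string>
#include <nlohmann/json.hpp>

using json = nlohmann::json;

// Mirrors the parser fix: accept "arguments" as an object or a string.
static std::string extract_arguments(const json & tool_call) {
    std::string arguments = "";
    if (tool_call.contains("arguments")) {
        if (tool_call.at("arguments").is_object()) {
            arguments = tool_call.at("arguments").dump();  // serialize object form
        } else {
            arguments = tool_call.at("arguments");         // already a string
        }
    }
    return arguments;
}

int main() {
    json as_object = {{"name", "get_weather"}, {"arguments", {{"city", "Paris"}}}};
    json as_string = {{"name", "get_weather"}, {"arguments", "{\"city\":\"Paris\"}"}};
    std::cout << extract_arguments(as_object) << "\n";  // {"city":"Paris"}
    std::cout << extract_arguments(as_string) << "\n";  // {"city":"Paris"}
}
```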