
Commit adbff66

Merge branch 'master' into imatrix

2 parents: c39c4e2 + 5aa1105

File tree

143 files changed: +75245 −32778 lines


.devops/cann.Dockerfile

Lines changed: 130 additions & 0 deletions

@@ -0,0 +1,130 @@
+# ==============================================================================
+# ARGUMENTS
+# ==============================================================================
+
+# Define the CANN base image for easier version updates later
+ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.1.rc1-910b-openeuler22.03-py3.10
+
+# ==============================================================================
+# BUILD STAGE
+# Compile all binary files and libraries
+# ==============================================================================
+FROM ${CANN_BASE_IMAGE} AS build
+
+# Define the Ascend chip model for compilation. Default is Ascend910B3
+ARG ASCEND_SOC_TYPE=Ascend910B3
+
+# -- Install build dependencies --
+RUN yum install -y gcc g++ cmake make git libcurl-devel python3 python3-pip && \
+    yum clean all && \
+    rm -rf /var/cache/yum
+
+# -- Set the working directory --
+WORKDIR /app
+
+# -- Copy project files --
+COPY . .
+
+# -- Set CANN environment variables (required for compilation) --
+# Using ENV instead of `source` allows environment variables to persist across the entire image layer
+ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
+ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${LD_LIBRARY_PATH}
+ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${PATH}
+ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
+ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
+# ... You can add other environment variables from the original file as needed ...
+# For brevity, only core variables are listed here. You can paste the original ENV list here.
+
+# -- Build llama.cpp --
+# Use the passed ASCEND_SOC_TYPE argument and add general build options
+RUN source /usr/local/Ascend/ascend-toolkit/set_env.sh --force && \
+    cmake -B build \
+        -DGGML_CANN=ON \
+        -DCMAKE_BUILD_TYPE=Release \
+        -DSOC_TYPE=${ASCEND_SOC_TYPE} \
+        . && \
+    cmake --build build --config Release -j$(nproc)
+
+# -- Organize build artifacts for copying in later stages --
+# Create a lib directory to store all .so files
+RUN mkdir -p /app/lib && \
+    find build -name "*.so" -exec cp {} /app/lib \;
+
+# Create a full directory to store all executables and Python scripts
+RUN mkdir -p /app/full && \
+    cp build/bin/* /app/full/ && \
+    cp *.py /app/full/ && \
+    cp -r gguf-py /app/full/ && \
+    cp -r requirements /app/full/ && \
+    cp requirements.txt /app/full/
+# If you have a tools.sh script, make sure it is copied here
+# cp .devops/tools.sh /app/full/tools.sh
+
+# ==============================================================================
+# BASE STAGE
+# Create a minimal base image with CANN runtime and common libraries
+# ==============================================================================
+FROM ${CANN_BASE_IMAGE} AS base
+
+# -- Install runtime dependencies --
+RUN yum install -y libgomp curl && \
+    yum clean all && \
+    rm -rf /var/cache/yum
+
+# -- Set CANN environment variables (required for runtime) --
+ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
+ENV LD_LIBRARY_PATH=/app:${ASCEND_TOOLKIT_HOME}/lib64:${LD_LIBRARY_PATH}
+ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${PATH}
+ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
+# ... You can add other environment variables from the original file as needed ...
+
+WORKDIR /app
+
+# Copy compiled .so files from the build stage
+COPY --from=build /app/lib/ /app
+
+# ==============================================================================
+# FINAL STAGES (TARGETS)
+# ==============================================================================
+
+### Target: full
+# Complete image with all tools, Python bindings, and dependencies
+# ==============================================================================
+FROM base AS full
+
+COPY --from=build /app/full /app
+
+# Install Python dependencies
+RUN yum install -y git python3 python3-pip && \
+    pip3 install --no-cache-dir --upgrade pip setuptools wheel && \
+    pip3 install --no-cache-dir -r requirements.txt && \
+    yum clean all && \
+    rm -rf /var/cache/yum
+
+# You need to provide a tools.sh script as the entrypoint
+ENTRYPOINT ["/app/tools.sh"]
+# If there is no tools.sh, you can set the default to start the server
+# ENTRYPOINT ["/app/llama-server"]
+
+### Target: light
+# Lightweight image containing only llama-cli
+# ==============================================================================
+FROM base AS light
+
+COPY --from=build /app/full/llama-cli /app
+
+ENTRYPOINT [ "/app/llama-cli" ]
+
+### Target: server
+# Dedicated server image containing only llama-server
+# ==============================================================================
+FROM base AS server
+
+ENV LLAMA_ARG_HOST=0.0.0.0
+
+COPY --from=build /app/full/llama-server /app
+
+HEALTHCHECK --interval=5m CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
+ENTRYPOINT [ "/app/llama-server" ]
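As a usage sketch (image tags are illustrative, and ASCEND_SOC_TYPE should match your Ascend hardware), the three targets can be built from the repository root like so:

    # lightweight CLI-only image
    docker build -f .devops/cann.Dockerfile --target light -t llama-cpp:cann-light .

    # dedicated server image, overriding the SOC type
    docker build -f .devops/cann.Dockerfile --target server \
        --build-arg ASCEND_SOC_TYPE=Ascend910B3 -t llama-cpp:cann-server .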

.devops/rocm.Dockerfile

Lines changed: 2 additions & 2 deletions

@@ -1,8 +1,8 @@
 ARG UBUNTU_VERSION=24.04

 # This needs to generally match the container host's environment.
-ARG ROCM_VERSION=6.3
-ARG AMDGPU_VERSION=6.3
+ARG ROCM_VERSION=6.4
+ARG AMDGPU_VERSION=6.4

 # Target the CUDA build image
 ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
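Because these defaults need to track the host's driver stack, they can also be overridden at build time without editing the file (the version values below are illustrative):

    docker build -f .devops/rocm.Dockerfile \
        --build-arg ROCM_VERSION=6.4 --build-arg AMDGPU_VERSION=6.4 .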
.github/workflows/pre-tokenizer-hashes.yml

Lines changed: 45 additions & 0 deletions

@@ -0,0 +1,45 @@
+name: Check Pre-Tokenizer Hashes
+
+on:
+  push:
+    paths:
+      - 'convert_hf_to_gguf.py'
+      - 'convert_hf_to_gguf_update.py'
+  pull_request:
+    paths:
+      - 'convert_hf_to_gguf.py'
+      - 'convert_hf_to_gguf_update.py'
+
+jobs:
+  pre-tokenizer-hashes:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      - name: Install Python dependencies
+        run: |
+          python3 -m venv .venv
+          .venv/bin/pip install -r requirements/requirements-convert_hf_to_gguf_update.txt
+
+      - name: Update pre-tokenizer hashes
+        run: |
+          cp convert_hf_to_gguf.py /tmp
+          .venv/bin/python convert_hf_to_gguf_update.py --check-missing
+
+      - name: Check if committed pre-tokenizer hashes match the generated version
+        run: |
+          if ! diff -q convert_hf_to_gguf.py /tmp/convert_hf_to_gguf.py; then
+            echo "Model pre-tokenizer hashes (in convert_hf_to_gguf.py) do not match generated hashes (from convert_hf_to_gguf_update.py)."
+            echo "To fix: run ./convert_hf_to_gguf_update.py and commit the updated convert_hf_to_gguf.py along with your changes"
+            echo "Differences found:"
+            diff convert_hf_to_gguf.py /tmp/convert_hf_to_gguf.py || true
+            exit 1
+          fi
+          echo "Model pre-tokenizer hashes are up to date."

.gitignore

Lines changed: 1 addition & 0 deletions

@@ -82,6 +82,7 @@ models/*
 models-mnt
 !models/.editorconfig
 !models/ggml-vocab-*.gguf*
+!models/templates

 # Zig
 zig-out/

common/arg.cpp

Lines changed: 60 additions & 7 deletions
@@ -977,6 +977,10 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         for (auto & seq_breaker : params.sampling.dry_sequence_breakers) {
             string_process_escapes(seq_breaker);
         }
+        for (auto & pair : params.speculative.replacements) {
+            string_process_escapes(pair.first);
+            string_process_escapes(pair.second);
+        }
     }

     if (!params.kv_overrides.empty()) {
@@ -2091,6 +2095,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.no_kv_offload = true;
         }
     ).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
+    add_opt(common_arg(
+        {"-nr", "--no-repack"},
+        "disable weight repacking",
+        [](common_params & params) {
+            params.no_extra_bufts = true;
+        }
+    ).set_env("LLAMA_ARG_NO_REPACK"));
     add_opt(common_arg(
         {"-ctk", "--cache-type-k"}, "TYPE",
         string_format(
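The new -nr/--no-repack flag feeds into model loading via use_extra_bufts (see the common/common.cpp hunk below). An illustrative invocation, with the model path as a placeholder:

    # skip repacking weights into backend-specific layouts at load time
    llama-cli -m model.gguf --no-repack

    # or, assuming the usual env-var handling for flag options, via the environment
    LLAMA_ARG_NO_REPACK=1 llama-cli -m model.gguf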
@@ -2369,6 +2380,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             }
         }
     ));
+    add_opt(common_arg(
+        {"--cpu-moe"},
+        "use CPU for Mixture of Experts (MoE) weights",
+        [](common_params & params) {
+            params.tensor_buft_overrides.push_back({"\\.ffn_up_exps\\.weight$",   ggml_backend_cpu_buffer_type()});
+            params.tensor_buft_overrides.push_back({"\\.ffn_down_exps\\.weight$", ggml_backend_cpu_buffer_type()});
+            params.tensor_buft_overrides.push_back({"\\.ffn_gate_exps\\.weight$", ggml_backend_cpu_buffer_type()});
+        }
+    ).set_env("LLAMA_ARG_CPU_MOE"));
     add_opt(common_arg(
         {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
         "number of layers to store in VRAM",
@@ -2627,6 +2647,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         params.n_out_freq = value;
     }
     ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
+    add_opt(common_arg(
+        {"--output-format"}, "{gguf,dat}",
+        string_format("output format for imatrix file (default: %s)", params.imat_dat ? "dat" : "gguf"),
+        [](common_params & params, const std::string & value) {
+            /**/ if (value == "gguf") { params.imat_dat = false; }
+            else if (value == "dat")  { params.imat_dat = true;  }
+            else { throw std::invalid_argument("invalid output format"); }
+        }
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
     add_opt(common_arg(
         {"--save-frequency"}, "N",
         string_format("save an imatrix copy every N iterations (default: %d)", params.n_save_freq),
@@ -3249,6 +3278,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         params.speculative.model.path = value;
     }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
+    add_opt(common_arg(
+        {"--spec-replace"}, "TARGET", "DRAFT",
+        "translate the string in TARGET into DRAFT if the draft model and main model are not compatible",
+        [](common_params & params, const std::string & tgt, const std::string & dft) {
+            params.speculative.replacements.push_back({ tgt, dft });
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"-ctkd", "--cache-type-k-draft"}, "TYPE",
         string_format(
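--spec-replace takes a TARGET/DRAFT pair and can be repeated to register several mappings; both sides go through escape processing (see the hunk at line 977 above). A sketch with hypothetical chat-marker strings:

    # map the target model's turn marker onto the draft model's vocabulary
    llama-server -m target.gguf -md draft.gguf \
        --spec-replace "<|assistant|>" "<|im_start|>assistant"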
@@ -3438,34 +3474,51 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     }
     ).set_examples({LLAMA_EXAMPLE_SERVER}));

-    // diffusion parameters
     add_opt(common_arg(
         { "--diffusion-steps" }, "N",
         string_format("number of diffusion steps (default: %d)", params.diffusion.steps),
         [](common_params & params, int value) { params.diffusion.steps = value; }
     ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        { "--diffusion-visual" },
+        string_format("enable visual diffusion mode (show progressive generation) (default: %s)",
+                      params.diffusion.visual_mode ? "true" : "false"),
+        [](common_params & params) { params.diffusion.visual_mode = true; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+
     add_opt(common_arg(
         { "--diffusion-eps" }, "F",
         string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion.eps),
         [](common_params & params, const std::string & value) { params.diffusion.eps = std::stof(value); }
     ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
     add_opt(common_arg(
         { "--diffusion-algorithm" }, "N",
-        string_format("diffusion algorithm: 0=ORIGIN, 1=MASKGIT_PLUS, 2=TOPK_MARGIN, 3=ENTROPY (default: %d)",
+        string_format("diffusion algorithm: 0=ORIGIN, 1=ENTROPY_BASED, 2=MARGIN_BASED, 3=RANDOM, 4=LOW_CONFIDENCE (default: %d)",
                       params.diffusion.algorithm),
         [](common_params & params, int value) { params.diffusion.algorithm = value; }
     ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
     add_opt(common_arg(
         { "--diffusion-alg-temp" }, "F",
-        string_format("algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
+        string_format("dream algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
         [](common_params & params, const std::string & value) { params.diffusion.alg_temp = std::stof(value); }
     ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+
     add_opt(common_arg(
-        { "--diffusion-visual" },
-        string_format("enable visual diffusion mode (show progressive generation) (default: %s)",
-                      params.diffusion.visual_mode ? "true" : "false"),
-        [](common_params & params) { params.diffusion.visual_mode = true; }
+        { "--diffusion-block-length" }, "N",
+        string_format("llada block length for generation (default: %d)", params.diffusion.block_length),
+        [](common_params & params, int value) { params.diffusion.block_length = value; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        { "--diffusion-cfg-scale" }, "F",
+        string_format("llada classifier-free guidance scale (default: %.3f)", (double) params.diffusion.cfg_scale),
+        [](common_params & params, const std::string & value) { params.diffusion.cfg_scale = std::stof(value); }
     ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        { "--diffusion-add-gumbel-noise" }, "F",
+        string_format("add gumbel noise to the logits if temp > 0.0 (default: %s)", params.diffusion.add_gumbel_noise ? "true" : "false"),
+        [](common_params & params, const std::string & value) { params.diffusion.add_gumbel_noise = std::stof(value); }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+

     return ctx_arg;
 }
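Putting the new options together, an illustrative run of the diffusion example (assuming the llama-diffusion-cli binary; the model path and values are placeholders, and block length / CFG scale apply to LLaDA-style models per the help text):

    llama-diffusion-cli -m llada-8b.gguf -p "Hello" \
        --diffusion-steps 128 --diffusion-block-length 32 \
        --diffusion-cfg-scale 2.0 --diffusion-visual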

common/chat.cpp

Lines changed: 6 additions & 6 deletions

@@ -1646,7 +1646,7 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
         "|<function name=\"([^\"]+)\">" // match 5 (function name again)
     );

-    if (auto res = builder.try_find_regex(open_regex)) {
+    while (auto res = builder.try_find_regex(open_regex)) {
         const auto & block_start = res->groups[1];
         std::string block_end = block_start.empty() ? "" : "```";

@@ -1668,7 +1668,6 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
                     builder.consume_literal(block_end);
                     builder.consume_spaces();
                 }
-                builder.add_content(builder.consume_rest());
             } else {
                 throw common_chat_msg_partial_exception("failed to parse tool call");
             }
@@ -1693,11 +1692,10 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
                 builder.consume_spaces();
             }
         }
-        builder.add_content(builder.consume_rest());
     }
-    } else {
-        builder.add_content(builder.consume_rest());
     }
+
+    builder.add_content(builder.consume_rest());
 }

 static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
@@ -1944,6 +1942,8 @@ common_chat_msg common_chat_parse(const std::string & input, bool is_partial, co
         }
     }
     auto msg = builder.result();
-    LOG_DBG("Parsed message: %s\n", common_chat_msgs_to_json_oaicompat<json>({msg}).at(0).dump().c_str());
+    if (!is_partial) {
+        LOG_DBG("Parsed message: %s\n", common_chat_msgs_to_json_oaicompat<json>({msg}).at(0).dump().c_str());
+    }
     return msg;
 }
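The if-to-while change means a Hermes 2 Pro response containing several consecutive tool-call blocks is now parsed in full instead of stopping after the first, with any trailing text added as content exactly once at the end. A hypothetical response the loop now handles:

    <tool_call>
    {"name": "get_weather", "arguments": {"city": "Paris"}}
    </tool_call>
    <tool_call>
    {"name": "get_weather", "arguments": {"city": "Tokyo"}}
    </tool_call>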

common/common.cpp

Lines changed: 1 addition & 0 deletions

@@ -1122,6 +1122,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
     mparams.use_mmap        = params.use_mmap;
     mparams.use_mlock       = params.use_mlock;
     mparams.check_tensors   = params.check_tensors;
+    mparams.use_extra_bufts = !params.no_extra_bufts;

     if (params.kv_overrides.empty()) {
         mparams.kv_overrides = NULL;
