Commit f3471ce

Merge pull request #162 from menloresearch/update-dev-from-master-2025-07-10-00-09
Sync master with upstream release b5857
2 parents 57cd1b0 + 3762fb3 commit f3471ce

49 files changed: +2457 −723 lines

.github/workflows/build.yml

Lines changed: 1 addition & 1 deletion
@@ -342,7 +342,7 @@ jobs:
           cd build
           export GGML_VK_VISIBLE_DEVICES=0
           # This is using llvmpipe and runs slower than other backends
-          ctest -L main --verbose --timeout 3600
+          ctest -L main --verbose --timeout 4200

   ubuntu-22-cmake-hip:
     runs-on: ubuntu-22.04

common/arg.cpp

Lines changed: 7 additions & 0 deletions
@@ -2734,6 +2734,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.public_path = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH"));
+    add_opt(common_arg(
+        {"--api-prefix"}, "PREFIX",
+        string_format("prefix path the server serves from, without the trailing slash (default: %s)", params.api_prefix.c_str()),
+        [](common_params & params, const std::string & value) {
+            params.api_prefix = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX"));
     add_opt(common_arg(
         {"--no-webui"},
         string_format("Disable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),

common/common.h

Lines changed: 1 addition & 0 deletions
@@ -370,6 +370,7 @@ struct common_params {

     std::string hostname = "127.0.0.1";
     std::string public_path = ""; // NOLINT
+    std::string api_prefix = ""; // NOLINT
     std::string chat_template = ""; // NOLINT
     bool use_jinja = false; // NOLINT
     bool enable_chat_template = true;

convert_hf_to_gguf.py

Lines changed: 413 additions & 5 deletions
Large diffs are not rendered by default.

convert_hf_to_gguf_update.py

Lines changed: 7 additions & 0 deletions
@@ -128,6 +128,7 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "llama4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", },
     {"name": "pixtral", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistral-community/pixtral-12b", },
     {"name": "seed-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base", },
+    {"name": "a.x-4.0", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/skt/A.X-4.0", },
 ]

 # some models are known to be broken upstream, so we will skip them as exceptions
@@ -137,6 +138,12 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "chatglm-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-chat", "chkhsh": "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516"},
     {"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", "chkhsh": "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2"},
     {"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", "chkhsh": "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35"},
+    {"name": "hunyuan", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tencent/Hunyuan-A13B-Instruct", "chkhsh": "7e57df22b1fe23a7b1e1c7f3dc4e3f96d43a4eb0836d0c6bdc3436d7b2f1c664"},
+    # falcon-h1 series uses 4 different tokenizers across model sizes (0.5b - 34b), hence we need to define 4 different hashes
+    {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-0.5B-Base", "chkhsh": "a6b57017d60e6edb4d88ecc2845188e0eb333a70357e45dcc9b53964a73bbae6"},
+    {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-1B-Base", "chkhsh": "60476e1243776c4fb1b993dbd7a5f15ac22f83c80afdf425fa5ae01c8d44ef86"},
+    {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-7B-Base", "chkhsh": "3eda48b4c4dc7de733d1a8b3e3b4a85243dbbf704da2ee9d42c6beced8897896"},
+    {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-34B-Base", "chkhsh": "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b"},
 ]


docs/development/HOWTO-add-model.md

Lines changed: 11 additions & 9 deletions
@@ -83,20 +83,22 @@ NOTE: Tensor names must end with `.weight` or `.bias` suffixes, that is the conv

 ### 2. Define the model architecture in `llama.cpp`

-The model params and tensors layout must be defined in `llama.cpp`:
-1. Define a new `llm_arch`
-2. Define the tensors layout in `LLM_TENSOR_NAMES`
-3. Add any non-standard metadata in `llm_load_hparams`
-4. Create the tensors for inference in `llm_load_tensors`
-5. If the model has a RoPE operation, add the rope type in `llama_rope_type`
+The model params and tensors layout must be defined in `llama.cpp` source files:
+1. Define a new `llm_arch` enum value in `src/llama-arch.h`.
+2. In `src/llama-arch.cpp`:
+    - Add the architecture name to the `LLM_ARCH_NAMES` map.
+    - Add the tensor mappings to the `LLM_TENSOR_NAMES` map.
+3. Add any non-standard metadata loading in the `llama_model_loader` constructor in `src/llama-model-loader.cpp`.
+4. If the model has a RoPE operation, add a case for the architecture in `llama_model_rope_type` function in `src/llama-model.cpp`.

 NOTE: The dimensions in `ggml` are typically in the reverse order of the `pytorch` dimensions.

 ### 3. Build the GGML graph implementation

-This is the funniest part, you have to provide the inference graph implementation of the new model architecture in `llama_build_graph`.
-
-Have a look at existing implementations like `build_llama`, `build_dbrx` or `build_bert`.
+This is the funniest part, you have to provide the inference graph implementation of the new model architecture in `src/llama-model.cpp`.
+Create a new struct that inherits from `llm_graph_context` and implement the graph-building logic in its constructor.
+Have a look at existing implementations like `llm_build_llama`, `llm_build_dbrx` or `llm_build_bert`.
+Then, in the `llama_model::build_graph` method, add a case for your architecture to instantiate your new graph-building struct.

 Some `ggml` backends do not support all operations. Backend implementations can be added in a separate PR.

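As context for step 2 in the updated doc, a hypothetical registration of a new architecture might look like the sketch below. The architecture name `mymodel` and the specific tensor entries are illustrative assumptions, not part of this commit; only the file names and map names (`LLM_ARCH_NAMES`, `LLM_TENSOR_NAMES`) come from the doc itself.

// src/llama-arch.h (sketch): add an enum value for the hypothetical architecture
enum llm_arch {
    // ... existing architectures ...
    LLM_ARCH_MYMODEL,
    LLM_ARCH_UNKNOWN,
};

// src/llama-arch.cpp (sketch): register the name and the tensor name mappings
static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    // ...
    { LLM_ARCH_MYMODEL, "mymodel" },
};

static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
    // ...
    {
        LLM_ARCH_MYMODEL,
        {
            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
            { LLM_TENSOR_OUTPUT,     "output" },
            // per-layer tensors (attention, FFN, norms) follow the same pattern
        },
    },
};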

ggml/include/ggml.h

Lines changed: 14 additions & 1 deletion
@@ -495,7 +495,7 @@ extern "C" {
         GGML_OP_POOL_1D,
         GGML_OP_POOL_2D,
         GGML_OP_POOL_2D_BACK,
-        GGML_OP_UPSCALE, // nearest interpolate
+        GGML_OP_UPSCALE,
         GGML_OP_PAD,
         GGML_OP_PAD_REFLECT_1D,
         GGML_OP_ROLL,
@@ -1297,6 +1297,19 @@ extern "C" {
             struct ggml_tensor  * a,
             float                 s);

+    // x = s * a + b
+    GGML_API struct ggml_tensor * ggml_scale_bias(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            float                 s,
+            float                 b);
+
+    GGML_API struct ggml_tensor * ggml_scale_bias_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            float                 s,
+            float                 b);
+
     // b -> view(a,offset,nb1,nb2,3), return modified a
     GGML_API struct ggml_tensor * ggml_set(
             struct ggml_context * ctx,
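The new ops extend `ggml_scale` (x = s * a) to x = s * a + b. Below is a minimal sketch of how the out-of-place and in-place variants might be called when building a graph; the context size, tensor shape, and constants are arbitrary assumptions for illustration, and graph execution is omitted to keep the example backend-agnostic.

#include "ggml.h"

int main(void) {
    struct ggml_init_params ip = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(ip);

    struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
    struct ggml_tensor * y = ggml_scale_bias(ctx, x, 2.0f, 1.0f);          // y = 2*x + 1
    struct ggml_tensor * z = ggml_scale_bias_inplace(ctx, y, 0.5f, -3.0f); // writes into y's buffer

    (void) z;
    ggml_free(ctx);
    return 0;
}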

ggml/src/ggml-cann/ggml-cann.cpp

Lines changed: 4 additions & 1 deletion
@@ -2188,7 +2188,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
         case GGML_OP_MUL:
         case GGML_OP_DIV:
         case GGML_OP_RMS_NORM:
-        case GGML_OP_SCALE:
         case GGML_OP_SQR:
         case GGML_OP_SQRT:
         case GGML_OP_CLAMP:
@@ -2210,6 +2209,10 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
         case GGML_OP_PAD_REFLECT_1D:
         case GGML_OP_COUNT_EQUAL:
             return true;
+        case GGML_OP_SCALE:
+            float bias;
+            memcpy(&bias, (float*)op->op_params + 1, sizeof(float));
+            return bias == 0.0f; // TODO: support bias != 0.0f
         case GGML_OP_SOFT_MAX:
             // TODO: support broadcast
             // ref: https://github.com/ggml-org/llama.cpp/pull/14435

ggml/src/ggml-cpu/ops.cpp

Lines changed: 20 additions & 8 deletions
@@ -4643,9 +4643,11 @@ static void ggml_compute_forward_scale_f32(
     GGML_ASSERT(ggml_is_contiguous(dst));
     GGML_ASSERT(ggml_are_same_shape(src0, dst));

-    // scale factor
-    float v;
-    memcpy(&v, dst->op_params, sizeof(float));
+    float s; // scale factor
+    float b; // bias
+
+    memcpy(&s, (float *) dst->op_params + 0, sizeof(float));
+    memcpy(&b, (float *) dst->op_params + 1, sizeof(float));

     const int ith = params->ith;
     const int nth = params->nth;
@@ -4664,12 +4666,22 @@ static void ggml_compute_forward_scale_f32(

     const size_t nb1 = dst->nb[1];

-    for (int i1 = ir0; i1 < ir1; i1++) {
-        if (dst->data != src0->data) {
-            // src0 is same shape as dst => same indices
-            memcpy((char *)dst->data + i1*nb1, (char *)src0->data + i1*nb01, nc * sizeof(float));
+    if (b == 0.0f) {
+        for (int i1 = ir0; i1 < ir1; i1++) {
+            if (dst->data != src0->data) {
+                // src0 is same shape as dst => same indices
+                // TODO: add x parameter to ggml_vec_scale_f32 and remove this memcpy
+                memcpy((char *)dst->data + i1*nb1, (char *)src0->data + i1*nb01, nc * sizeof(float));
+            }
+            ggml_vec_scale_f32(nc, (float *) ((char *) dst->data + i1*nb1), s);
+        }
+    } else {
+        for (int i1 = ir0; i1 < ir1; i1++) {
+            ggml_vec_mad1_f32(nc,
+                    (float *) ((char *) dst->data + i1*nb1),
+                    (float *) ((char *) src0->data + i1*nb1),
+                    s, b);
         }
-        ggml_vec_scale_f32(nc, (float *) ((char *) dst->data + i1*nb1), v);
     }
 }


ggml/src/ggml-cpu/vec.h

Lines changed: 39 additions & 0 deletions
@@ -351,6 +351,45 @@ inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int
 #endif
 }

+inline static void ggml_vec_mad1_f32(const int n, float * y, const float * x, const float s, const float b) {
+#if defined(GGML_USE_ACCELERATE)
+    vDSP_vsmsa(x, 1, &s, &b, y, 1, n);
+#elif defined(GGML_SIMD)
+    #if defined(__ARM_FEATURE_SVE)
+        // scalar ; TODO: Write SVE code
+        for (int i = 0; i < n; ++i) {
+            y[i] = x[i]*s + b;
+        }
+    #else
+        const int np = (n & ~(GGML_F32_STEP - 1));
+
+        GGML_F32_VEC vs = GGML_F32_VEC_SET1(s);
+        GGML_F32_VEC vb = GGML_F32_VEC_SET1(b);
+
+        GGML_F32_VEC ay[GGML_F32_ARR];
+
+        for (int i = 0; i < np; i += GGML_F32_STEP) {
+            for (int j = 0; j < GGML_F32_ARR; j++) {
+                ay[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
+                ay[j] = GGML_F32_VEC_FMA(ay[j], vs, vb);
+
+                GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
+            }
+        }
+
+        // leftovers
+        for (int i = np; i < n; ++i) {
+            y[i] = x[i]*s + b;
+        }
+    #endif
+#else
+    // scalar
+    for (int i = 0; i < n; ++i) {
+        y[i] = x[i]*s + b;
+    }
+#endif
+}
+
 //inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] *= v; }
 inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
 #if defined(GGML_USE_ACCELERATE)
