Skip to content

Commit bd4a202

Browse files
committed
update deps for b5122
1 parent 7eab71d commit bd4a202

File tree

6 files changed

+79
-136
lines changed

6 files changed

+79
-136
lines changed

ThirdParty/LlamaCpp/Include/common/common.h

Lines changed: 18 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -36,9 +36,9 @@ using llama_tokens = std::vector<llama_token>;
3636

3737
// build info
3838
int LLAMA_BUILD_NUMBER = 0;
39-
const char* LLAMA_COMMIT = "ef19c71769681a0b3dde6bc90911728376e5d236";
40-
const char* LLAMA_COMPILER = "";
41-
const char* LLAMA_BUILD_TARGET = "Vulkan - Unreal";
39+
const char * LLAMA_COMMIT = "e59ea539b83d2c7947c99bd350549364dbba450c";
40+
const char * LLAMA_COMPILER = "";
41+
const char * LLAMA_BUILD_TARGET = "Vulkan - Unreal";
4242

4343
struct common_control_vector_load_info;
4444

@@ -121,10 +121,6 @@ struct common_grammar_trigger {
121121
common_grammar_trigger_type type;
122122
std::string value;
123123
llama_token token = LLAMA_TOKEN_NULL;
124-
125-
// T can only be nlohmann::ordered_json
126-
template <class T> T to_json() const;
127-
template <class T> static common_grammar_trigger from_json(const T & in);
128124
};
129125

130126
// sampling parameters
@@ -184,6 +180,13 @@ struct common_params_sampling {
184180
std::string print() const;
185181
};
186182

183+
struct common_params_model {
184+
std::string path = ""; // model local path // NOLINT
185+
std::string url = ""; // model url to download // NOLINT
186+
std::string hf_repo = ""; // HF repo // NOLINT
187+
std::string hf_file = ""; // HF file // NOLINT
188+
};
189+
187190
struct common_params_speculative {
188191
std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
189192

@@ -197,19 +200,11 @@ struct common_params_speculative {
197200
struct cpu_params cpuparams;
198201
struct cpu_params cpuparams_batch;
199202

200-
std::string hf_repo = ""; // HF repo // NOLINT
201-
std::string hf_file = ""; // HF file // NOLINT
202-
203-
std::string model = ""; // draft model for speculative decoding // NOLINT
204-
std::string model_url = ""; // model url to download // NOLINT
203+
struct common_params_model model;
205204
};
206205

207206
struct common_params_vocoder {
208-
std::string hf_repo = ""; // HF repo // NOLINT
209-
std::string hf_file = ""; // HF file // NOLINT
210-
211-
std::string model = ""; // model path // NOLINT
212-
std::string model_url = ""; // model url to download // NOLINT
207+
struct common_params_model model;
213208

214209
std::string speaker_file = ""; // speaker file path // NOLINT
215210

@@ -267,12 +262,10 @@ struct common_params {
267262
struct common_params_speculative speculative;
268263
struct common_params_vocoder vocoder;
269264

270-
std::string model = ""; // model path // NOLINT
265+
struct common_params_model model;
266+
271267
std::string model_alias = ""; // model alias // NOLINT
272-
std::string model_url = ""; // model url to download // NOLINT
273268
std::string hf_token = ""; // HF token // NOLINT
274-
std::string hf_repo = ""; // HF repo // NOLINT
275-
std::string hf_file = ""; // HF file // NOLINT
276269
std::string prompt = ""; // NOLINT
277270
std::string system_prompt = ""; // NOLINT
278271
std::string prompt_file = ""; // store the external prompt file name // NOLINT
@@ -286,6 +279,7 @@ struct common_params {
286279
std::vector<std::string> in_files; // all input files
287280
std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
288281
std::vector<llama_model_kv_override> kv_overrides;
282+
std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
289283

290284
bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
291285
std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale
@@ -347,7 +341,7 @@ struct common_params {
347341
common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;
348342

349343
// multimodal models (see examples/llava)
350-
std::string mmproj = ""; // path to multimodal projector // NOLINT
344+
struct common_params_model mmproj;
351345
std::vector<std::string> image; // path to image file(s)
352346

353347
// embedding
@@ -546,26 +540,11 @@ struct llama_model_params common_model_params_to_llama ( common_params
546540
struct llama_context_params common_context_params_to_llama(const common_params & params);
547541
struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
548542

549-
struct llama_model * common_load_model_from_url(
550-
const std::string & model_url,
551-
const std::string & local_path,
552-
const std::string & hf_token,
553-
const struct llama_model_params & params);
554-
555-
struct llama_model * common_load_model_from_hf(
556-
const std::string & repo,
557-
const std::string & remote_path,
558-
const std::string & local_path,
559-
const std::string & hf_token,
560-
const struct llama_model_params & params);
561-
562-
std::pair<std::string, std::string> common_get_hf_file(
563-
const std::string & hf_repo_with_tag,
564-
const std::string & hf_token);
565-
566543
// clear LoRA adapters from context, then apply new list of adapters
567544
void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);
568545

546+
std::string get_model_endpoint();
547+
569548
//
570549
// Batch utils
571550
//

ThirdParty/LlamaCpp/Include/ggml-rpc.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,9 @@ GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const c
1717

1818
GGML_BACKEND_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);
1919

20-
GGML_BACKEND_API void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem);
20+
GGML_BACKEND_API void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint,
21+
const char * cache_dir,
22+
size_t free_mem, size_t total_mem);
2123

2224
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_rpc_reg(void);
2325

ThirdParty/LlamaCpp/Include/ggml.h

Lines changed: 43 additions & 96 deletions
Original file line numberDiff line numberDiff line change
@@ -507,17 +507,12 @@ extern "C" {
507507

508508
GGML_OP_UNARY,
509509

510-
GGML_OP_MAP_UNARY,
511-
GGML_OP_MAP_BINARY,
512-
513-
GGML_OP_MAP_CUSTOM1_F32,
514-
GGML_OP_MAP_CUSTOM2_F32,
515-
GGML_OP_MAP_CUSTOM3_F32,
516-
517510
GGML_OP_MAP_CUSTOM1,
518511
GGML_OP_MAP_CUSTOM2,
519512
GGML_OP_MAP_CUSTOM3,
520513

514+
GGML_OP_CUSTOM,
515+
521516
GGML_OP_CROSS_ENTROPY_LOSS,
522517
GGML_OP_CROSS_ENTROPY_LOSS_BACK,
523518
GGML_OP_OPT_STEP_ADAMW,
@@ -1722,24 +1717,29 @@ extern "C" {
17221717
float p0,
17231718
float p1);
17241719

1725-
// nearest interpolate
1720+
enum ggml_scale_mode {
1721+
GGML_SCALE_MODE_NEAREST = 0,
1722+
GGML_SCALE_MODE_BILINEAR = 1,
1723+
};
1724+
1725+
// interpolate
17261726
// multiplies ne0 and ne1 by scale factor
1727-
// used in stable-diffusion
17281727
GGML_API struct ggml_tensor * ggml_upscale(
17291728
struct ggml_context * ctx,
17301729
struct ggml_tensor * a,
1731-
int scale_factor);
1730+
int scale_factor,
1731+
enum ggml_scale_mode mode);
17321732

1733-
// nearest interpolate
1734-
// nearest interpolate to specified dimensions
1735-
// used in tortoise.cpp
1733+
// interpolate
1734+
// interpolate scale to specified dimensions
17361735
GGML_API struct ggml_tensor * ggml_upscale_ext(
17371736
struct ggml_context * ctx,
17381737
struct ggml_tensor * a,
17391738
int ne0,
17401739
int ne1,
17411740
int ne2,
1742-
int ne3);
1741+
int ne3,
1742+
enum ggml_scale_mode mode);
17431743

17441744
// pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
17451745
GGML_API struct ggml_tensor * ggml_pad(
@@ -1791,11 +1791,11 @@ extern "C" {
17911791

17921792
#define GGML_KQ_MASK_PAD 64
17931793

1794-
// q: [n_embd, n_batch, n_head, 1]
1795-
// k: [n_embd, n_kv, n_head_kv, 1]
1796-
// v: [n_embd, n_kv, n_head_kv, 1] !! not transposed !!
1797-
// mask: [n_kv, n_batch_pad, 1, 1] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
1798-
// res: [n_embd, n_head, n_batch, 1] !! permuted !!
1794+
// q: [n_embd_k, n_batch, n_head, 1]
1795+
// k: [n_embd_k, n_kv, n_head_kv, 1]
1796+
// v: [n_embd_v, n_kv, n_head_kv, 1] !! not transposed !!
1797+
// mask: [n_kv, n_batch_pad, 1, 1] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
1798+
// res: [n_embd_v, n_head, n_batch, 1] !! permuted !!
17991799
GGML_API struct ggml_tensor * ggml_flash_attn_ext(
18001800
struct ggml_context * ctx,
18011801
struct ggml_tensor * q,
@@ -1916,83 +1916,6 @@ extern "C" {
19161916

19171917
// custom operators
19181918

1919-
typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
1920-
typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
1921-
1922-
typedef void (*ggml_custom1_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *);
1923-
typedef void (*ggml_custom2_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
1924-
typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
1925-
1926-
GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_f32(
1927-
struct ggml_context * ctx,
1928-
struct ggml_tensor * a,
1929-
ggml_unary_op_f32_t fun),
1930-
"use ggml_map_custom1 instead");
1931-
1932-
GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_inplace_f32(
1933-
struct ggml_context * ctx,
1934-
struct ggml_tensor * a,
1935-
ggml_unary_op_f32_t fun),
1936-
"use ggml_map_custom1_inplace instead");
1937-
1938-
GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_f32(
1939-
struct ggml_context * ctx,
1940-
struct ggml_tensor * a,
1941-
struct ggml_tensor * b,
1942-
ggml_binary_op_f32_t fun),
1943-
"use ggml_map_custom2 instead");
1944-
1945-
GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_inplace_f32(
1946-
struct ggml_context * ctx,
1947-
struct ggml_tensor * a,
1948-
struct ggml_tensor * b,
1949-
ggml_binary_op_f32_t fun),
1950-
"use ggml_map_custom2_inplace instead");
1951-
1952-
GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_f32(
1953-
struct ggml_context * ctx,
1954-
struct ggml_tensor * a,
1955-
ggml_custom1_op_f32_t fun),
1956-
"use ggml_map_custom1 instead");
1957-
1958-
GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_inplace_f32(
1959-
struct ggml_context * ctx,
1960-
struct ggml_tensor * a,
1961-
ggml_custom1_op_f32_t fun),
1962-
"use ggml_map_custom1_inplace instead");
1963-
1964-
GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_f32(
1965-
struct ggml_context * ctx,
1966-
struct ggml_tensor * a,
1967-
struct ggml_tensor * b,
1968-
ggml_custom2_op_f32_t fun),
1969-
"use ggml_map_custom2 instead");
1970-
1971-
GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_inplace_f32(
1972-
struct ggml_context * ctx,
1973-
struct ggml_tensor * a,
1974-
struct ggml_tensor * b,
1975-
ggml_custom2_op_f32_t fun),
1976-
"use ggml_map_custom2_inplace instead");
1977-
1978-
GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_f32(
1979-
struct ggml_context * ctx,
1980-
struct ggml_tensor * a,
1981-
struct ggml_tensor * b,
1982-
struct ggml_tensor * c,
1983-
ggml_custom3_op_f32_t fun),
1984-
"use ggml_map_custom3 instead");
1985-
1986-
GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_inplace_f32(
1987-
struct ggml_context * ctx,
1988-
struct ggml_tensor * a,
1989-
struct ggml_tensor * b,
1990-
struct ggml_tensor * c,
1991-
ggml_custom3_op_f32_t fun),
1992-
"use ggml_map_custom3_inplace instead");
1993-
1994-
// custom operators v2
1995-
19961919
typedef void (*ggml_custom1_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, int ith, int nth, void * userdata);
19971920
typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata);
19981921
typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata);
@@ -2048,6 +1971,30 @@ extern "C" {
20481971
int n_tasks,
20491972
void * userdata);
20501973

1974+
typedef void (*ggml_custom_op_t)(struct ggml_tensor * dst , int ith, int nth, void * userdata);
1975+
1976+
GGML_API struct ggml_tensor * ggml_custom_4d(
1977+
struct ggml_context * ctx,
1978+
enum ggml_type type,
1979+
int64_t ne0,
1980+
int64_t ne1,
1981+
int64_t ne2,
1982+
int64_t ne3,
1983+
struct ggml_tensor ** args,
1984+
int n_args,
1985+
ggml_custom_op_t fun,
1986+
int n_tasks,
1987+
void * userdata);
1988+
1989+
GGML_API struct ggml_tensor * ggml_custom_inplace(
1990+
struct ggml_context * ctx,
1991+
struct ggml_tensor * a,
1992+
struct ggml_tensor ** args,
1993+
int n_args,
1994+
ggml_custom_op_t fun,
1995+
int n_tasks,
1996+
void * userdata);
1997+
20511998
// loss function
20521999

20532000
GGML_API struct ggml_tensor * ggml_cross_entropy_loss(

ThirdParty/LlamaCpp/Include/llama.h

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,9 @@ extern "C" {
108108
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28,
109109
LLAMA_VOCAB_PRE_TYPE_GPT4O = 29,
110110
LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30,
111+
LLAMA_VOCAB_PRE_TYPE_TRILLION = 31,
112+
LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32,
113+
LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33,
111114
};
112115

113116
enum llama_rope_type {
@@ -278,10 +281,18 @@ extern "C" {
278281
};
279282
};
280283

284+
struct llama_model_tensor_buft_override {
285+
const char * pattern;
286+
ggml_backend_buffer_type_t buft;
287+
};
288+
281289
struct llama_model_params {
282290
// NULL-terminated list of devices to use for offloading (if NULL, all available devices are used)
283291
ggml_backend_dev_t * devices;
284292

293+
// NULL-terminated list of buffer types to use for tensors that match a pattern
294+
const struct llama_model_tensor_buft_override * tensor_buft_overrides;
295+
285296
int32_t n_gpu_layers; // number of layers to store in VRAM
286297
enum llama_split_mode split_mode; // how to split the model across multiple GPUs
287298

@@ -1265,6 +1276,10 @@ extern "C" {
12651276
float tau,
12661277
float eta);
12671278

1279+
/// @details Initializes a GBNF grammar, see grammars/README.md for details.
1280+
/// @param vocab The vocabulary that this grammar will be used with.
1281+
/// @param grammar_str The production rules for the grammar, encoded as a string. Returns an empty grammar if empty. Returns NULL if parsing of grammar_str fails.
1282+
/// @param grammar_root The name of the start symbol for the grammar.
12681283
LLAMA_API struct llama_sampler * llama_sampler_init_grammar(
12691284
const struct llama_vocab * vocab,
12701285
const char * grammar_str,
-225 KB
Binary file not shown.
-1.98 KB
Binary file not shown.

0 commit comments

Comments
 (0)