diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp index 1a5de5928a526..ffb2de65fb7d7 100644 --- a/examples/batched/batched.cpp +++ b/examples/batched/batched.cpp @@ -43,7 +43,7 @@ int main(int argc, char ** argv) { llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params); - if (model == NULL) { + if (model == nullptr) { LOG_ERR("%s: error: unable to load model\n" , __func__); return 1; } @@ -76,7 +76,7 @@ int main(int argc, char ** argv) { llama_sampler_chain_add(smpl, llama_sampler_init_temp (params.sampling.temp)); llama_sampler_chain_add(smpl, llama_sampler_init_dist (params.sampling.seed)); - if (ctx == NULL) { + if (ctx == nullptr) { LOG_ERR("%s: error: failed to create the llama_context\n" , __func__); return 1; } diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp index 767198aafa21c..722f547321020 100644 --- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp @@ -254,7 +254,7 @@ struct my_llama_layer { }; struct my_llama_model { - struct ggml_context * ctx = NULL; + struct ggml_context * ctx = nullptr; std::string name; @@ -333,7 +333,7 @@ static void print_params(struct my_llama_hparams * params) { } static void print_tensor_info(const struct ggml_context * ctx) { - for (auto * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + for (auto * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { LOG_INF("%s: Allocating ", __func__); int64_t total = 1; int i = 0; @@ -443,7 +443,7 @@ struct my_llama_file { my_llama_file(const char * fname, const char * mode) { fp = std::fopen(fname, mode); - if (fp == NULL) { + if (fp == nullptr) { size = 0; } else { seek(0, SEEK_END); @@ -530,7 +530,7 @@ static std::string llama_escape_whitespaces(const std::string & text) { static void load_vocab(const char * filename, const Config * config, struct my_llama_vocab * vocab) { if (is_ggml_file(filename)) { LOG_INF("%s: Loading vocabulary from gguf file %s\n", __func__, filename); - struct ggml_context * ctx_data = NULL; + struct ggml_context * ctx_data = nullptr; struct gguf_init_params params = { /*.no_alloc = */ false, @@ -538,7 +538,7 @@ static void load_vocab(const char * filename, const Config * config, struct my_l }; struct gguf_context * ctx = gguf_init_from_file(filename, params); - GGML_ASSERT(ctx != NULL); + GGML_ASSERT(ctx != nullptr); const int model_idx = gguf_find_key(ctx, KV_TOKENIZER_MODEL); GGML_ASSERT(model_idx >= 0); @@ -925,7 +925,7 @@ int main(int argc, char ** argv) { struct ggml_init_params lcparams; lcparams.mem_size = 1024ll*1024ll*1024ll*((size_t) params.mem_model_gb); - lcparams.mem_buffer = NULL; + lcparams.mem_buffer = nullptr; lcparams.no_alloc = false; model.ctx = ggml_init(lcparams); diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index 9ae7e4dbb0592..145a902edd0c3 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -57,12 +57,12 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu // try to get token embeddings embd = llama_get_embeddings_ith(ctx, i); embd_pos = i; - GGML_ASSERT(embd != NULL && "failed to get token embeddings"); + GGML_ASSERT(embd != nullptr && "failed to get token embeddings"); } else { // try to get sequence embeddings - supported only when pooling_type is not NONE embd = 
llama_get_embeddings_seq(ctx, batch.seq_id[i][0]); embd_pos = batch.seq_id[i][0]; - GGML_ASSERT(embd != NULL && "failed to get sequence embeddings"); + GGML_ASSERT(embd != nullptr && "failed to get sequence embeddings"); } float * out = output + embd_pos * n_embd; @@ -107,7 +107,7 @@ int main(int argc, char ** argv) { llama_model * model = llama_init.model.get(); llama_context * ctx = llama_init.context.get(); - if (model == NULL) { + if (model == nullptr) { LOG_ERR("%s: unable to load model\n", __func__); return 1; } diff --git a/examples/gguf-hash/gguf-hash.cpp b/examples/gguf-hash/gguf-hash.cpp index 9523ec122f573..20591dd60c8b6 100644 --- a/examples/gguf-hash/gguf-hash.cpp +++ b/examples/gguf-hash/gguf-hash.cpp @@ -285,7 +285,7 @@ static void generate_uuidv5(const unsigned char sha1_digest[20], unsigned char u static hash_exit_code_t gguf_hash(const hash_params & hash_params) { const std::string & fname = hash_params.input; - struct ggml_context * ctx_data = NULL; + struct ggml_context * ctx_data = nullptr; struct gguf_init_params params = { /*.no_alloc = */ false, @@ -293,10 +293,10 @@ static hash_exit_code_t gguf_hash(const hash_params & hash_params) { }; // xxh64 init - XXH64_state_t* xxh64_model_hash_state = NULL; + XXH64_state_t* xxh64_model_hash_state = nullptr; if (hash_params.xxh64) { xxh64_model_hash_state = XXH64_createState(); - if (xxh64_model_hash_state==NULL) { + if (xxh64_model_hash_state==nullptr) { abort(); } diff --git a/examples/gguf/gguf.cpp b/examples/gguf/gguf.cpp index f31989c8c55c6..3ff9045b39b99 100644 --- a/examples/gguf/gguf.cpp +++ b/examples/gguf/gguf.cpp @@ -40,7 +40,7 @@ static bool gguf_ex_write(const std::string & fname) { struct ggml_init_params params = { /*.mem_size =*/ 128ull*1024ull*1024ull, - /*.mem_buffer =*/ NULL, + /*.mem_buffer =*/ nullptr, /*.no_alloc =*/ false, }; @@ -86,7 +86,7 @@ static bool gguf_ex_write(const std::string & fname) { static bool gguf_ex_read_0(const std::string & fname) { struct gguf_init_params params = { /*.no_alloc = */ false, - /*.ctx = */ NULL, + /*.ctx = */ nullptr, }; struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params); @@ -148,7 +148,7 @@ static bool gguf_ex_read_0(const std::string & fname) { // read and create ggml_context containing the tensors and their data static bool gguf_ex_read_1(const std::string & fname, bool check_data) { - struct ggml_context * ctx_data = NULL; + struct ggml_context * ctx_data = nullptr; struct gguf_init_params params = { /*.no_alloc = */ false, diff --git a/examples/llama.android/llama/src/main/cpp/llama-android.cpp b/examples/llama.android/llama/src/main/cpp/llama-android.cpp index 711ddc5d19587..f8d61d66b10f3 100644 --- a/examples/llama.android/llama/src/main/cpp/llama-android.cpp +++ b/examples/llama.android/llama/src/main/cpp/llama-android.cpp @@ -152,7 +152,7 @@ Java_android_llama_cpp_LLamaAndroid_backend_1free(JNIEnv *, jobject) { extern "C" JNIEXPORT void JNICALL Java_android_llama_cpp_LLamaAndroid_log_1to_1android(JNIEnv *, jobject) { - llama_log_set(log_callback, NULL); + llama_log_set(log_callback, nullptr); } extern "C" diff --git a/examples/model-conversion/logits.cpp b/examples/model-conversion/logits.cpp index ddc5e9005f9e0..945d9a1087aac 100644 --- a/examples/model-conversion/logits.cpp +++ b/examples/model-conversion/logits.cpp @@ -78,27 +78,27 @@ int main(int argc, char ** argv) { llama_model * model = llama_model_load_from_file(model_path.c_str(), model_params); - if (model == NULL) { + if (model == nullptr) { fprintf(stderr , "%s: error: unable to 
load model\n" , __func__); return 1; } // Extract basename from model_path const char * basename = strrchr(model_path.c_str(), '/'); - basename = (basename == NULL) ? model_path.c_str() : basename + 1; + basename = (basename == nullptr) ? model_path.c_str() : basename + 1; char model_name[256]; strncpy(model_name, basename, 255); model_name[255] = '\0'; char * dot = strrchr(model_name, '.'); - if (dot != NULL && strcmp(dot, ".gguf") == 0) { + if (dot != nullptr && strcmp(dot, ".gguf") == 0) { *dot = '\0'; } printf("Model name: %s\n", model_name); const llama_vocab * vocab = llama_model_get_vocab(model); - const int n_prompt = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, true, true); + const int n_prompt = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), nullptr, 0, true, true); std::vector prompt_tokens(n_prompt); if (llama_tokenize(vocab, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true, true) < 0) { @@ -117,7 +117,7 @@ int main(int argc, char ** argv) { } llama_context * ctx = llama_init_from_model(model, ctx_params); - if (ctx == NULL) { + if (ctx == nullptr) { fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__); return 1; } @@ -167,7 +167,7 @@ int main(int argc, char ** argv) { printf("Saving logits to %s\n", bin_filename); FILE * f = fopen(bin_filename, "wb"); - if (f == NULL) { + if (f == nullptr) { fprintf(stderr, "%s: error: failed to open binary output file\n", __func__); return 1; } @@ -178,7 +178,7 @@ int main(int argc, char ** argv) { char txt_filename[512]; snprintf(txt_filename, sizeof(txt_filename), "data/llamacpp-%s%s.txt", model_name, type); f = fopen(txt_filename, "w"); - if (f == NULL) { + if (f == nullptr) { fprintf(stderr, "%s: error: failed to open text output file\n", __func__); return 1; } diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp index 8a4faa383bf32..97891c34dc7d0 100644 --- a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -66,7 +66,7 @@ int main(int argc, char ** argv) { llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params); - if (model == NULL) { + if (model == nullptr) { LOG_ERR("%s: unable to load model\n" , __func__); return 1; } @@ -82,7 +82,7 @@ int main(int argc, char ** argv) { GGML_ASSERT(ctx_params.n_batch % n_grp == 0 && "n_batch must be divisible by n_grp"); llama_context * ctx = llama_init_from_model(model, ctx_params); - if (ctx == NULL) { + if (ctx == nullptr) { LOG_ERR("%s: failed to create the llama_context\n" , __func__); return 1; } diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp index 042e12c2bf83a..944b664705d6a 100644 --- a/examples/retrieval/retrieval.cpp +++ b/examples/retrieval/retrieval.cpp @@ -98,9 +98,9 @@ static void batch_process(llama_context * ctx, llama_batch & batch, float * outp // try to get sequence embeddings - supported only when pooling_type is not NONE const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]); - if (embd == NULL) { + if (embd == nullptr) { embd = llama_get_embeddings_ith(ctx, i); - if (embd == NULL) { + if (embd == nullptr) { LOG_ERR("%s: failed to get embeddings for token %d\n", __func__, i); continue; } @@ -154,7 +154,7 @@ int main(int argc, char ** argv) { llama_model * model = llama_init.model.get(); llama_context * ctx = llama_init.context.get(); - if (model == NULL) { + if (model == nullptr) { LOG_ERR("%s: unable to load model\n", __func__); return 1; } diff --git 
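The replacements above, and in the hunks that follow, are mechanical, but the motivation behind the patch is worth a short illustration: NULL is an integer-flavoured null pointer constant, while nullptr has its own type (std::nullptr_t) and behaves predictably in overload resolution. The sketch below is illustrative only and is not taken from the patch; report() is a made-up function.

#include <cstdio>

static void report(int v)          { std::printf("int overload: %d\n", v); }
static void report(const char * s) { std::printf("pointer overload: %s\n", s ? s : "(null)"); }

int main() {
    report(0);        // exact match, picks report(int)
    report(nullptr);  // always picks the pointer overload
    // report(NULL);  // ambiguous or resolves to report(int), depending on how NULL is defined
    return 0;
}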
a/examples/simple-chat/simple-chat.cpp b/examples/simple-chat/simple-chat.cpp index 57195df331628..07af83ed644c6 100644 --- a/examples/simple-chat/simple-chat.cpp +++ b/examples/simple-chat/simple-chat.cpp @@ -101,7 +101,7 @@ int main(int argc, char ** argv) { const bool is_first = llama_memory_seq_pos_max(llama_get_memory(ctx), 0) == -1; // tokenize the prompt - const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true); + const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), nullptr, 0, is_first, true); std::vector prompt_tokens(n_prompt_tokens); if (llama_tokenize(vocab, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), is_first, true) < 0) { GGML_ABORT("failed to tokenize the prompt\n"); diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index 633b87e58406e..15fba0f9a08a6 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -85,7 +85,7 @@ int main(int argc, char ** argv) { llama_model * model = llama_model_load_from_file(model_path.c_str(), model_params); - if (model == NULL) { + if (model == nullptr) { fprintf(stderr , "%s: error: unable to load model\n" , __func__); return 1; } @@ -94,7 +94,7 @@ int main(int argc, char ** argv) { // tokenize the prompt // find the number of tokens in the prompt - const int n_prompt = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, true, true); + const int n_prompt = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), nullptr, 0, true, true); // allocate space for the tokens and tokenize the prompt std::vector prompt_tokens(n_prompt); @@ -115,7 +115,7 @@ int main(int argc, char ** argv) { llama_context * ctx = llama_init_from_model(model, ctx_params); - if (ctx == NULL) { + if (ctx == nullptr) { fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__); return 1; } diff --git a/examples/speculative-simple/speculative-simple.cpp b/examples/speculative-simple/speculative-simple.cpp index a8e53f28eb597..4f625e9f93f35 100644 --- a/examples/speculative-simple/speculative-simple.cpp +++ b/examples/speculative-simple/speculative-simple.cpp @@ -33,11 +33,11 @@ int main(int argc, char ** argv) { llama_backend_init(); llama_numa_init(params.numa); - llama_model * model_tgt = NULL; - //llama_model * model_dft = NULL; + llama_model * model_tgt = nullptr; + //llama_model * model_dft = nullptr; - llama_context * ctx_tgt = NULL; - llama_context * ctx_dft = NULL; + llama_context * ctx_tgt = nullptr; + llama_context * ctx_dft = nullptr; // load the target model common_init_result llama_init_tgt = common_init_from_params(params); diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 5f5ac5eb64d38..53d1435f37b99 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -64,11 +64,11 @@ int main(int argc, char ** argv) { llama_backend_init(); llama_numa_init(params.numa); - llama_model * model_tgt = NULL; - llama_model * model_dft = NULL; + llama_model * model_tgt = nullptr; + llama_model * model_dft = nullptr; - llama_context * ctx_tgt = NULL; - llama_context * ctx_dft = NULL; + llama_context * ctx_tgt = nullptr; + llama_context * ctx_dft = nullptr; // load the target model common_init_result llama_init_tgt = common_init_from_params(params); diff --git a/examples/training/finetune.cpp b/examples/training/finetune.cpp index 416d8d8f6c8f3..d19180f7b1a2d 100644 --- a/examples/training/finetune.cpp +++ 
b/examples/training/finetune.cpp @@ -43,7 +43,7 @@ int main(int argc, char ** argv) { llama_model_ptr & model = llama_init.model; llama_context_ptr & ctx = llama_init.context; - if (model == NULL) { + if (model == nullptr) { LOG_ERR("%s: unable to load model\n", __func__); return 1; } diff --git a/src/llama-adapter.cpp b/src/llama-adapter.cpp index d8eef75a7ad70..a8599d0572d3c 100644 --- a/src/llama-adapter.cpp +++ b/src/llama-adapter.cpp @@ -42,7 +42,7 @@ bool llama_adapter_cvec::init(const llama_model & model) { if (it == ctx_map.end()) { ggml_init_params params = { /*.mem_size =*/ hparams.n_layer*ggml_tensor_overhead(), - /*.mem_buffer =*/ NULL, + /*.mem_buffer =*/ nullptr, /*.no_alloc =*/ true, }; @@ -248,7 +248,7 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_ // add a new context ggml_init_params params = { /*.mem_size =*/ n_tensors*ggml_tensor_overhead(), - /*.mem_buffer =*/ NULL, + /*.mem_buffer =*/ nullptr, /*.no_alloc =*/ true, }; ggml_context * buft_ctx = ggml_init(params); diff --git a/src/llama-batch.cpp b/src/llama-batch.cpp index 55d89eca0ad94..d1b0172593cb1 100644 --- a/src/llama-batch.cpp +++ b/src/llama-batch.cpp @@ -80,7 +80,7 @@ bool llama_batch_allocr::init( if (!batch.seq_id) { seq_id.resize(batch.n_tokens + 1); - seq_id[batch.n_tokens] = NULL; + seq_id[batch.n_tokens] = nullptr; for (int32_t i = 0; i < batch.n_tokens; i++) { seq_id[i] = seq_id_0.data(); } diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 6b3188be4bc01..f01f474e5c577 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1120,7 +1120,7 @@ int llama_context::decode(const llama_batch & batch_inp) { // plot the computation graph in dot format (for debugging purposes) //if (n_past%100 == 0) { - // ggml_graph_dump_dot(gf, NULL, "llama.dot"); + // ggml_graph_dump_dot(gf, nullptr, "llama.dot"); //} auto * t_logits = res->get_logits(); @@ -2287,7 +2287,7 @@ llama_context * llama_init_from_model( llama_model * model, llama_context_params params) { if (!model) { - LLAMA_LOG_ERROR("%s: model cannot be NULL\n", __func__); + LLAMA_LOG_ERROR("%s: model cannot be nullptr\n", __func__); return nullptr; } diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 4abb6008dd184..5501053a79635 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -744,7 +744,7 @@ ggml_tensor * llm_graph_context::build_ffn( } else { cur = ggml_gelu(ctx0, cur); cb(cur, "ffn_gelu", il); - if (act_scales != NULL) { + if (act_scales != nullptr) { cur = ggml_div(ctx0, cur, act_scales); cb(cur, "ffn_act", il); } diff --git a/src/llama-impl.cpp b/src/llama-impl.cpp index 6ec709dd323a6..a78cb6afa27a6 100644 --- a/src/llama-impl.cpp +++ b/src/llama-impl.cpp @@ -84,7 +84,7 @@ std::string format(const char * fmt, ...) 
{ va_list ap2; va_start(ap, fmt); va_copy(ap2, ap); - int size = vsnprintf(NULL, 0, fmt, ap); + int size = vsnprintf(nullptr, 0, fmt, ap); GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT std::vector buf(size + 1); int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2); diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index ae35f74201e9c..7e38e88cc095c 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -44,7 +44,7 @@ llama_kv_cache::llama_kv_cache( if (it == ctx_map.end()) { ggml_init_params params = { /*.mem_size =*/ size_t(2u*(1 + n_stream)*n_layer_kv*ggml_tensor_overhead()), - /*.mem_buffer =*/ NULL, + /*.mem_buffer =*/ nullptr, /*.no_alloc =*/ true, }; diff --git a/src/llama-memory-recurrent.cpp b/src/llama-memory-recurrent.cpp index 08716ed91aed1..5bce436e0ee7e 100644 --- a/src/llama-memory-recurrent.cpp +++ b/src/llama-memory-recurrent.cpp @@ -39,7 +39,7 @@ llama_memory_recurrent::llama_memory_recurrent( if (it == ctx_map.end()) { ggml_init_params params = { /*.mem_size =*/ size_t(2u*n_layer*ggml_tensor_overhead()), - /*.mem_buffer =*/ NULL, + /*.mem_buffer =*/ nullptr, /*.no_alloc =*/ true, }; diff --git a/src/llama-mmap.cpp b/src/llama-mmap.cpp index 47497cf953fd3..01d3b483ad563 100644 --- a/src/llama-mmap.cpp +++ b/src/llama-mmap.cpp @@ -44,7 +44,7 @@ static std::string llama_format_win_err(DWORD err) { LPSTR buf; size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, - NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL); + nullptr, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, nullptr); if (!size) { return "FormatMessageA failed"; } @@ -61,9 +61,9 @@ struct llama_file::impl { HANDLE fp_win32; std::string GetErrorMessageWin32(DWORD error_code) const { std::string ret; - LPSTR lpMsgBuf = NULL; + LPSTR lpMsgBuf = nullptr; DWORD bufLen = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, - NULL, error_code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&lpMsgBuf, 0, NULL); + nullptr, error_code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&lpMsgBuf, 0, nullptr); if (!bufLen) { ret = format("Win32 error code: %lx", error_code); } else { @@ -76,7 +76,7 @@ struct llama_file::impl { impl(const char * fname, const char * mode) { fp = ggml_fopen(fname, mode); - if (fp == NULL) { + if (fp == nullptr) { throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno))); } fp_win32 = (HANDLE) _get_osfhandle(_fileno(fp)); @@ -103,7 +103,7 @@ struct llama_file::impl { LARGE_INTEGER li; li.QuadPart = offset; - BOOL ret = SetFilePointerEx(fp_win32, li, NULL, whence); + BOOL ret = SetFilePointerEx(fp_win32, li, nullptr, whence); if (!ret) { throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()).c_str())); } @@ -114,7 +114,7 @@ struct llama_file::impl { while (bytes_read < len) { size_t chunk_size = std::min(len - bytes_read, 64*1024*1024); DWORD chunk_read = 0; - BOOL result = ReadFile(fp_win32, reinterpret_cast(ptr) + bytes_read, chunk_size, &chunk_read, NULL); + BOOL result = ReadFile(fp_win32, reinterpret_cast(ptr) + bytes_read, chunk_size, &chunk_read, nullptr); if (!result) { throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()).c_str())); } @@ -137,7 +137,7 @@ struct llama_file::impl { while (bytes_written < len) { size_t chunk_size = std::min(len - bytes_written, 64*1024*1024); DWORD chunk_written = 0; - BOOL 
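For context, the format() helper touched just above sizes its buffer with a first vsnprintf call that writes nothing and only reports the needed length. Below is a minimal standalone sketch of the same idiom; format_string() is a hypothetical name, not the helper from llama-impl.cpp.

#include <cstdarg>
#include <cstdio>
#include <string>
#include <vector>

static std::string format_string(const char * fmt, ...) {
    va_list ap;
    va_list ap2;
    va_start(ap, fmt);
    va_copy(ap2, ap);
    const int size = vsnprintf(nullptr, 0, fmt, ap);  // measure only
    std::string out;
    if (size >= 0) {
        std::vector<char> buf(size + 1);
        vsnprintf(buf.data(), buf.size(), fmt, ap2);  // actually write
        out.assign(buf.data(), size);
    }
    va_end(ap2);
    va_end(ap);
    return out;
}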
result = WriteFile(fp_win32, reinterpret_cast(ptr) + bytes_written, chunk_size, &chunk_written, NULL); + BOOL result = WriteFile(fp_win32, reinterpret_cast(ptr) + bytes_written, chunk_size, &chunk_written, nullptr); if (!result) { throw std::runtime_error(format("write error: %s", GetErrorMessageWin32(GetLastError()).c_str())); } @@ -161,7 +161,7 @@ struct llama_file::impl { #else impl(const char * fname, const char * mode) { fp = ggml_fopen(fname, mode); - if (fp == NULL) { + if (fp == nullptr) { throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno))); } seek(0, SEEK_END); @@ -285,7 +285,7 @@ struct llama_mmap::impl { } if (prefetch) { flags |= MAP_POPULATE; } #endif - addr = mmap(NULL, file->size(), PROT_READ, flags, fd, 0); + addr = mmap(nullptr, file->size(), PROT_READ, flags, fd, 0); if (addr == MAP_FAILED) { throw std::runtime_error(format("mmap failed: %s", strerror(errno))); } @@ -369,9 +369,9 @@ struct llama_mmap::impl { HANDLE hFile = (HANDLE) _get_osfhandle(file->file_id()); - HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL); + HANDLE hMapping = CreateFileMappingA(hFile, nullptr, PAGE_READONLY, 0, 0, nullptr); - if (hMapping == NULL) { + if (hMapping == nullptr) { DWORD error = GetLastError(); throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str())); } @@ -380,7 +380,7 @@ struct llama_mmap::impl { DWORD error = GetLastError(); CloseHandle(hMapping); - if (addr == NULL) { + if (addr == nullptr) { throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str())); } @@ -554,10 +554,10 @@ struct llama_mlock::impl { static void raw_unlock(const void * addr, size_t len) {} #endif - impl() : addr(NULL), size(0), failed_already(false) {} + impl() : addr(nullptr), size(0), failed_already(false) {} void init(void * ptr) { - GGML_ASSERT(addr == NULL && size == 0); + GGML_ASSERT(addr == nullptr && size == 0); addr = ptr; } diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 8182a9adf53a6..74fc5b3d7e96d 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -487,7 +487,7 @@ llama_model_loader::llama_model_loader( tensor_buft_overrides = param_tensor_buft_overrides_p; // Load the main GGUF - struct ggml_context * ctx = NULL; + struct ggml_context * ctx = nullptr; struct gguf_init_params params = { /*.no_alloc = */ true, /*.ctx = */ &ctx, @@ -760,9 +760,9 @@ struct ggml_tensor * llama_model_loader::require_tensor_meta(const std::string & const struct ggml_tensor * llama_model_loader::check_tensor_dims(const std::string & name, const std::vector & ne, bool required) const { const struct ggml_tensor * cur = get_tensor_meta(name.c_str()); - if (cur == NULL) { + if (cur == nullptr) { if (!required) { - return NULL; + return nullptr; } throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str())); } @@ -791,8 +791,8 @@ struct ggml_tensor * llama_model_loader::create_tensor(struct ggml_context * ctx LLAMA_LOG_DEBUG("%s: loading tensor %s\n", __func__, name.c_str()); const struct ggml_tensor * cur = check_tensor_dims(name, ne, !(flags & TENSOR_NOT_REQUIRED)); - if (cur == NULL) { - return NULL; + if (cur == nullptr) { + return nullptr; } bool duplicated = flags & TENSOR_DUPLICATED; @@ -813,8 +813,8 @@ struct ggml_tensor * llama_model_loader::create_tensor(struct ggml_context * ctx struct ggml_tensor * llama_model_loader::create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const 
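The llama-mmap.cpp hunks above call FormatMessageA with FORMAT_MESSAGE_ALLOCATE_BUFFER, so the system allocates the message buffer and the LPSTR is passed by address. Below is a condensed, Windows-only sketch of that helper; win32_error_string() is a hypothetical name.

#ifdef _WIN32
#include <windows.h>
#include <string>

static std::string win32_error_string(DWORD err) {
    LPSTR buf = nullptr;
    const DWORD len = FormatMessageA(
        FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
        nullptr, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR) &buf, 0, nullptr);
    if (len == 0) {
        return "FormatMessageA failed";
    }
    std::string msg(buf, len);
    LocalFree(buf);  // the buffer was allocated by FormatMessageA
    return msg;
}
#endif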
std::string & name, const std::initializer_list & ne, size_t offset, bool required) { const struct ggml_tensor * cur = check_tensor_dims(name, ne, required); - if (cur == NULL) { - return NULL; + if (cur == nullptr) { + return nullptr; } if (cur->type != base->type) { @@ -1017,7 +1017,7 @@ bool llama_model_loader::load_all_data( ggml_backend_name(upload_backend)); } - for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) { + for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != nullptr; cur = ggml_get_next_tensor(ctx, cur)) { const auto * weight = get_weight(ggml_get_name(cur)); if (weight == nullptr) { // this can happen with split experts models diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 1813f06d7b308..24e74994b860f 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -161,7 +161,7 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w ggml_init_params params = { /*.mem_size =*/ ggml_tensor_overhead()*8, - /*.mem_buffer =*/ NULL, + /*.mem_buffer =*/ nullptr, /*.no_alloc =*/ true, }; ggml_context_ptr ctx_ptr { ggml_init(params) }; @@ -2055,7 +2055,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { if (it == ctx_map.end()) { ggml_init_params params = { /*.mem_size =*/ ctx_size, - /*.mem_buffer =*/ NULL, + /*.mem_buffer =*/ nullptr, /*.no_alloc =*/ true, }; @@ -2259,8 +2259,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); - // if output is NULL, init from the input tok embed - if (output == NULL) { + // if output is nullptr, init from the input tok embed + if (output == nullptr) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); } @@ -2322,8 +2322,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0); output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED); - // if output is NULL, init from the input tok embed - if (output == NULL) { + // if output is nullptr, init from the input tok embed + if (output == nullptr) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED); } @@ -2369,8 +2369,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); - // if output is NULL, init from the input tok embed - if (output == NULL) { + // if output is nullptr, init from the input tok embed + if (output == nullptr) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); } @@ -2419,8 +2419,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); - // if output is NULL, init from the input tok embed - if (output == NULL) { + // if output is nullptr, init from the input tok embed + if (output == nullptr) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); } @@ -2490,8 +2490,8 @@ bool 
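The load_tensors hunks above and below all follow the same optional-tensor pattern: the output projection is requested as not required, and when it comes back nullptr the token embedding is reused instead (tied embeddings). Below is a schematic sketch of that fallback; tensor, tensor_map and find_tensor() are hypothetical stand-ins, not the loader's real API.

#include <string>
#include <unordered_map>

struct tensor { /* weights, shape, ... */ };
using tensor_map = std::unordered_map<std::string, tensor>;

// returns nullptr when the named tensor is absent from the file
static tensor * find_tensor(tensor_map & tensors, const std::string & name) {
    auto it = tensors.find(name);
    return it == tensors.end() ? nullptr : &it->second;
}

static tensor * resolve_output(tensor_map & tensors) {
    tensor * output = find_tensor(tensors, "output.weight");
    if (output == nullptr) {
        // tied embeddings: fall back to the token embedding matrix
        output = find_tensor(tensors, "token_embd.weight");
    }
    return output;
}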
llama_model::load_tensors(llama_model_loader & ml) { output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); - // if output is NULL, init from the input tok embed - if (output == NULL) { + // if output is nullptr, init from the input tok embed + if (output == nullptr) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); } @@ -2532,8 +2532,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); - // if output is NULL, init from the input tok embed - if (output == NULL) { + // if output is nullptr, init from the input tok embed + if (output == nullptr) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); } @@ -2829,8 +2829,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0); output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); - // if output is NULL, init from the input tok embed - if (output == NULL) { + // if output is nullptr, init from the input tok embed + if (output == nullptr) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); } @@ -2973,8 +2973,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, TENSOR_NOT_REQUIRED); - // if output is NULL, init from the input tok embed - if (output == NULL) { + // if output is nullptr, init from the input tok embed + if (output == nullptr) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); } @@ -3057,8 +3057,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // output output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); - // if output is NULL, init from the input tok embed - if (output == NULL) { + // if output is nullptr, init from the input tok embed + if (output == nullptr) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); } @@ -3088,8 +3088,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // output output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); - // if output is NULL, init from the input tok embed - if (output == NULL) { + // if output is nullptr, init from the input tok embed + if (output == nullptr) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); } @@ -3173,8 +3173,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0); output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); - // if output is NULL, init from the input tok embed - if (output == NULL) { + // if output 
is nullptr, init from the input tok embed + if (output == nullptr) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); } @@ -3280,8 +3280,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // output output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); - // if output is NULL, init from the input tok embed - if (output == NULL) { + // if output is nullptr, init from the input tok embed + if (output == nullptr) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); } @@ -3339,8 +3339,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0); output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); - // if output is NULL, init from the input tok embed - if (output == NULL) { + // if output is nullptr, init from the input tok embed + if (output == nullptr) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); } @@ -3370,8 +3370,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { { tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); - // if tok embd is NULL, init from output - if (tok_embd == NULL) { + // if tok embd is nullptr, init from output + if (tok_embd == nullptr) { tok_embd = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); } @@ -3512,8 +3512,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); - // if output is NULL, init from the input tok embed - if (output == NULL) { + // if output is nullptr, init from the input tok embed + if (output == nullptr) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); } @@ -3545,8 +3545,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { const int64_t n_embd_altup = hparams.n_embd_altup; output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); - // if output is NULL, init from the input tok embed - if (output == NULL) { + // if output is nullptr, init from the input tok embed + if (output == nullptr) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); } @@ -3603,8 +3603,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0); output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); - // if output is NULL, init from the input tok embed - if (output == NULL) { + // if output is nullptr, init from the input tok embed + if (output == nullptr) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); } @@ -3654,8 +3654,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); - // if output is NULL, init from the input tok embed, duplicated to allow offloading - if (output == NULL) { + // if 
output is nullptr, init from the input tok embed, duplicated to allow offloading + if (output == nullptr) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); } @@ -3702,8 +3702,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); - // if output is NULL, init from the input tok embed, duplicated to allow offloading - if (output == NULL) { + // if output is nullptr, init from the input tok embed, duplicated to allow offloading + if (output == nullptr) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); } } @@ -3748,8 +3748,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); - // if output is NULL, init from the input tok embed, duplicated to allow offloading - if (output == NULL) { + // if output is nullptr, init from the input tok embed, duplicated to allow offloading + if (output == nullptr) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); } } @@ -3833,8 +3833,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { { output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); - // if output is NULL, init from the input tok embed, duplicated to allow offloading - if (output == NULL) { + // if output is nullptr, init from the input tok embed, duplicated to allow offloading + if (output == nullptr) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); } } @@ -3989,8 +3989,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // output output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); - // if output is NULL, init from the input tok embed - if (output == NULL) { + // if output is nullptr, init from the input tok embed + if (output == nullptr) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); } @@ -4045,8 +4045,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // output output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); - // if output is NULL, init from the input tok embed - if (output == NULL) { + // if output is nullptr, init from the input tok embed + if (output == nullptr) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); } @@ -4176,8 +4176,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); - // if output is NULL, init from the input tok embed - if (output == NULL) { + // if output is nullptr, init from the input tok embed + if (output == nullptr) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); } @@ -4409,8 +4409,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { 
output_norm = create_tensor(tn(LLM_TENSOR_DEC_OUTPUT_NORM, "weight"), {n_embd}, 0); output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); - // if output is NULL, init from the input tok embed - if (output == NULL) { + // if output is nullptr, init from the input tok embed + if (output == nullptr) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); } @@ -4462,8 +4462,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // output output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0); output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); - // if output is NULL, init from the input tok embed - if (output == NULL) { + // if output is nullptr, init from the input tok embed + if (output == nullptr) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); } @@ -4525,8 +4525,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // output output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); - // if output is NULL, init from the input tok embed - if (output == NULL) { + // if output is nullptr, init from the input tok embed + if (output == nullptr) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); } @@ -4562,8 +4562,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // output output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); - // if output is NULL, init from the input tok embed - if (output == NULL) { + // if output is nullptr, init from the input tok embed + if (output == nullptr) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); } @@ -4608,8 +4608,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // output output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0); output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED); - // if output is NULL, init from the input tok embed - if (output == NULL) { + // if output is nullptr, init from the input tok embed + if (output == nullptr) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED); } @@ -4748,8 +4748,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { { output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); - // if output is NULL, init from the input tok embed, duplicated to allow offloading - if (output == NULL) { + // if output is nullptr, init from the input tok embed, duplicated to allow offloading + if (output == nullptr) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); } } @@ -4807,8 +4807,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); - // if output is NULL, init from the input tok embed - if (output == NULL) { + // if output is nullptr, init from the input tok 
embed + if (output == nullptr) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); } @@ -4837,8 +4837,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); - // if output is NULL, init from the input tok embed - if (output == NULL) { + // if output is nullptr, init from the input tok embed + if (output == nullptr) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); } @@ -4900,7 +4900,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.time_mix_lerp_r = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED); layer.time_mix_lerp_g = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_G, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED); layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, TENSOR_NOT_REQUIRED); - GGML_ASSERT(!(layer.time_mix_lerp_fused == NULL && layer.time_mix_lerp_w == NULL)); + GGML_ASSERT(!(layer.time_mix_lerp_fused == nullptr && layer.time_mix_lerp_w == nullptr)); layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}, 0); layer.time_mix_decay = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd}, 0); @@ -5124,8 +5124,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // output output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); - // if output is NULL, init from the input tok embed - if (output == NULL) { + // if output is nullptr, init from the input tok embed + if (output == nullptr) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); } @@ -5353,8 +5353,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); - // if output is NULL, init from the input tok embed - if (output == NULL) { + // if output is nullptr, init from the input tok embed + if (output == nullptr) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); } @@ -5384,8 +5384,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // output output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); - // if output is NULL, init from the input tok embed - if (output == NULL) { + // if output is nullptr, init from the input tok embed + if (output == nullptr) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); } @@ -5457,8 +5457,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hidden_size, n_vocab}, TENSOR_NOT_REQUIRED); output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {hidden_size}, 0); - // if output is NULL, init from the input tok embed - if (output == NULL) { + // if output is nullptr, init from the input tok embed + if (output == nullptr) { output = 
create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hidden_size, n_vocab}, TENSOR_DUPLICATED); } @@ -5513,8 +5513,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // output output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); - // if output is NULL, init from the input tok embed - if (output == NULL) { + // if output is nullptr, init from the input tok embed + if (output == nullptr) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); } @@ -5550,8 +5550,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // output output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); - // if output is NULL, init from the input tok embed - if (output == NULL) { + // if output is nullptr, init from the input tok embed + if (output == nullptr) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); } @@ -5584,8 +5584,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); - // if output is NULL, init from the input tok embed - if (output == NULL) { + // if output is nullptr, init from the input tok embed + if (output == nullptr) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); } @@ -5651,7 +5651,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0); output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); - if (output == NULL) { + if (output == nullptr) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); } @@ -5691,8 +5691,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0); output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); - // if output is NULL, init from the input tok embed - if (output == NULL) { + // if output is nullptr, init from the input tok embed + if (output == nullptr) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); } @@ -5758,7 +5758,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // check if it is possible to use buffer_from_host_ptr with this buffer type ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft); if (!dev) { - // FIXME: workaround for CPU backend buft having a NULL device + // FIXME: workaround for CPU backend buft having a nullptr device dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); if (!dev) { throw std::runtime_error(format("%s: no CPU backend found", __func__)); @@ -5840,7 +5840,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // populate tensors_by_name for (auto & ctx : pimpl->ctxs) { - for (auto * cur = ggml_get_first_tensor(ctx.get()); cur != NULL; cur = ggml_get_next_tensor(ctx.get(), cur)) { + for (auto * cur = ggml_get_first_tensor(ctx.get()); cur != nullptr; cur = ggml_get_next_tensor(ctx.get(), cur)) { tensors_by_name.emplace_back(ggml_get_name(cur), cur); } } @@ 
-5849,7 +5849,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { for (auto & it : ctx_bufs) { ggml_context * ctx = it.first; auto & bufs = it.second; - if (!ml.load_all_data(ctx, bufs, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) { + if (!ml.load_all_data(ctx, bufs, use_mlock ? &pimpl->mlock_mmaps : nullptr, params.progress_callback, params.progress_callback_user_data)) { return false; } } @@ -6065,7 +6065,7 @@ template static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) { ggml_init_params params = { /*.mem_size =*/ ggml_tensor_overhead()*8, - /*.mem_buffer =*/ NULL, + /*.mem_buffer =*/ nullptr, /*.no_alloc =*/ true, }; @@ -6176,7 +6176,7 @@ struct llm_build_llama : public llm_graph_context { // norm cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, + model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); @@ -6245,21 +6245,21 @@ struct llm_build_llama : public llm_graph_context { if (model.layers[il].ffn_gate_inp == nullptr) { cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, + model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); cur = build_ffn(cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, - model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, - NULL, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, nullptr, + model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, nullptr, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, nullptr, + nullptr, LLM_FFN_SILU, LLM_FFN_PAR, il); cb(cur, "ffn_out", il); } else { // MoE branch cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, + model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); @@ -6290,7 +6290,7 @@ struct llm_build_llama : public llm_graph_context { cur = inpL; cur = build_norm(cur, - model.output_norm, NULL, + model.output_norm, nullptr, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); @@ -6338,7 +6338,7 @@ struct llm_build_llama_iswa : public llm_graph_context { // norm cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, + model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); @@ -6418,20 +6418,20 @@ struct llm_build_llama_iswa : public llm_graph_context { // feed-forward network (non-MoE) if (model.layers[il].ffn_gate_inp == nullptr) { cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, + model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); cur = build_ffn(cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, - model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, - NULL, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, nullptr, + model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, nullptr, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, nullptr, + nullptr, LLM_FFN_SILU, LLM_FFN_PAR, il); cb(cur, "ffn_out", il); } else { ggml_tensor * ffn_inp_normed = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, + model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); @@ -6449,10 +6449,10 @@ struct llm_build_llama_iswa : public llm_graph_context { // Shared experts ggml_tensor * shexp_out = build_ffn(ffn_inp_normed, - model.layers[il].ffn_up_shexp, NULL, NULL, - model.layers[il].ffn_gate_shexp, NULL, NULL, - 
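In the graph-building hunks above and below, nullptr is passed for optional tensors (biases, gates, shared-expert weights) and the callee simply skips the corresponding operation. Below is a minimal sketch of that convention on top of ggml; linear() is illustrative and is not the actual build_ffn().

#include "ggml.h"

// w is required; b may be nullptr, in which case the bias add is skipped,
// the same convention build_ffn()/build_norm() use for their optional tensors
static ggml_tensor * linear(ggml_context * ctx, ggml_tensor * x, ggml_tensor * w, ggml_tensor * b) {
    ggml_tensor * cur = ggml_mul_mat(ctx, w, x);
    if (b != nullptr) {
        cur = ggml_add(ctx, cur, b);
    }
    return cur;
}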
model.layers[il].ffn_down_shexp, NULL, NULL, - NULL, + model.layers[il].ffn_up_shexp, nullptr, nullptr, + model.layers[il].ffn_gate_shexp, nullptr, nullptr, + model.layers[il].ffn_down_shexp, nullptr, nullptr, + nullptr, LLM_FFN_SILU, LLM_FFN_PAR, il); cb(shexp_out, "ffn_moe_shexp", il); @@ -6473,7 +6473,7 @@ struct llm_build_llama_iswa : public llm_graph_context { cur = inpL; cur = build_norm(cur, - model.output_norm, NULL, + model.output_norm, nullptr, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); @@ -6522,7 +6522,7 @@ struct llm_build_deci : public llm_graph_context { } else { // norm cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, + model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); } @@ -6603,15 +6603,15 @@ struct llm_build_deci : public llm_graph_context { // feed-forward network if (model.layers[il].ffn_gate_inp == nullptr) { cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, + model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); cur = build_ffn(cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, - model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, - NULL, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, nullptr, + model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, nullptr, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, nullptr, + nullptr, LLM_FFN_SILU, LLM_FFN_PAR, il); cb(cur, "ffn_out", il); } @@ -6629,7 +6629,7 @@ struct llm_build_deci : public llm_graph_context { cur = inpL; cur = build_norm(cur, - model.output_norm, NULL, + model.output_norm, nullptr, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); @@ -6668,7 +6668,7 @@ struct llm_build_baichuan : public llm_graph_context { ggml_tensor * inpSA = inpL; cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, + model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); @@ -6711,7 +6711,7 @@ struct llm_build_baichuan : public llm_graph_context { cb(Vcur, "Vcur", il); cur = build_attn(inp_attn, - model.layers[il].wo, NULL, + model.layers[il].wo, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -6726,15 +6726,15 @@ struct llm_build_baichuan : public llm_graph_context { // feed-forward network { cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, + model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, + model.layers[il].ffn_up, nullptr, nullptr, + model.layers[il].ffn_gate, nullptr, nullptr, + model.layers[il].ffn_down, nullptr, nullptr, + nullptr, LLM_FFN_SILU, LLM_FFN_PAR, il); cb(cur, "ffn_out", il); } @@ -6751,7 +6751,7 @@ struct llm_build_baichuan : public llm_graph_context { cur = inpL; cur = build_norm(cur, - model.output_norm, NULL, + model.output_norm, nullptr, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); @@ -6790,7 +6790,7 @@ struct llm_build_xverse : public llm_graph_context { ggml_tensor * inpSA = inpL; cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, + model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); @@ -6826,7 +6826,7 @@ struct llm_build_xverse : public llm_graph_context { cb(Vcur, "Vcur", il); cur = build_attn(inp_attn, - model.layers[il].wo, NULL, + model.layers[il].wo, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 
1.0f/sqrtf(float(n_embd_head)), il); } @@ -6841,15 +6841,15 @@ struct llm_build_xverse : public llm_graph_context { // feed-forward network { cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, + model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, + model.layers[il].ffn_up, nullptr, nullptr, + model.layers[il].ffn_gate, nullptr, nullptr, + model.layers[il].ffn_down, nullptr, nullptr, + nullptr, LLM_FFN_SILU, LLM_FFN_PAR, il); cb(cur, "ffn_out", il); } @@ -6865,7 +6865,7 @@ struct llm_build_xverse : public llm_graph_context { cur = inpL; - cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); res->t_embd = cur; @@ -6949,7 +6949,7 @@ struct llm_build_falcon : public llm_graph_context { cb(Vcur, "Vcur", il); cur = build_attn(inp_attn, - model.layers[il].wo, NULL, + model.layers[il].wo, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -6964,10 +6964,10 @@ struct llm_build_falcon : public llm_graph_context { // feed forward { cur = build_ffn(attn_norm, // !! use the attn norm, not the result - model.layers[il].ffn_up, NULL, NULL, - NULL, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, + model.layers[il].ffn_up, nullptr, nullptr, + nullptr, nullptr, nullptr, + model.layers[il].ffn_down, nullptr, nullptr, + nullptr, LLM_FFN_GELU, LLM_FFN_SEQ, il); cb(cur, "ffn_out", il); } @@ -7029,7 +7029,7 @@ struct llm_build_grok : public llm_graph_context { // norm cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, + model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); @@ -7092,7 +7092,7 @@ struct llm_build_grok : public llm_graph_context { // if attn_out_norm is present then apply it before adding the input if (model.layers[il].attn_out_norm) { cur = build_norm(cur, - model.layers[il].attn_out_norm, NULL, + model.layers[il].attn_out_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_out_norm", il); } @@ -7103,7 +7103,7 @@ struct llm_build_grok : public llm_graph_context { // feed-forward network // MoE branch cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, + model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); @@ -7125,7 +7125,7 @@ struct llm_build_grok : public llm_graph_context { // Idea: maybe ffn_out_norm is a better name if (model.layers[il].layer_out_norm) { cur = build_norm(cur, - model.layers[il].layer_out_norm, NULL, + model.layers[il].layer_out_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "layer_out_norm", il); } @@ -7143,7 +7143,7 @@ struct llm_build_grok : public llm_graph_context { cur = inpL; cur = build_norm(cur, - model.output_norm, NULL, + model.output_norm, nullptr, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); @@ -7189,7 +7189,7 @@ struct llm_build_dbrx : public llm_graph_context { // norm cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, + model.layers[il].attn_norm, nullptr, LLM_NORM, il); cb(cur, "attn_norm", il); @@ -7228,7 +7228,7 @@ struct llm_build_dbrx : public llm_graph_context { cb(Vcur, "Vcur", il); cur = build_attn(inp_attn, - model.layers[il].wo, NULL, + model.layers[il].wo, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -7243,7 +7243,7 @@ struct llm_build_dbrx : public llm_graph_context { // 
feed-forward network // MoE branch cur = build_norm(ffn_inp, - model.layers[il].attn_out_norm, NULL, + model.layers[il].attn_out_norm, nullptr, LLM_NORM, il); cb(cur, "attn_out_norm", il); @@ -7273,7 +7273,7 @@ struct llm_build_dbrx : public llm_graph_context { cur = inpL; cur = build_norm(cur, - model.output_norm, NULL, + model.output_norm, nullptr, LLM_NORM, -1); cb(cur, "result_norm", -1); @@ -7364,10 +7364,10 @@ struct llm_build_starcoder : public llm_graph_context { cb(cur, "ffn_norm", il); cur = build_ffn(cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, - NULL, NULL, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, - NULL, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, nullptr, + nullptr, nullptr, nullptr, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, nullptr, + nullptr, LLM_FFN_GELU, LLM_FFN_SEQ, il); cb(cur, "ffn_out", il); } @@ -7417,7 +7417,7 @@ struct llm_build_refact : public llm_graph_context { ggml_tensor * inpSA = inpL; cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, + model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); @@ -7441,7 +7441,7 @@ struct llm_build_refact : public llm_graph_context { cb(Vcur, "Vcur", il); cur = build_attn(inp_attn, - model.layers[il].wo, NULL, + model.layers[il].wo, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -7456,15 +7456,15 @@ struct llm_build_refact : public llm_graph_context { // feed-forward network { cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, + model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, + model.layers[il].ffn_up, nullptr, nullptr, + model.layers[il].ffn_gate, nullptr, nullptr, + model.layers[il].ffn_down, nullptr, nullptr, + nullptr, LLM_FFN_SILU, LLM_FFN_PAR, il); cb(cur, "ffn_out", il); } @@ -7481,7 +7481,7 @@ struct llm_build_refact : public llm_graph_context { cur = inpL; cur = build_norm(cur, - model.output_norm, NULL, + model.output_norm, nullptr, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); @@ -7646,26 +7646,26 @@ struct llm_build_bert : public llm_graph_context { cb(cur, "ffn_moe_out", il); } else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE || model.arch == LLM_ARCH_JINA_BERT_V3) { cur = build_ffn(cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, - NULL, NULL, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, - NULL, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, nullptr, + nullptr, nullptr, nullptr, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, nullptr, + nullptr, LLM_FFN_GELU, LLM_FFN_SEQ, il); cb(cur, "ffn_out", il); } else if (model.arch == LLM_ARCH_JINA_BERT_V2) { cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, - NULL, + model.layers[il].ffn_up, nullptr, nullptr, + model.layers[il].ffn_gate, nullptr, nullptr, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, nullptr, + nullptr, model.layers[il].ffn_gate ? 
LLM_FFN_GELU : LLM_FFN_GEGLU, LLM_FFN_PAR, il); cb(cur, "ffn_out", il); } else { cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, + model.layers[il].ffn_up, nullptr, nullptr, + model.layers[il].ffn_gate, nullptr, nullptr, + model.layers[il].ffn_down, nullptr, nullptr, + nullptr, LLM_FFN_SILU, LLM_FFN_PAR, il); cb(cur, "ffn_out", il); } @@ -7713,7 +7713,7 @@ struct llm_build_neo_bert : public llm_graph_context { // pre-norm cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, + model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); { @@ -7767,16 +7767,16 @@ struct llm_build_neo_bert : public llm_graph_context { // pre-norm cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, + model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); // feed-forward network cur = build_ffn(cur, model.layers[il].ffn_up, - NULL, NULL, NULL, NULL, NULL, + nullptr, nullptr, nullptr, nullptr, nullptr, model.layers[il].ffn_down, - NULL, NULL, NULL, + nullptr, nullptr, nullptr, LLM_FFN_SWIGLU, LLM_FFN_SEQ, il); // attentions bypass the intermediate layer @@ -7789,7 +7789,7 @@ struct llm_build_neo_bert : public llm_graph_context { cur = inpL; cur = build_norm(cur, - model.output_norm_enc, NULL, + model.output_norm_enc, nullptr, LLM_NORM_RMS, -1); cb(cur, "result_embd", -1); @@ -7871,10 +7871,10 @@ struct llm_build_bloom : public llm_graph_context { cb(cur, "ffn_norm", il); cur = build_ffn(cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, - NULL, NULL, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, - NULL, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, nullptr, + nullptr, nullptr, nullptr, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, nullptr, + nullptr, LLM_FFN_GELU, LLM_FFN_SEQ, il); cb(cur, "ffn_out", il); } @@ -8018,9 +8018,9 @@ struct llm_build_mpt : public llm_graph_context { LLM_NORM, il); cb(cur, "ffn_norm", il); cur = build_ffn(cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, - NULL, NULL, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, nullptr, + nullptr, nullptr, nullptr, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, nullptr, model.layers[il].ffn_act, LLM_FFN_GELU, LLM_FFN_SEQ, il); cb(cur, "ffn_out", il); @@ -8113,7 +8113,7 @@ struct llm_build_stablelm : public llm_graph_context { if (model.layers[il].attn_q_norm) { Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, - NULL, + nullptr, LLM_NORM, il); cb(Qcur, "Qcur", il); } @@ -8121,7 +8121,7 @@ struct llm_build_stablelm : public llm_graph_context { if (model.layers[il].attn_k_norm) { Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, - NULL, + nullptr, LLM_NORM, il); cb(Kcur, "Kcur", il); } @@ -8143,7 +8143,7 @@ struct llm_build_stablelm : public llm_graph_context { cb(Vcur, "Vcur", il); cur = build_attn(inp_attn, - model.layers[il].wo, NULL, + model.layers[il].wo, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -8169,10 +8169,10 @@ struct llm_build_stablelm : public llm_graph_context { cur = inpSA; } cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, + model.layers[il].ffn_up, nullptr, nullptr, + model.layers[il].ffn_gate, nullptr, nullptr, + model.layers[il].ffn_down, nullptr, nullptr, + nullptr, 
LLM_FFN_SILU, LLM_FFN_PAR, il); cb(cur, "ffn_out", il); } @@ -8228,7 +8228,7 @@ struct llm_build_qwen : public llm_graph_context { ggml_tensor * inpSA = inpL; cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, + model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); @@ -8264,7 +8264,7 @@ struct llm_build_qwen : public llm_graph_context { cb(Vcur, "Vcur", il); cur = build_attn(inp_attn, - model.layers[il].wo, NULL, + model.layers[il].wo, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -8279,15 +8279,15 @@ struct llm_build_qwen : public llm_graph_context { // feed-forward forward { cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, + model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, + model.layers[il].ffn_up, nullptr, nullptr, + model.layers[il].ffn_gate, nullptr, nullptr, + model.layers[il].ffn_down, nullptr, nullptr, + nullptr, LLM_FFN_SILU, LLM_FFN_PAR, il); cb(cur, "ffn_out", il); } @@ -8304,7 +8304,7 @@ struct llm_build_qwen : public llm_graph_context { cur = inpL; cur = build_norm(cur, - model.output_norm, NULL, + model.output_norm, nullptr, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); @@ -8344,7 +8344,7 @@ struct llm_build_qwen2 : public llm_graph_context { // norm cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, + model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); @@ -8398,15 +8398,15 @@ struct llm_build_qwen2 : public llm_graph_context { // feed-forward network cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, + model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, + model.layers[il].ffn_up, nullptr, nullptr, + model.layers[il].ffn_gate, nullptr, nullptr, + model.layers[il].ffn_down, nullptr, nullptr, + nullptr, LLM_FFN_SILU, LLM_FFN_PAR, il); cb(cur, "ffn_out", il); @@ -8422,7 +8422,7 @@ struct llm_build_qwen2 : public llm_graph_context { cur = inpL; cur = build_norm(cur, - model.output_norm, NULL, + model.output_norm, nullptr, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); @@ -8467,7 +8467,7 @@ struct llm_build_dream : public llm_graph_context { ggml_tensor * inpSA = inpL; // norm - cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il); + cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention @@ -8513,11 +8513,11 @@ struct llm_build_dream : public llm_graph_context { cb(ffn_inp, "ffn_inp", il); // feed-forward network - cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il); + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, il); + cur = build_ffn(cur, model.layers[il].ffn_up, nullptr, nullptr, model.layers[il].ffn_gate, nullptr, nullptr, + model.layers[il].ffn_down, nullptr, nullptr, nullptr, LLM_FFN_SILU, LLM_FFN_PAR, il); cb(cur, "ffn_out", il); cur = ggml_add(ctx0, cur, ffn_inp); @@ -8531,7 +8531,7 @@ struct llm_build_dream : public 
llm_graph_context { cur = inpL; - cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); res->t_embd = cur; @@ -8572,7 +8572,7 @@ struct llm_build_llada : public llm_graph_context { ggml_tensor * inpSA = inpL; // norm - cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il); + cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention @@ -8601,7 +8601,7 @@ struct llm_build_llada : public llm_graph_context { cb(Vcur, "Vcur", il); cur = build_attn(inp_attn, - model.layers[il].wo, NULL, + model.layers[il].wo, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il); } @@ -8614,11 +8614,11 @@ struct llm_build_llada : public llm_graph_context { cb(ffn_inp, "ffn_inp", il); // feed-forward network - cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il); + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, il); + cur = build_ffn(cur, model.layers[il].ffn_up, nullptr, nullptr, model.layers[il].ffn_gate, nullptr, nullptr, + model.layers[il].ffn_down, nullptr, nullptr, nullptr, LLM_FFN_SILU, LLM_FFN_PAR, il); cb(cur, "ffn_out", il); cur = ggml_add(ctx0, cur, ffn_inp); @@ -8632,7 +8632,7 @@ struct llm_build_llada : public llm_graph_context { cur = inpL; - cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); res->t_embd = cur; @@ -8674,7 +8674,7 @@ struct llm_build_qwen2vl : public llm_graph_context { // norm cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, + model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); @@ -8728,15 +8728,15 @@ struct llm_build_qwen2vl : public llm_graph_context { // feed-forward network cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, + model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, + model.layers[il].ffn_up, nullptr, nullptr, + model.layers[il].ffn_gate, nullptr, nullptr, + model.layers[il].ffn_down, nullptr, nullptr, + nullptr, LLM_FFN_SILU, LLM_FFN_PAR, il); cb(cur, "ffn_out", il); @@ -8752,7 +8752,7 @@ struct llm_build_qwen2vl : public llm_graph_context { cur = inpL; cur = build_norm(cur, - model.output_norm, NULL, + model.output_norm, nullptr, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); @@ -8792,7 +8792,7 @@ struct llm_build_qwen2moe : public llm_graph_context { // norm cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, + model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); @@ -8855,7 +8855,7 @@ struct llm_build_qwen2moe : public llm_graph_context { // MoE branch cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, + model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); @@ -8883,10 +8883,10 @@ struct llm_build_qwen2moe : public llm_graph_context { cb(cur_gate, "ffn_shexp_gate", il); ggml_tensor * cur_ffn = build_ffn(cur, - model.layers[il].ffn_up_shexp, NULL, NULL, - 
model.layers[il].ffn_gate_shexp, NULL, NULL, - model.layers[il].ffn_down_shexp, NULL, NULL, - NULL, + model.layers[il].ffn_up_shexp, nullptr, nullptr, + model.layers[il].ffn_gate_shexp, nullptr, nullptr, + model.layers[il].ffn_down_shexp, nullptr, nullptr, + nullptr, LLM_FFN_SILU, LLM_FFN_PAR, il); cb(cur_ffn, "ffn_shexp", il); @@ -8911,7 +8911,7 @@ struct llm_build_qwen2moe : public llm_graph_context { cur = inpL; cur = build_norm(cur, - model.output_norm, NULL, + model.output_norm, nullptr, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); @@ -8951,7 +8951,7 @@ struct llm_build_qwen3 : public llm_graph_context { // norm cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, + model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); @@ -8971,7 +8971,7 @@ struct llm_build_qwen3 : public llm_graph_context { Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); - Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il); + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, il); cb(Qcur, "Qcur_normed", il); Qcur = ggml_rope_ext( @@ -8980,7 +8980,7 @@ struct llm_build_qwen3 : public llm_graph_context { ext_factor, attn_factor, beta_fast, beta_slow ); - Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il); + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, nullptr, LLM_NORM_RMS, il); cb(Kcur, "Kcur_normed", il); Kcur = ggml_rope_ext( @@ -9008,15 +9008,15 @@ struct llm_build_qwen3 : public llm_graph_context { // feed-forward network cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, + model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, + model.layers[il].ffn_up, nullptr, nullptr, + model.layers[il].ffn_gate, nullptr, nullptr, + model.layers[il].ffn_down, nullptr, nullptr, + nullptr, LLM_FFN_SILU, LLM_FFN_PAR, il); cb(cur, "ffn_out", il); @@ -9032,7 +9032,7 @@ struct llm_build_qwen3 : public llm_graph_context { cur = inpL; cur = build_norm(cur, - model.output_norm, NULL, + model.output_norm, nullptr, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); @@ -9072,7 +9072,7 @@ struct llm_build_qwen3moe : public llm_graph_context { // norm cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, + model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); @@ -9092,7 +9092,7 @@ struct llm_build_qwen3moe : public llm_graph_context { Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); - Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il); + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, il); cb(Qcur, "Qcur_normed", il); Qcur = ggml_rope_ext( @@ -9101,7 +9101,7 @@ struct llm_build_qwen3moe : public llm_graph_context { ext_factor, attn_factor, beta_fast, beta_slow ); - Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il); + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, nullptr, LLM_NORM_RMS, il); cb(Kcur, "Kcur_normed", il); Kcur = ggml_rope_ext( @@ -9129,7 +9129,7 @@ struct llm_build_qwen3moe : public llm_graph_context { // MoE branch cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, + model.layers[il].ffn_norm, nullptr, 
LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); @@ -9160,7 +9160,7 @@ struct llm_build_qwen3moe : public llm_graph_context { cur = inpL; cur = build_norm(cur, - model.output_norm, NULL, + model.output_norm, nullptr, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); @@ -9268,10 +9268,10 @@ struct llm_build_phi2 : public llm_graph_context { // FF { ffn_output = build_ffn(attn_norm_output, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, - NULL, NULL, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, - NULL, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, nullptr, + nullptr, nullptr, nullptr, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, nullptr, + nullptr, LLM_FFN_GELU, LLM_FFN_SEQ, il); cb(ffn_output, "ffn_out", il); } @@ -9412,10 +9412,10 @@ struct llm_build_phi3 : public llm_graph_context { // feed-forward network if (model.layers[il].ffn_gate_inp == nullptr) { cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - NULL, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, + model.layers[il].ffn_up, nullptr, nullptr, + nullptr, nullptr, nullptr, + model.layers[il].ffn_down, nullptr, nullptr, + nullptr, LLM_FFN_SWIGLU, LLM_FFN_SEQ, il); cb(cur, "ffn_out", il); } else { @@ -9487,7 +9487,7 @@ struct llm_build_plamo : public llm_graph_context { for (int il = 0; il < n_layer; ++il) { // norm cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, + model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); @@ -9526,7 +9526,7 @@ struct llm_build_plamo : public llm_graph_context { cb(Vcur, "Vcur", il); cur = build_attn(inp_attn, - model.layers[il].wo, NULL, + model.layers[il].wo, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -9543,10 +9543,10 @@ struct llm_build_plamo : public llm_graph_context { // feed-forward network { cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, + model.layers[il].ffn_up, nullptr, nullptr, + model.layers[il].ffn_gate, nullptr, nullptr, + model.layers[il].ffn_down, nullptr, nullptr, + nullptr, LLM_FFN_SILU, LLM_FFN_PAR, il); cb(cur, "ffn_out", il); } @@ -9564,7 +9564,7 @@ struct llm_build_plamo : public llm_graph_context { cur = inpL; cur = build_norm(cur, - model.output_norm, NULL, + model.output_norm, nullptr, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); @@ -9656,10 +9656,10 @@ struct llm_build_gpt2 : public llm_graph_context { cb(cur, "ffn_norm", il); cur = build_ffn(cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, - NULL, NULL, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, - NULL, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, nullptr, + nullptr, nullptr, nullptr, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, nullptr, + nullptr, LLM_FFN_GELU, LLM_FFN_SEQ, il); cb(cur, "ffn_out", il); } @@ -9770,10 +9770,10 @@ struct llm_build_codeshell : public llm_graph_context { cb(cur, "ffn_norm", il); cur = build_ffn(cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, - NULL, NULL, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, - NULL, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, nullptr, + nullptr, nullptr, nullptr, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, nullptr, + nullptr, LLM_FFN_GELU, LLM_FFN_SEQ, il); cb(cur, "ffn_out", il); } @@ -9877,7 +9877,7 @@ struct llm_build_orion : public llm_graph_context { cb(Vcur, "Vcur", il); 
cur = build_attn(inp_attn, - model.layers[il].wo, NULL, + model.layers[il].wo, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -9896,10 +9896,10 @@ struct llm_build_orion : public llm_graph_context { cb(cur, "ffn_norm", il); cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, + model.layers[il].ffn_up, nullptr, nullptr, + model.layers[il].ffn_gate, nullptr, nullptr, + model.layers[il].ffn_down, nullptr, nullptr, + nullptr, LLM_FFN_SILU, LLM_FFN_PAR, il); cb(cur, "ffn_out", il); @@ -9955,7 +9955,7 @@ struct llm_build_internlm2 : public llm_graph_context { // norm cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, + model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); @@ -10018,15 +10018,15 @@ struct llm_build_internlm2 : public llm_graph_context { // feed-forward network cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, + model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, + model.layers[il].ffn_up, nullptr, nullptr, + model.layers[il].ffn_gate, nullptr, nullptr, + model.layers[il].ffn_down, nullptr, nullptr, + nullptr, LLM_FFN_SILU, LLM_FFN_PAR, il); cb(cur, "ffn_out", il); @@ -10042,7 +10042,7 @@ struct llm_build_internlm2 : public llm_graph_context { cur = inpL; cur = build_norm(cur, - model.output_norm, NULL, + model.output_norm, nullptr, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); @@ -10093,19 +10093,19 @@ struct llm_build_minicpm3 : public llm_graph_context { // norm cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, + model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self_attention { - ggml_tensor * q = NULL; + ggml_tensor * q = nullptr; // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens} q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur); cb(q, "q", il); q = build_norm(q, - model.layers[il].attn_q_a_norm, NULL, + model.layers[il].attn_q_a_norm, nullptr, LLM_NORM_RMS, il); cb(q, "q", il); @@ -10145,7 +10145,7 @@ struct llm_build_minicpm3 : public llm_graph_context { cb(k_pe, "k_pe", il); kv_compressed = build_norm(kv_compressed, - model.layers[il].attn_kv_a_norm, NULL, + model.layers[il].attn_kv_a_norm, nullptr, LLM_NORM_RMS, il); cb(kv_compressed, "kv_compressed", il); @@ -10192,7 +10192,7 @@ struct llm_build_minicpm3 : public llm_graph_context { cb(k_states, "k_states", il); cur = build_attn(inp_attn, - model.layers[il].wo, NULL, + model.layers[il].wo, nullptr, q_states, k_states, v_states, nullptr, nullptr, nullptr, kq_scale, il); } @@ -10212,15 +10212,15 @@ struct llm_build_minicpm3 : public llm_graph_context { // feed-forward network { cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, + model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, + model.layers[il].ffn_up, nullptr, nullptr, + model.layers[il].ffn_gate, nullptr, nullptr, + model.layers[il].ffn_down, nullptr, nullptr, + nullptr, LLM_FFN_SILU, LLM_FFN_PAR, il); cb(cur, "ffn_out", il); } @@ -10241,7 +10241,7 @@ struct llm_build_minicpm3 : public llm_graph_context { cur = inpL; cur = build_norm(cur, - 
model.output_norm, NULL, + model.output_norm, nullptr, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); @@ -10284,7 +10284,7 @@ struct llm_build_gemma : public llm_graph_context { for (int il = 0; il < n_layer; ++il) { // norm cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, + model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); @@ -10322,7 +10322,7 @@ struct llm_build_gemma : public llm_graph_context { cb(Qcur, "Qcur_scaled", il); cur = build_attn(inp_attn, - model.layers[il].wo, NULL, + model.layers[il].wo, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il); } @@ -10335,17 +10335,17 @@ struct llm_build_gemma : public llm_graph_context { cb(sa_out, "sa_out", il); cur = build_norm(sa_out, - model.layers[il].ffn_norm, NULL, + model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); // feed-forward network { cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, + model.layers[il].ffn_up, nullptr, nullptr, + model.layers[il].ffn_gate, nullptr, nullptr, + model.layers[il].ffn_down, nullptr, nullptr, + nullptr, LLM_FFN_GELU, LLM_FFN_PAR, il); cb(cur, "ffn_out", il); } @@ -10362,7 +10362,7 @@ struct llm_build_gemma : public llm_graph_context { cur = inpL; cur = build_norm(cur, - model.output_norm, NULL, + model.output_norm, nullptr, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); @@ -10400,7 +10400,7 @@ struct llm_build_gemma2_iswa : public llm_graph_context { for (int il = 0; il < n_layer; ++il) { // norm cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, + model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); @@ -10437,7 +10437,7 @@ struct llm_build_gemma2_iswa : public llm_graph_context { Qcur = ggml_scale(ctx0, Qcur, hparams.f_attention_scale); cur = build_attn(inp_attn, - model.layers[il].wo, NULL, + model.layers[il].wo, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il); } @@ -10447,7 +10447,7 @@ struct llm_build_gemma2_iswa : public llm_graph_context { } cur = build_norm(cur, - model.layers[il].attn_post_norm, NULL, + model.layers[il].attn_post_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_post_norm", il); @@ -10455,23 +10455,23 @@ struct llm_build_gemma2_iswa : public llm_graph_context { cb(sa_out, "sa_out", il); cur = build_norm(sa_out, - model.layers[il].ffn_norm, NULL, + model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); // feed-forward network { cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, + model.layers[il].ffn_up, nullptr, nullptr, + model.layers[il].ffn_gate, nullptr, nullptr, + model.layers[il].ffn_down, nullptr, nullptr, + nullptr, LLM_FFN_GELU, LLM_FFN_PAR, il); cb(cur, "ffn_out", il); } cur = build_norm(cur, - model.layers[il].ffn_post_norm, NULL, + model.layers[il].ffn_post_norm, nullptr, LLM_NORM_RMS, -1); cb(cur, "ffn_post_norm", -1); @@ -10487,7 +10487,7 @@ struct llm_build_gemma2_iswa : public llm_graph_context { cur = inpL; cur = build_norm(cur, - model.output_norm, NULL, + model.output_norm, nullptr, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); @@ -10536,7 +10536,7 @@ struct llm_build_gemma3_iswa : public llm_graph_context { const float freq_scale_l = model.get_rope_freq_scale(cparams, il); // norm - cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il); + cur = build_norm(inpL, 
model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention @@ -10555,7 +10555,7 @@ struct llm_build_gemma3_iswa : public llm_graph_context { Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); - Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il); + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, il); cb(Qcur, "Qcur_normed", il); Qcur = ggml_rope_ext( @@ -10563,7 +10563,7 @@ struct llm_build_gemma3_iswa : public llm_graph_context { n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, ext_factor, attn_factor, beta_fast, beta_slow); - Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il); + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, nullptr, LLM_NORM_RMS, il); cb(Kcur, "Kcur_normed", il); Kcur = ggml_rope_ext( @@ -10579,7 +10579,7 @@ struct llm_build_gemma3_iswa : public llm_graph_context { Qcur = ggml_scale(ctx0, Qcur, hparams.f_attention_scale); cur = build_attn(inp_attn, - model.layers[il].wo, NULL, + model.layers[il].wo, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il); } @@ -10589,7 +10589,7 @@ struct llm_build_gemma3_iswa : public llm_graph_context { } cur = build_norm(cur, - model.layers[il].attn_post_norm, NULL, + model.layers[il].attn_post_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_post_norm", il); @@ -10597,23 +10597,23 @@ struct llm_build_gemma3_iswa : public llm_graph_context { cb(sa_out, "sa_out", il); cur = build_norm(sa_out, - model.layers[il].ffn_norm, NULL, + model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); // feed-forward network { cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, + model.layers[il].ffn_up, nullptr, nullptr, + model.layers[il].ffn_gate, nullptr, nullptr, + model.layers[il].ffn_down, nullptr, nullptr, + nullptr, LLM_FFN_GELU, LLM_FFN_PAR, il); cb(cur, "ffn_out", il); } cur = build_norm(cur, - model.layers[il].ffn_post_norm, NULL, + model.layers[il].ffn_post_norm, nullptr, LLM_NORM_RMS, -1); cb(cur, "ffn_post_norm", -1); @@ -10629,7 +10629,7 @@ struct llm_build_gemma3_iswa : public llm_graph_context { cur = inpL; cur = build_norm(cur, - model.output_norm, NULL, + model.output_norm, nullptr, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); @@ -10713,7 +10713,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context { cb(cur, "active_prediction", il); // norm - cur = build_norm(cur, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il); + cur = build_norm(cur, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // laurel @@ -10735,8 +10735,8 @@ struct llm_build_gemma3n_iswa : public llm_graph_context { Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); - Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il); - Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il); + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, il); + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, nullptr, LLM_NORM_RMS, il); Vcur = ggml_rms_norm(ctx0, Vcur, hparams.f_norm_rms_eps); cb(Qcur, "Qcur_normed", il); @@ -10757,7 +10757,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context { cb(Kcur, "Kcur_pos", il); cur = 
build_attn(inp_attn, - model.layers[il].wo, NULL, + model.layers[il].wo, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, hparams.f_attention_scale, il); } else { // reuse KV cache of earlier layers @@ -10765,7 +10765,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context { cb(Qcur, "Qcur", il); Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il); + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, il); cb(Qcur, "Qcur_normed", il); Qcur = ggml_rope_ext( @@ -10775,12 +10775,12 @@ struct llm_build_gemma3n_iswa : public llm_graph_context { cb(Qcur, "Qcur_pos", il); cur = build_attn(inp_attn, - model.layers[il].wo, NULL, + model.layers[il].wo, nullptr, Qcur, nullptr, nullptr, nullptr, nullptr, nullptr, hparams.f_attention_scale, il); } cur = build_norm(cur, - model.layers[il].attn_post_norm, NULL, + model.layers[il].attn_post_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_post_norm", il); @@ -10793,7 +10793,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context { cb(attn_laurel, "attn_laurel", il); cur = build_norm(attn_laurel, - model.layers[il].ffn_norm, NULL, + model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); @@ -10814,7 +10814,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context { } cur = build_norm(cur, - model.layers[il].ffn_post_norm, NULL, + model.layers[il].ffn_post_norm, nullptr, LLM_NORM_RMS, -1); cb(cur, "ffn_post_norm", il); @@ -10836,7 +10836,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context { first_prediction = build_lora_mm(model.layers[il].per_layer_proj, first_prediction); // [n_embd, n_tokens] first_prediction = build_norm(first_prediction, - model.layers[il].per_layer_post_norm, NULL, + model.layers[il].per_layer_post_norm, nullptr, LLM_NORM_RMS, il); cb(first_prediction, "first_prediction_out", il); } @@ -10896,7 +10896,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context { } cur = build_norm(cur, - model.output_norm, NULL, + model.output_norm, nullptr, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); @@ -10960,7 +10960,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context { per_layer_proj = ggml_scale(ctx0, per_layer_proj, per_layer_projection_scale); per_layer_proj = ggml_reshape_3d(ctx0, per_layer_proj, n_embd_altup, n_layer, n_tokens); per_layer_proj = build_norm(per_layer_proj, - model.per_layer_proj_norm, NULL, + model.per_layer_proj_norm, nullptr, LLM_NORM_RMS, -1); // [n_embd_altup, n_layer, n_tokens] cb(per_layer_proj, "per_layer_proj", -1); @@ -10979,7 +10979,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context { ggml_tensor * tmp = cur; tmp = build_lora_mm(model.layers[il].laurel_l, tmp); tmp = build_lora_mm(model.layers[il].laurel_r, tmp); - tmp = build_norm(tmp, model.layers[il].laurel_post_norm, NULL, LLM_NORM_RMS, il); + tmp = build_norm(tmp, model.layers[il].laurel_post_norm, nullptr, LLM_NORM_RMS, il); tmp = ggml_add(ctx0, tmp, cur); cb(tmp, "laurel_out", il); return tmp; @@ -11006,7 +11006,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context { // output shape: [n_altup, n_tokens] ggml_tensor * altup_compute_router_modalities(ggml_tensor * x, int il) { ggml_tensor * router_inputs = build_norm(x, - model.layers[il].altup_router_norm, NULL, + model.layers[il].altup_router_norm, nullptr, LLM_NORM_RMS, il); // router_input_scale @@ -11094,7 +11094,7 @@ struct llm_build_gemma_embedding_iswa : public llm_graph_context { const float 
freq_scale_l = model.get_rope_freq_scale(cparams, il); // norm - cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il); + cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention @@ -11113,7 +11113,7 @@ struct llm_build_gemma_embedding_iswa : public llm_graph_context { Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); - Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il); + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, il); cb(Qcur, "Qcur_normed", il); Qcur = ggml_rope_ext( @@ -11121,7 +11121,7 @@ struct llm_build_gemma_embedding_iswa : public llm_graph_context { n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, ext_factor, attn_factor, beta_fast, beta_slow); - Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il); + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, nullptr, LLM_NORM_RMS, il); cb(Kcur, "Kcur_normed", il); Kcur = ggml_rope_ext( @@ -11137,7 +11137,7 @@ struct llm_build_gemma_embedding_iswa : public llm_graph_context { Qcur = ggml_scale(ctx0, Qcur, hparams.f_attention_scale); cur = build_attn(inp_attn, - model.layers[il].wo, NULL, + model.layers[il].wo, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il); } @@ -11147,7 +11147,7 @@ struct llm_build_gemma_embedding_iswa : public llm_graph_context { } cur = build_norm(cur, - model.layers[il].attn_post_norm, NULL, + model.layers[il].attn_post_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_post_norm", il); @@ -11155,23 +11155,23 @@ struct llm_build_gemma_embedding_iswa : public llm_graph_context { cb(sa_out, "sa_out", il); cur = build_norm(sa_out, - model.layers[il].ffn_norm, NULL, + model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); // feed-forward network { cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, + model.layers[il].ffn_up, nullptr, nullptr, + model.layers[il].ffn_gate, nullptr, nullptr, + model.layers[il].ffn_down, nullptr, nullptr, + nullptr, LLM_FFN_GELU, LLM_FFN_PAR, il); cb(cur, "ffn_out", il); } cur = build_norm(cur, - model.layers[il].ffn_post_norm, NULL, + model.layers[il].ffn_post_norm, nullptr, LLM_NORM_RMS, -1); cb(cur, "ffn_post_norm", -1); @@ -11187,7 +11187,7 @@ struct llm_build_gemma_embedding_iswa : public llm_graph_context { cur = inpL; cur = build_norm(cur, - model.output_norm, NULL, + model.output_norm, nullptr, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); @@ -11291,10 +11291,10 @@ struct llm_build_starcoder2 : public llm_graph_context { cb(cur, "ffn_norm", il); cur = build_ffn(cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, - NULL, NULL, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, - NULL, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, nullptr, + nullptr, nullptr, nullptr, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, nullptr, + nullptr, LLM_FFN_GELU, LLM_FFN_SEQ, il); cb(cur, "ffn_out", il); @@ -11416,9 +11416,9 @@ struct llm_graph_context_mamba : public llm_graph_context { // Some Mamba variants (e.g. 
FalconMamba, Jamba) apply RMS norm in B, C & Dt layers if (ssm_dt_b_c_rms || (layer.ssm_dt_norm && layer.ssm_b_norm && layer.ssm_c_norm)) { - dt = build_norm(dt, layer.ssm_dt_norm, NULL, LLM_NORM_RMS, il); - B = build_norm(B, layer.ssm_b_norm, NULL, LLM_NORM_RMS, il); - C = build_norm(C, layer.ssm_c_norm, NULL, LLM_NORM_RMS, il); + dt = build_norm(dt, layer.ssm_dt_norm, nullptr, LLM_NORM_RMS, il); + B = build_norm(B, layer.ssm_b_norm, nullptr, LLM_NORM_RMS, il); + C = build_norm(C, layer.ssm_c_norm, nullptr, LLM_NORM_RMS, il); } // {dt_rank, d_inner} @ {dt_rank, n_seq_tokens, n_seqs} => {d_inner, n_seq_tokens, n_seqs} @@ -11583,7 +11583,7 @@ struct llm_graph_context_mamba : public llm_graph_context { // grouped RMS norm if (model.layers[il].ssm_norm) { y = ggml_reshape_4d(ctx0, y, d_inner / n_group, n_group, n_seq_tokens, n_seqs); - y = build_norm(y, model.layers[il].ssm_norm, NULL, LLM_NORM_RMS, il); + y = build_norm(y, model.layers[il].ssm_norm, nullptr, LLM_NORM_RMS, il); } y = ggml_reshape_3d(ctx0, y, d_inner, n_seq_tokens, n_seqs); @@ -11615,7 +11615,7 @@ struct llm_build_mamba : public llm_graph_context_mamba { for (int il = 0; il < n_layer; ++il) { // norm cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, + model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); @@ -11641,7 +11641,7 @@ struct llm_build_mamba : public llm_graph_context_mamba { } // final rmsnorm - cur = build_norm(inpL, model.output_norm, NULL, LLM_NORM_RMS, -1); + cur = build_norm(inpL, model.output_norm, nullptr, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); res->t_embd = cur; @@ -11674,7 +11674,7 @@ struct llm_build_jamba : public llm_graph_context_mamba { for (int il = 0; il < n_layer; ++il) { const int64_t n_head_kv = hparams.n_head_kv(il); - cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il); + cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); if (n_head_kv == 0) { @@ -11700,8 +11700,8 @@ struct llm_build_jamba : public llm_graph_context_mamba { // No RoPE :) cur = build_attn(inp_hybrid->get_attn(), - model.layers[il].wo, NULL, - Qcur, Kcur, Vcur, NULL, NULL, NULL, 1.0f/sqrtf(float(n_embd_head)), il); + model.layers[il].wo, nullptr, + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1 && inp_out_ids) { @@ -11713,17 +11713,17 @@ struct llm_build_jamba : public llm_graph_context_mamba { struct ggml_tensor * ffn_inp = ggml_add(ctx0, inpL, cur); cb(cur, "ffn_inp", il); - cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il); + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); // feed-forward network if (model.layers[il].ffn_gate_inp == nullptr) { // FFN cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, + model.layers[il].ffn_up, nullptr, nullptr, + model.layers[il].ffn_gate, nullptr, nullptr, + model.layers[il].ffn_down, nullptr, nullptr, + nullptr, LLM_FFN_SILU, LLM_FFN_PAR, il); cb(cur, "ffn_out", il); } else { @@ -11753,7 +11753,7 @@ struct llm_build_jamba : public llm_graph_context_mamba { } // final rmsnorm - cur = build_norm(inpL, model.output_norm, NULL, LLM_NORM_RMS, -1); + cur = build_norm(inpL, model.output_norm, nullptr, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); res->t_embd = cur; @@ -11791,7 +11791,7 @@ struct llm_build_command_r : public llm_graph_context 
{ for (int il = 0; il < n_layer; ++il) { // norm cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, + model.layers[il].attn_norm, nullptr, LLM_NORM, il); cb(cur, "attn_norm", il); @@ -11828,7 +11828,7 @@ struct llm_build_command_r : public llm_graph_context { if (model.layers[il].attn_q_norm) { Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, - NULL, + nullptr, LLM_NORM, il); cb(Qcur, "Qcur", il); } @@ -11842,7 +11842,7 @@ struct llm_build_command_r : public llm_graph_context { if (model.layers[il].attn_k_norm) { Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, - NULL, + nullptr, LLM_NORM, il); cb(Kcur, "Kcur", il); } @@ -11873,10 +11873,10 @@ struct llm_build_command_r : public llm_graph_context { // feed-forward network { cur = build_ffn(ffn_inp, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, + model.layers[il].ffn_up, nullptr, nullptr, + model.layers[il].ffn_gate, nullptr, nullptr, + model.layers[il].ffn_down, nullptr, nullptr, + nullptr, LLM_FFN_SILU, LLM_FFN_PAR, il); cb(cur, "ffn_out", il); } @@ -11895,7 +11895,7 @@ struct llm_build_command_r : public llm_graph_context { cur = inpL; cur = build_norm(cur, - model.output_norm, NULL, + model.output_norm, nullptr, LLM_NORM, -1); cb(cur, "result_norm", -1); @@ -11939,7 +11939,7 @@ struct llm_build_cohere2_iswa : public llm_graph_context { const bool is_swa = hparams.is_swa(il); // norm - cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM, il); + cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM, il); cb(cur, "attn_norm", il); ggml_tensor * ffn_inp = cur; @@ -12007,8 +12007,8 @@ struct llm_build_cohere2_iswa : public llm_graph_context { // feed-forward network { - cur = build_ffn(ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, - NULL, NULL, model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, + cur = build_ffn(ffn_inp, model.layers[il].ffn_up, nullptr, nullptr, model.layers[il].ffn_gate, + nullptr, nullptr, model.layers[il].ffn_down, nullptr, nullptr, nullptr, LLM_FFN_SILU, LLM_FFN_PAR, il); cb(cur, "ffn_out", il); } @@ -12026,7 +12026,7 @@ struct llm_build_cohere2_iswa : public llm_graph_context { cur = inpL; - cur = build_norm(cur, model.output_norm, NULL, LLM_NORM, -1); + cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM, -1); cb(cur, "result_norm", -1); res->t_embd = cur; @@ -12075,7 +12075,7 @@ struct llm_build_olmo : public llm_graph_context { // norm cur = build_norm(inpL, - NULL, NULL, + nullptr, nullptr, LLM_NORM, il); cb(cur, "attn_norm", il); @@ -12138,15 +12138,15 @@ struct llm_build_olmo : public llm_graph_context { // feed-forward network cur = build_norm(ffn_inp, - NULL, NULL, + nullptr, nullptr, LLM_NORM, il); cb(cur, "ffn_norm", il); cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, + model.layers[il].ffn_up, nullptr, nullptr, + model.layers[il].ffn_gate, nullptr, nullptr, + model.layers[il].ffn_down, nullptr, nullptr, + nullptr, LLM_FFN_SILU, LLM_FFN_PAR, il); cb(cur, "ffn_out", il); @@ -12163,7 +12163,7 @@ struct llm_build_olmo : public llm_graph_context { cur = inpL; cur = build_norm(cur, - NULL, NULL, + nullptr, nullptr, LLM_NORM, -1); cb(cur, "result_norm", -1); @@ -12215,11 +12215,11 @@ struct llm_build_olmo2 : public llm_graph_context { ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); - Qcur = 
build_norm(Qcur, model.layers[il].attn_q_norm, NULL, + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, il); cb(Qcur, "Qcur_normed", il); - Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, nullptr, LLM_NORM_RMS, il); cb(Kcur, "Kcur_normed", il); @@ -12244,7 +12244,7 @@ struct llm_build_olmo2 : public llm_graph_context { cb(Vcur, "Vcur", il); cur = build_attn(inp_attn, - model.layers[il].wo, NULL, + model.layers[il].wo, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -12254,7 +12254,7 @@ struct llm_build_olmo2 : public llm_graph_context { } cur = build_norm(cur, - model.layers[il].attn_post_norm, NULL, + model.layers[il].attn_post_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_post_norm", il); @@ -12263,15 +12263,15 @@ struct llm_build_olmo2 : public llm_graph_context { // feed-forward network cur = build_ffn(ffn_inp, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, + model.layers[il].ffn_up, nullptr, nullptr, + model.layers[il].ffn_gate, nullptr, nullptr, + model.layers[il].ffn_down, nullptr, nullptr, + nullptr, LLM_FFN_SILU, LLM_FFN_PAR, il); cb(cur, "ffn_out", il); cur = build_norm(cur, - model.layers[il].ffn_post_norm, NULL, + model.layers[il].ffn_post_norm, nullptr, LLM_NORM_RMS, -1); cb(cur, "ffn_post_norm", -1); @@ -12288,7 +12288,7 @@ struct llm_build_olmo2 : public llm_graph_context { cur = inpL; cur = build_norm(cur, - model.output_norm, NULL, + model.output_norm, nullptr, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); @@ -12332,7 +12332,7 @@ struct llm_build_olmoe : public llm_graph_context { // norm cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, + model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); @@ -12348,11 +12348,11 @@ struct llm_build_olmoe : public llm_graph_context { ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); - Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, il); cb(Qcur, "Qcur_normed", il); - Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, nullptr, LLM_NORM_RMS, il); cb(Kcur, "Kcur_normed", il); @@ -12377,7 +12377,7 @@ struct llm_build_olmoe : public llm_graph_context { cb(Vcur, "Vcur", il); cur = build_attn(inp_attn, - model.layers[il].wo, NULL, + model.layers[il].wo, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -12391,7 +12391,7 @@ struct llm_build_olmoe : public llm_graph_context { // MoE branch cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, + model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); @@ -12420,7 +12420,7 @@ struct llm_build_olmoe : public llm_graph_context { cur = inpL; cur = build_norm(cur, - model.output_norm, NULL, + model.output_norm, nullptr, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); @@ -12463,7 +12463,7 @@ struct llm_build_openelm : public llm_graph_context { // norm cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, + model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); @@ -12484,23 +12484,23 @@ struct llm_build_openelm : public llm_graph_context { cb(Vcur, "Vcur", il); Qcur = build_norm(Qcur, - model.layers[il].attn_q_norm, NULL, + 
model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, il); cb(Qcur, "Qcur", il); Kcur = build_norm(Kcur, - model.layers[il].attn_k_norm, NULL, + model.layers[il].attn_k_norm, nullptr, LLM_NORM_RMS, il); cb(Kcur, "Kcur", il); Qcur = ggml_rope_ext( - ctx0, Qcur, inp_pos, NULL, + ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); Kcur = ggml_rope_ext( - ctx0, Kcur, inp_pos, NULL, + ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); @@ -12510,7 +12510,7 @@ struct llm_build_openelm : public llm_graph_context { cb(Qcur, "Vcur", il); cur = build_attn(inp_attn, - model.layers[il].wo, NULL, + model.layers[il].wo, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -12525,15 +12525,15 @@ struct llm_build_openelm : public llm_graph_context { // feed-forward network { cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, + model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, + model.layers[il].ffn_up, nullptr, nullptr, + model.layers[il].ffn_gate, nullptr, nullptr, + model.layers[il].ffn_down, nullptr, nullptr, + nullptr, LLM_FFN_SILU, LLM_FFN_PAR, il); cb(cur, "ffn_out", il); } @@ -12550,7 +12550,7 @@ struct llm_build_openelm : public llm_graph_context { // norm cur = build_norm(cur, - model.output_norm, NULL, + model.output_norm, nullptr, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); @@ -12645,10 +12645,10 @@ struct llm_build_gptneox : public llm_graph_context { cb(cur, "ffn_norm", il); cur = build_ffn(cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, - NULL, NULL, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, - NULL, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, nullptr, + nullptr, nullptr, nullptr, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, nullptr, + nullptr, LLM_FFN_GELU, LLM_FFN_SEQ, il); cb(cur, "ffn_out", il); @@ -12677,10 +12677,10 @@ struct llm_build_gptneox : public llm_graph_context { cb(cur, "ffn_norm", il); cur = build_ffn(cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, - NULL, NULL, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, - NULL, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, nullptr, + nullptr, nullptr, nullptr, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, nullptr, + nullptr, LLM_FFN_GELU, LLM_FFN_SEQ, il); cb(cur, "ffn_out", il); @@ -12735,7 +12735,7 @@ struct llm_build_arctic : public llm_graph_context { // norm cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, + model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); @@ -12772,7 +12772,7 @@ struct llm_build_arctic : public llm_graph_context { cb(Vcur, "Vcur", il); cur = build_attn(inp_attn, - model.layers[il].wo, NULL, + model.layers[il].wo, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -12786,15 +12786,15 @@ struct llm_build_arctic : public llm_graph_context { // feed-forward network cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, + model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - 
model.layers[il].ffn_down, NULL, NULL, - NULL, + model.layers[il].ffn_up, nullptr, nullptr, + model.layers[il].ffn_gate, nullptr, nullptr, + model.layers[il].ffn_down, nullptr, nullptr, + nullptr, LLM_FFN_SILU, LLM_FFN_PAR, il); cb(cur, "ffn_out", il); @@ -12803,7 +12803,7 @@ struct llm_build_arctic : public llm_graph_context { // MoE cur = build_norm(inpSA, - model.layers[il].ffn_norm_exps, NULL, + model.layers[il].ffn_norm_exps, nullptr, LLM_NORM_RMS, il); cb(cur, "ffn_norm_exps", il); @@ -12833,7 +12833,7 @@ struct llm_build_arctic : public llm_graph_context { cur = inpL; cur = build_norm(cur, - model.output_norm, NULL, + model.output_norm, nullptr, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); @@ -12875,7 +12875,7 @@ struct llm_build_deepseek : public llm_graph_context { // norm cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, + model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); @@ -12940,16 +12940,16 @@ struct llm_build_deepseek : public llm_graph_context { cb(ffn_inp, "ffn_inp", il); cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, + model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); if ((uint32_t) il < hparams.n_layer_dense_lead) { cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, + model.layers[il].ffn_up, nullptr, nullptr, + model.layers[il].ffn_gate, nullptr, nullptr, + model.layers[il].ffn_down, nullptr, nullptr, + nullptr, LLM_FFN_SILU, LLM_FFN_PAR, il); cb(cur, "ffn_out", il); } else { @@ -12971,10 +12971,10 @@ struct llm_build_deepseek : public llm_graph_context { // FFN shared expert { ggml_tensor * ffn_shexp = build_ffn(cur, - model.layers[il].ffn_up_shexp, NULL, NULL, - model.layers[il].ffn_gate_shexp, NULL, NULL, - model.layers[il].ffn_down_shexp, NULL, NULL, - NULL, + model.layers[il].ffn_up_shexp, nullptr, nullptr, + model.layers[il].ffn_gate_shexp, nullptr, nullptr, + model.layers[il].ffn_down_shexp, nullptr, nullptr, + nullptr, LLM_FFN_SILU, LLM_FFN_PAR, il); cb(ffn_shexp, "ffn_shexp", il); @@ -12995,7 +12995,7 @@ struct llm_build_deepseek : public llm_graph_context { cur = inpL; cur = build_norm(cur, - model.output_norm, NULL, + model.output_norm, nullptr, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); @@ -13050,13 +13050,13 @@ struct llm_build_deepseek2 : public llm_graph_context { // norm cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, + model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self_attention { - ggml_tensor * q = NULL; + ggml_tensor * q = nullptr; if (!is_lite) { q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur); cb(q, "q", il); @@ -13155,7 +13155,7 @@ struct llm_build_deepseek2 : public llm_graph_context { // note: MLA with the absorption optimzation converts into MQA (ie: GQA with 1 group) cur = build_attn(inp_attn, - model.layers[il].wo, NULL, + model.layers[il].wo, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, model.layers[il].wv_b, kq_scale, il); } else { ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_cmpr); @@ -13189,7 +13189,7 @@ struct llm_build_deepseek2 : public llm_graph_context { // note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups) cur = build_attn(inp_attn, - model.layers[il].wo, NULL, + model.layers[il].wo, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il); } } @@ -13203,16 +13203,16 @@ struct llm_build_deepseek2 : public 
llm_graph_context { cb(ffn_inp, "ffn_inp", il); cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, + model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); if ((uint32_t) il < hparams.n_layer_dense_lead) { cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, + model.layers[il].ffn_up, nullptr, nullptr, + model.layers[il].ffn_gate, nullptr, nullptr, + model.layers[il].ffn_down, nullptr, nullptr, + nullptr, LLM_FFN_SILU, LLM_FFN_PAR, il); cb(cur, "ffn_out", il); } else { @@ -13234,10 +13234,10 @@ struct llm_build_deepseek2 : public llm_graph_context { // FFN shared expert { ggml_tensor * ffn_shexp = build_ffn(cur, - model.layers[il].ffn_up_shexp, NULL, NULL, - model.layers[il].ffn_gate_shexp, NULL, NULL, - model.layers[il].ffn_down_shexp, NULL, NULL, - NULL, + model.layers[il].ffn_up_shexp, nullptr, nullptr, + model.layers[il].ffn_gate_shexp, nullptr, nullptr, + model.layers[il].ffn_down_shexp, nullptr, nullptr, + nullptr, LLM_FFN_SILU, LLM_FFN_PAR, il); cb(ffn_shexp, "ffn_shexp", il); @@ -13258,7 +13258,7 @@ struct llm_build_deepseek2 : public llm_graph_context { cur = inpL; cur = build_norm(cur, - model.output_norm, NULL, + model.output_norm, nullptr, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); @@ -13296,7 +13296,7 @@ struct llm_build_bitnet : public llm_graph_context { ggml_tensor * inpSA = inpL; cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, + model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); @@ -13356,11 +13356,11 @@ struct llm_build_bitnet : public llm_graph_context { cb(Vcur, "Vcur", il); cur = build_attn(inp_attn, - NULL, NULL, + nullptr, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); cur = build_norm(cur, - model.layers[il].attn_sub_norm, NULL, + model.layers[il].attn_sub_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_sub_norm", il); @@ -13384,20 +13384,20 @@ struct llm_build_bitnet : public llm_graph_context { // feed-forward forward cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, + model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, model.layers[il].ffn_up_scale, - model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_scale, - NULL, NULL, NULL, - NULL, + model.layers[il].ffn_up, nullptr, model.layers[il].ffn_up_scale, + model.layers[il].ffn_gate, nullptr, model.layers[il].ffn_gate_scale, + nullptr, nullptr, nullptr, + nullptr, LLM_FFN_SILU, LLM_FFN_PAR, il); cb(cur, "ffn_sub_out", il); cur = build_norm(cur, - model.layers[il].ffn_sub_norm, NULL, + model.layers[il].ffn_sub_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "ffn_sub_norm", il); @@ -13417,7 +13417,7 @@ struct llm_build_bitnet : public llm_graph_context { cur = inpL; cur = build_norm(cur, - model.output_norm, NULL, + model.output_norm, nullptr, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); @@ -13456,7 +13456,7 @@ struct llm_build_t5_enc : public llm_graph_context { // norm cur = build_norm(inpL, - model.layers[il].attn_norm_enc, NULL, + model.layers[il].attn_norm_enc, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); @@ -13495,16 +13495,16 @@ struct llm_build_t5_enc : public llm_graph_context { // feed-forward network { cur = build_norm(ffn_inp, - model.layers[il].ffn_norm_enc, NULL, + model.layers[il].ffn_norm_enc, nullptr, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); // T5 uses relu, flan-T5 uses 
gelu-gated cur = build_ffn(cur, - model.layers[il].ffn_up_enc, NULL, NULL, - model.layers[il].ffn_gate_enc, NULL, NULL, - model.layers[il].ffn_down_enc, NULL, NULL, - NULL, + model.layers[il].ffn_up_enc, nullptr, nullptr, + model.layers[il].ffn_gate_enc, nullptr, nullptr, + model.layers[il].ffn_down_enc, nullptr, nullptr, + nullptr, model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU, model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ, il); @@ -13525,7 +13525,7 @@ struct llm_build_t5_enc : public llm_graph_context { cb(cur, "result_embd", -1); cur = build_norm(cur, - model.output_norm_enc, NULL, + model.output_norm_enc, nullptr, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); @@ -13562,7 +13562,7 @@ struct llm_build_t5_dec : public llm_graph_context { // norm cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, + model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); @@ -13597,7 +13597,7 @@ struct llm_build_t5_dec : public llm_graph_context { // norm cur = build_norm(cur, - model.layers[il].attn_norm_cross, NULL, + model.layers[il].attn_norm_cross, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_norm_cross", il); @@ -13659,16 +13659,16 @@ struct llm_build_t5_dec : public llm_graph_context { // feed-forward network { cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, + model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); // T5 uses relu, flan-T5 uses gelu-gated cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, + model.layers[il].ffn_up, nullptr, nullptr, + model.layers[il].ffn_gate, nullptr, nullptr, + model.layers[il].ffn_down, nullptr, nullptr, + nullptr, model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU, model.layers[il].ffn_gate_enc ? 
LLM_FFN_PAR : LLM_FFN_SEQ, il); @@ -13689,7 +13689,7 @@ struct llm_build_t5_dec : public llm_graph_context { cb(cur, "result_embd", -1); cur = build_norm(cur, - model.output_norm, NULL, + model.output_norm, nullptr, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); @@ -13771,10 +13771,10 @@ struct llm_build_jais : public llm_graph_context { cb(cur, "ffn_norm", il); cur = build_ffn(cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, - model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, - NULL, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, nullptr, + model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, nullptr, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, nullptr, + nullptr, LLM_FFN_SILU, LLM_FFN_PAR, il); cb(cur, "ffn_out", il); } @@ -13824,7 +13824,7 @@ struct llm_build_chatglm : public llm_graph_context { cur = build_norm(inpL, model.layers[il].attn_norm, - NULL, + nullptr, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); @@ -13881,7 +13881,7 @@ struct llm_build_chatglm : public llm_graph_context { cb(Vcur, "Vcur", il); cur = build_attn(inp_attn, - model.layers[il].wo, NULL, + model.layers[il].wo, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -13898,15 +13898,15 @@ struct llm_build_chatglm : public llm_graph_context { { cur = build_norm(ffn_inp, model.layers[il].ffn_norm, - NULL, + nullptr, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - NULL, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, + model.layers[il].ffn_up, nullptr, nullptr, + nullptr, nullptr, nullptr, + model.layers[il].ffn_down, nullptr, nullptr, + nullptr, LLM_FFN_SWIGLU, LLM_FFN_SEQ, il); cb(cur, "ffn_out", il); @@ -13918,7 +13918,7 @@ struct llm_build_chatglm : public llm_graph_context { cur = build_norm(inpL, model.output_norm, - NULL, + nullptr, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); @@ -13958,7 +13958,7 @@ struct llm_build_glm4 : public llm_graph_context { // Pre-attention norm cur = build_norm(inpL, model.layers[il].attn_norm, - NULL, + nullptr, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); @@ -14014,7 +14014,7 @@ struct llm_build_glm4 : public llm_graph_context { cb(Vcur, "Vcur", il); cur = build_attn(inp_attn, - model.layers[il].wo, NULL, + model.layers[il].wo, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -14026,7 +14026,7 @@ struct llm_build_glm4 : public llm_graph_context { // Post-attention norm (new!) 
cur = build_norm(cur, model.layers[il].attn_post_norm, - NULL, + nullptr, LLM_NORM_RMS, il); cb(cur, "post_attn_norm", il); @@ -14039,23 +14039,23 @@ struct llm_build_glm4 : public llm_graph_context { // Pre-MLP norm cur = build_norm(ffn_inp, model.layers[il].ffn_norm, - NULL, + nullptr, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); // MLP cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - NULL, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, + model.layers[il].ffn_up, nullptr, nullptr, + nullptr, nullptr, nullptr, + model.layers[il].ffn_down, nullptr, nullptr, + nullptr, LLM_FFN_SWIGLU, LLM_FFN_SEQ, il); cb(cur, "ffn_out", il); // Post-MLP norm cur = build_norm(cur, model.layers[il].ffn_post_norm, - NULL, + nullptr, LLM_NORM_RMS, il); cb(cur, "post_mlp_norm", il); } @@ -14068,7 +14068,7 @@ struct llm_build_glm4 : public llm_graph_context { // Final norm cur = build_norm(inpL, model.output_norm, - NULL, + nullptr, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); @@ -14109,7 +14109,7 @@ struct llm_build_glm4_moe : public llm_graph_context { ggml_tensor * inpSA = inpL; // Pre-attention norm - cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il); + cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention @@ -14138,11 +14138,11 @@ struct llm_build_glm4_moe : public llm_graph_context { // Apply Q/K norm if available (GLM-4.5 355B variant) if (model.layers[il].attn_q_norm) { - Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il); + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, il); cb(Qcur, "Qcur_normed", il); } if (model.layers[il].attn_k_norm) { - Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il); + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, nullptr, LLM_NORM_RMS, il); cb(Kcur, "Kcur_normed", il); } @@ -14163,7 +14163,7 @@ struct llm_build_glm4_moe : public llm_graph_context { cb(Vcur, "Vcur", il); cur = build_attn(inp_attn, - model.layers[il].wo, NULL, + model.layers[il].wo, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -14176,17 +14176,17 @@ struct llm_build_glm4_moe : public llm_graph_context { cb(ffn_inp, "ffn_inp", il); // Post-attention norm - cur = build_norm(ffn_inp, model.layers[il].attn_post_norm, NULL, LLM_NORM_RMS, il); + cur = build_norm(ffn_inp, model.layers[il].attn_post_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "post_attn_norm", il); // Check if this is a dense layer (n_layer_dense_lead=1, so layer 0 is dense) if (static_cast<uint32_t>(il) < hparams.n_layer_dense_lead) { // Dense FFN layer cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, + model.layers[il].ffn_up, nullptr, nullptr, + model.layers[il].ffn_gate, nullptr, nullptr, + model.layers[il].ffn_down, nullptr, nullptr, + nullptr, LLM_FFN_SILU, LLM_FFN_PAR, il); cb(cur, "ffn_out", il); } else { @@ -14206,10 +14206,10 @@ struct llm_build_glm4_moe : public llm_graph_context { // Process shared expert on original input ggml_tensor * shared_out = build_ffn(cur, - model.layers[il].ffn_up_shexp, NULL, NULL, - model.layers[il].ffn_gate_shexp, NULL, NULL, - model.layers[il].ffn_down_shexp, NULL, NULL, - NULL, + model.layers[il].ffn_up_shexp, nullptr, nullptr, + model.layers[il].ffn_gate_shexp, nullptr, nullptr, + model.layers[il].ffn_down_shexp, nullptr, nullptr, + nullptr, LLM_FFN_SILU,
LLM_FFN_PAR, il); cb(shared_out, "ffn_shexp_out", il); @@ -14228,7 +14228,7 @@ struct llm_build_glm4_moe : public llm_graph_context { } cur = inpL; - cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); res->t_embd = cur; @@ -14337,10 +14337,10 @@ struct llm_build_nemotron : public llm_graph_context { cb(cur, "ffn_norm", il); cur = build_ffn(cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, - NULL, NULL, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, - NULL, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, nullptr, + nullptr, nullptr, nullptr, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, nullptr, + nullptr, LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il); cur = ggml_add(ctx0, cur, ffn_inp); @@ -14395,7 +14395,7 @@ struct llm_build_nemotron_h : public llm_graph_context_mamba { // norm cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, + model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); @@ -14425,7 +14425,7 @@ struct llm_build_nemotron_h : public llm_graph_context_mamba { cur = inpL; cur = build_norm(cur, - model.output_norm, NULL, + model.output_norm, nullptr, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); @@ -14490,10 +14490,10 @@ struct llm_build_nemotron_h : public llm_graph_context_mamba { const int il) { cur = build_ffn(cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, - NULL, NULL, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, - NULL, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, nullptr, + nullptr, nullptr, nullptr, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, nullptr, + nullptr, LLM_FFN_RELU_SQR, LLM_FFN_PAR, il); cb(cur, "ffn_out", il); @@ -14528,7 +14528,7 @@ struct llm_build_exaone : public llm_graph_context { // norm cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, + model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); @@ -14594,15 +14594,15 @@ struct llm_build_exaone : public llm_graph_context { // feed-forward network cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, + model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, + model.layers[il].ffn_up, nullptr, nullptr, + model.layers[il].ffn_gate, nullptr, nullptr, + model.layers[il].ffn_down, nullptr, nullptr, + nullptr, LLM_FFN_SILU, LLM_FFN_PAR, il); cb(cur, "ffn_out", il); @@ -14619,7 +14619,7 @@ struct llm_build_exaone : public llm_graph_context { cur = inpL; cur = build_norm(cur, - model.output_norm, NULL, + model.output_norm, nullptr, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); @@ -14687,8 +14687,8 @@ struct llm_build_exaone4 : public llm_graph_context { Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); - Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il); - Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il); + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, il); + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, nullptr, LLM_NORM_RMS, il); cb(Qcur, "Qcur_normed", il); cb(Kcur, "Kcur_normed", il); @@ -14711,7 +14711,7 @@ struct llm_build_exaone4 : public llm_graph_context { 
cb(Vcur, "Vcur", il); cur = build_attn(inp_attn, - model.layers[il].wo, NULL, + model.layers[il].wo, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); cb(cur, "attn_out", il); } @@ -14722,7 +14722,7 @@ struct llm_build_exaone4 : public llm_graph_context { } cur = build_norm(cur, - model.layers[il].attn_post_norm, NULL, + model.layers[il].attn_post_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_post_norm", il); @@ -14731,15 +14731,15 @@ struct llm_build_exaone4 : public llm_graph_context { // feed-forward network cur = build_ffn(ffn_inp, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, + model.layers[il].ffn_up, nullptr, nullptr, + model.layers[il].ffn_gate, nullptr, nullptr, + model.layers[il].ffn_down, nullptr, nullptr, + nullptr, LLM_FFN_SILU, LLM_FFN_PAR, il); cb(cur, "ffn_out", il); cur = build_norm(cur, - model.layers[il].ffn_post_norm, NULL, + model.layers[il].ffn_post_norm, nullptr, LLM_NORM_RMS, -1); cb(cur, "ffn_post_norm", -1); @@ -14755,7 +14755,7 @@ struct llm_build_exaone4 : public llm_graph_context { cur = inpL; cur = build_norm(cur, - model.output_norm, NULL, + model.output_norm, nullptr, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); @@ -15131,15 +15131,15 @@ struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base { // feed-forward network cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, + model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, + model.layers[il].ffn_up, nullptr, nullptr, + model.layers[il].ffn_gate, nullptr, nullptr, + model.layers[il].ffn_down, nullptr, nullptr, + nullptr, LLM_FFN_SILU, LLM_FFN_PAR, il); cb(cur, "ffn_out", il); @@ -15482,15 +15482,15 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base { // feed-forward network cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, + model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, + model.layers[il].ffn_up, nullptr, nullptr, + model.layers[il].ffn_gate, nullptr, nullptr, + model.layers[il].ffn_down, nullptr, nullptr, + nullptr, LLM_FFN_SILU, LLM_FFN_PAR, il); cb(cur, "ffn_out", il); @@ -15549,7 +15549,7 @@ struct llm_build_granite : public llm_graph_context { // norm cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, + model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); @@ -15573,7 +15573,7 @@ struct llm_build_granite : public llm_graph_context { cur = inpL; cur = build_norm(cur, - model.output_norm, NULL, + model.output_norm, nullptr, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); @@ -15669,22 +15669,22 @@ struct llm_build_granite : public llm_graph_context { if (model.layers[il].ffn_gate_inp == nullptr) { cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, + model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); cur = build_ffn(cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, - model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, - NULL, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, nullptr, + model.layers[il].ffn_gate, 
model.layers[il].ffn_gate_b, nullptr, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, nullptr, + nullptr, LLM_FFN_SILU, LLM_FFN_PAR, il); cb(cur, "ffn_out", il); } else { // MoE branch cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, + model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); @@ -15704,10 +15704,10 @@ struct llm_build_granite : public llm_graph_context { // For Granite MoE Shared if (hparams.n_ff_shexp > 0) { ggml_tensor * ffn_shexp = build_ffn(cur, - model.layers[il].ffn_up_shexp, NULL, NULL, - model.layers[il].ffn_gate_shexp, NULL, NULL, - model.layers[il].ffn_down_shexp, NULL, NULL, - NULL, + model.layers[il].ffn_up_shexp, nullptr, nullptr, + model.layers[il].ffn_gate_shexp, nullptr, nullptr, + model.layers[il].ffn_down_shexp, nullptr, nullptr, + nullptr, LLM_FFN_SILU, LLM_FFN_PAR, il); cb(ffn_shexp, "ffn_shexp", il); @@ -15761,7 +15761,7 @@ struct llm_build_granite_hybrid : public llm_graph_context_mamba { // norm cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, + model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); @@ -15790,7 +15790,7 @@ struct llm_build_granite_hybrid : public llm_graph_context_mamba { cur = inpL; cur = build_norm(cur, - model.output_norm, NULL, + model.output_norm, nullptr, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); @@ -15888,22 +15888,22 @@ struct llm_build_granite_hybrid : public llm_graph_context_mamba { if (model.layers[il].ffn_gate_inp == nullptr) { cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, + model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); cur = build_ffn(cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, - model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, - NULL, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, nullptr, + model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, nullptr, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, nullptr, + nullptr, LLM_FFN_SILU, LLM_FFN_PAR, il); cb(cur, "ffn_out", il); } else { // MoE branch cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, + model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); @@ -15923,10 +15923,10 @@ struct llm_build_granite_hybrid : public llm_graph_context_mamba { // For Granite MoE Shared if (hparams.n_ff_shexp > 0) { ggml_tensor * ffn_shexp = build_ffn(cur, - model.layers[il].ffn_up_shexp, NULL, NULL, - model.layers[il].ffn_gate_shexp, NULL, NULL, - model.layers[il].ffn_down_shexp, NULL, NULL, - NULL, + model.layers[il].ffn_up_shexp, nullptr, nullptr, + model.layers[il].ffn_gate_shexp, nullptr, nullptr, + model.layers[il].ffn_down_shexp, nullptr, nullptr, + nullptr, LLM_FFN_SILU, LLM_FFN_PAR, il); cb(ffn_shexp, "ffn_shexp", il); @@ -15984,7 +15984,7 @@ struct llm_build_chameleon : public llm_graph_context { cur = inpL; } else { cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, + model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); } @@ -16061,7 +16061,7 @@ struct llm_build_chameleon : public llm_graph_context { if (hparams.swin_norm) { cur = build_norm(cur, - model.layers[il].attn_norm, NULL, + model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); } @@ -16071,22 +16071,22 @@ struct llm_build_chameleon : public llm_graph_context { // feed-forward network if (!hparams.swin_norm) { cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, + 
model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); } cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, + model.layers[il].ffn_up, nullptr, nullptr, + model.layers[il].ffn_gate, nullptr, nullptr, + model.layers[il].ffn_down, nullptr, nullptr, + nullptr, LLM_FFN_SILU, LLM_FFN_PAR, il); cb(cur, "ffn_out", il); if (hparams.swin_norm) { cur = build_norm(cur, - model.layers[il].ffn_norm, NULL, + model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); } @@ -16104,7 +16104,7 @@ struct llm_build_chameleon : public llm_graph_context { cur = inpL; cur = build_norm(cur, - model.output_norm, NULL, + model.output_norm, nullptr, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); @@ -16252,10 +16252,10 @@ struct llm_build_wavtokenizer_dec : public llm_graph_context { LLM_NORM, -1); cur = build_ffn(cur, - layer.pw1, layer.pw1_b, NULL, - NULL, NULL, NULL, - layer.pw2, layer.pw2_b, NULL, - NULL, + layer.pw1, layer.pw1_b, nullptr, + nullptr, nullptr, nullptr, + layer.pw2, layer.pw2_b, nullptr, + nullptr, LLM_FFN_GELU, LLM_FFN_SEQ, il); cur = ggml_mul(ctx0, cur, layer.gamma); @@ -16312,13 +16312,13 @@ struct llm_build_plm : public llm_graph_context { // norm cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, + model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self_attention { - ggml_tensor * q = NULL; + ggml_tensor * q = nullptr; q = ggml_mul_mat(ctx0, model.layers[il].wq, cur); cb(q, "q", il); @@ -16354,7 +16354,7 @@ struct llm_build_plm : public llm_graph_context { cb(k_pe, "k_pe", il); kv_compressed = build_norm(kv_compressed, - model.layers[il].attn_kv_a_norm, NULL, + model.layers[il].attn_kv_a_norm, nullptr, LLM_NORM_RMS, il); cb(kv_compressed, "kv_compressed", il); @@ -16406,7 +16406,7 @@ struct llm_build_plm : public llm_graph_context { cb(k_states, "k_states", il); cur = build_attn(inp_attn, - model.layers[il].wo, NULL, + model.layers[il].wo, nullptr, q_states, k_states, v_states, nullptr, nullptr, nullptr, kq_scale, il); } @@ -16419,15 +16419,15 @@ struct llm_build_plm : public llm_graph_context { cb(ffn_inp, "ffn_inp", il); cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, + model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - NULL, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, + model.layers[il].ffn_up, nullptr, nullptr, + nullptr, nullptr, nullptr, + model.layers[il].ffn_down, nullptr, nullptr, + nullptr, LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il); cb(cur, "ffn_out", il); @@ -16443,7 +16443,7 @@ struct llm_build_plm : public llm_graph_context { cur = inpL; cur = build_norm(cur, - model.output_norm, NULL, + model.output_norm, nullptr, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); @@ -16477,7 +16477,7 @@ struct llm_build_bailingmoe : public llm_graph_context { // norm cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, + model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); @@ -16542,7 +16542,7 @@ struct llm_build_bailingmoe : public llm_graph_context { cb(ffn_inp, "ffn_inp", il); cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, + model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); @@ -16563,10 +16563,10 @@ struct llm_build_bailingmoe : public llm_graph_context { // FFN shared expert { ggml_tensor * ffn_shexp = build_ffn(cur, - 
model.layers[il].ffn_up_shexp, NULL, NULL, - model.layers[il].ffn_gate_shexp, NULL, NULL, - model.layers[il].ffn_down_shexp, NULL, NULL, - NULL, + model.layers[il].ffn_up_shexp, nullptr, nullptr, + model.layers[il].ffn_gate_shexp, nullptr, nullptr, + model.layers[il].ffn_down_shexp, nullptr, nullptr, + nullptr, LLM_FFN_SILU, LLM_FFN_PAR, il); cb(ffn_shexp, "ffn_shexp", il); @@ -16586,7 +16586,7 @@ struct llm_build_bailingmoe : public llm_graph_context { cur = inpL; cur = build_norm(cur, - model.output_norm, NULL, + model.output_norm, nullptr, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); @@ -16626,7 +16626,7 @@ struct llm_build_dots1 : public llm_graph_context { // norm cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, + model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); @@ -16646,7 +16646,7 @@ struct llm_build_dots1 : public llm_graph_context { Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); - Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il); + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, il); cb(Qcur, "Qcur_normed", il); Qcur = ggml_rope_ext( @@ -16655,7 +16655,7 @@ struct llm_build_dots1 : public llm_graph_context { ext_factor, attn_factor, beta_fast, beta_slow ); - Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il); + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, nullptr, LLM_NORM_RMS, il); cb(Kcur, "Kcur_normed", il); Kcur = ggml_rope_ext( @@ -16683,16 +16683,16 @@ struct llm_build_dots1 : public llm_graph_context { // MoE branch cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, + model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); if ((uint32_t) il < hparams.n_layer_dense_lead) { cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, + model.layers[il].ffn_up, nullptr, nullptr, + model.layers[il].ffn_gate, nullptr, nullptr, + model.layers[il].ffn_down, nullptr, nullptr, + nullptr, LLM_FFN_SILU, LLM_FFN_PAR, il); cb(cur, "ffn_out", il); } else { @@ -16712,10 +16712,10 @@ struct llm_build_dots1 : public llm_graph_context { { ggml_tensor * ffn_shexp = build_ffn(cur, - model.layers[il].ffn_up_shexp, NULL, NULL, - model.layers[il].ffn_gate_shexp, NULL, NULL, - model.layers[il].ffn_down_shexp, NULL, NULL, - NULL, + model.layers[il].ffn_up_shexp, nullptr, nullptr, + model.layers[il].ffn_gate_shexp, nullptr, nullptr, + model.layers[il].ffn_down_shexp, nullptr, nullptr, + nullptr, LLM_FFN_SILU, LLM_FFN_PAR, il); cb(ffn_shexp, "ffn_shexp", il); @@ -16736,7 +16736,7 @@ struct llm_build_dots1 : public llm_graph_context { cur = inpL; cur = build_norm(cur, - model.output_norm, NULL, + model.output_norm, nullptr, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); @@ -16775,7 +16775,7 @@ struct llm_build_ernie4_5 : public llm_graph_context { // norm { cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, + model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); } @@ -16824,7 +16824,7 @@ struct llm_build_ernie4_5 : public llm_graph_context { cb(Vcur, "Vcur", il); cur = build_attn(inp_attn, - model.layers[il].wo, NULL, + model.layers[il].wo, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -16841,15 +16841,15 @@ struct llm_build_ernie4_5 : public llm_graph_context { // 
feed-forward network { cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, + model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, + model.layers[il].ffn_up, nullptr, nullptr, + model.layers[il].ffn_gate, nullptr, nullptr, + model.layers[il].ffn_down, nullptr, nullptr, + nullptr, LLM_FFN_SILU, LLM_FFN_PAR, il); cb(cur, "ffn_out", il); } @@ -16866,7 +16866,7 @@ struct llm_build_ernie4_5 : public llm_graph_context { cur = inpL; cur = build_norm(cur, - model.output_norm, NULL, + model.output_norm, nullptr, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); @@ -16907,7 +16907,7 @@ struct llm_build_ernie4_5_moe : public llm_graph_context { // norm { cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, + model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); } @@ -16957,7 +16957,7 @@ struct llm_build_ernie4_5_moe : public llm_graph_context { cb(Vcur, "Vcur", il); cur = build_attn(inp_attn, - model.layers[il].wo, NULL, + model.layers[il].wo, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); cb(cur, "attn_out", il); } @@ -16975,21 +16975,21 @@ struct llm_build_ernie4_5_moe : public llm_graph_context { if (!is_moe_layer) { cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, + model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, + model.layers[il].ffn_up, nullptr, nullptr, + model.layers[il].ffn_gate, nullptr, nullptr, + model.layers[il].ffn_down, nullptr, nullptr, + nullptr, LLM_FFN_SILU, LLM_FFN_PAR, il); cb(cur, "ffn_out", il); } else { // MoE branch cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, + model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); @@ -17009,10 +17009,10 @@ struct llm_build_ernie4_5_moe : public llm_graph_context { // Shared expert (if present) if (hparams.n_ff_shexp > 0) { ggml_tensor * ffn_shexp = build_ffn(cur, - model.layers[il].ffn_up_shexp, NULL, NULL, - model.layers[il].ffn_gate_shexp, NULL, NULL, - model.layers[il].ffn_down_shexp, NULL, NULL, - NULL, + model.layers[il].ffn_up_shexp, nullptr, nullptr, + model.layers[il].ffn_gate_shexp, nullptr, nullptr, + model.layers[il].ffn_down_shexp, nullptr, nullptr, + nullptr, LLM_FFN_SILU, LLM_FFN_PAR, il); cb(ffn_shexp, "ffn_shexp", il); @@ -17036,7 +17036,7 @@ struct llm_build_ernie4_5_moe : public llm_graph_context { cur = inpL; cur = build_norm(cur, - model.output_norm, NULL, + model.output_norm, nullptr, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); @@ -17075,7 +17075,7 @@ struct llm_build_falcon_h1 : public llm_graph_context_mamba { ggml_tensor * inpSA = inpL; cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, + model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); @@ -17110,12 +17110,12 @@ struct llm_build_falcon_h1 : public llm_graph_context_mamba { cb(Vcur, "Vcur-post-rope", il); ggml_tensor * attn_out = build_attn(inp->get_attn(), - model.layers[il].wo, NULL, + model.layers[il].wo, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il); cb(attn_out, "attn_out", il); cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, + model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); // Mamba2 layer 
cb(cur, "ssm_in", il); @@ -17138,15 +17138,15 @@ struct llm_build_falcon_h1 : public llm_graph_context_mamba { // feed-forward network cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, + model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); cur = build_ffn(cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, - model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, - NULL, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, nullptr, + model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, nullptr, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, nullptr, + nullptr, LLM_FFN_SILU, LLM_FFN_PAR, il); cb(cur, "ffn_out", il); @@ -17162,7 +17162,7 @@ struct llm_build_falcon_h1 : public llm_graph_context_mamba { cur = inpL; cur = build_norm(cur, - model.output_norm, NULL, + model.output_norm, nullptr, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); @@ -17200,7 +17200,7 @@ struct llm_build_plamo2 : public llm_graph_context_mamba { // cb(model.layers[il].attn_norm, "attn_norm", il); // pre_mixer_norm - cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il); + cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); // check if this layer is Mamba or Attention bool is_mamba_layer = hparams.is_recurrent(il); @@ -17214,7 +17214,7 @@ struct llm_build_plamo2 : public llm_graph_context_mamba { } // post_mixer_norm - cur = build_norm(cur, model.layers[il].attn_post_norm, NULL, LLM_NORM_RMS, il); + cur = build_norm(cur, model.layers[il].attn_post_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_post_norm", il); // residual connection @@ -17223,20 +17223,20 @@ struct llm_build_plamo2 : public llm_graph_context_mamba { residual = cur; // pre-ffn norm - cur = build_norm(cur, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il); + cur = build_norm(cur, model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "ffn_pre_norm", il); // feed-forward network cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - NULL, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, + model.layers[il].ffn_up, nullptr, nullptr, + nullptr, nullptr, nullptr, + model.layers[il].ffn_down, nullptr, nullptr, + nullptr, LLM_FFN_SWIGLU, LLM_FFN_SEQ, il); cb(cur, "ffn_out", il); // post ffn norm - cur = build_norm(cur, model.layers[il].ffn_post_norm, NULL, LLM_NORM_RMS, il); + cur = build_norm(cur, model.layers[il].ffn_post_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "ffn_post_norm", il); if (il == n_layer - 1 && inp_out_ids) { @@ -17254,7 +17254,7 @@ struct llm_build_plamo2 : public llm_graph_context_mamba { cur = inpL; // final norm - cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head @@ -17303,7 +17303,7 @@ struct llm_build_plamo2 : public llm_graph_context_mamba { Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head_v, n_head_kv, n_tokens); - Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il); + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, il); cb(Qcur, "Qcur_normed", il); Qcur = ggml_rope_ext( @@ -17312,7 +17312,7 @@ struct llm_build_plamo2 : public llm_graph_context_mamba { ext_factor, attn_factor, beta_fast, beta_slow ); - Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il); + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, nullptr, 
LLM_NORM_RMS, il); cb(Kcur, "Kcur_normed", il); Kcur = ggml_rope_ext( @@ -17322,8 +17322,8 @@ struct llm_build_plamo2 : public llm_graph_context_mamba { ); cur = build_attn(inp, - model.layers[il].wo, NULL, - Qcur, Kcur, Vcur, NULL, NULL, NULL, 1.0f/sqrtf(float(n_embd_head_v)), il); + model.layers[il].wo, nullptr, + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head_v)), il); } cb(cur, "attn_out", il); @@ -17424,9 +17424,9 @@ struct llm_build_plamo2 : public llm_graph_context_mamba { cb(dt, "mamba_dt_raw", il); // Apply RMS norm to dt, B, C (PLaMo-2 specific) - B = build_norm(B, model.layers[il].ssm_b_norm, NULL, LLM_NORM_RMS, il); - C = build_norm(C, model.layers[il].ssm_c_norm, NULL, LLM_NORM_RMS, il); - dt = build_norm(dt, model.layers[il].ssm_dt_norm, NULL, LLM_NORM_RMS, il); + B = build_norm(B, model.layers[il].ssm_b_norm, nullptr, LLM_NORM_RMS, il); + C = build_norm(C, model.layers[il].ssm_c_norm, nullptr, LLM_NORM_RMS, il); + dt = build_norm(dt, model.layers[il].ssm_dt_norm, nullptr, LLM_NORM_RMS, il); cb(B, "mamba_B_normed", il); cb(C, "mamba_C_normed", il); cb(dt, "mamba_dt_normed", il); @@ -17517,7 +17517,7 @@ struct llm_build_arcee : public llm_graph_context { // norm cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, + model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); @@ -17585,15 +17585,15 @@ struct llm_build_arcee : public llm_graph_context { // feed-forward network // ARCEE uses relu^2 instead of silu cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, + model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - NULL, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, + model.layers[il].ffn_up, nullptr, nullptr, + nullptr, nullptr, nullptr, + model.layers[il].ffn_down, nullptr, nullptr, + nullptr, LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il); cb(cur, "ffn_out", il); @@ -17610,7 +17610,7 @@ struct llm_build_arcee : public llm_graph_context { cur = inpL; cur = build_norm(cur, - model.output_norm, NULL, + model.output_norm, nullptr, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); @@ -17652,7 +17652,7 @@ struct llm_build_hunyuan_moe : public llm_graph_context { // norm cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, + model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); @@ -17728,16 +17728,16 @@ struct llm_build_hunyuan_moe : public llm_graph_context { cb(ffn_inp, "ffn_inp", il); cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, + model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); // feed-forward network (non-MoE) ggml_tensor * cur_mlp = build_ffn(cur, - model.layers[il].ffn_up_shexp, NULL, NULL, - model.layers[il].ffn_gate_shexp, NULL, NULL, - model.layers[il].ffn_down_shexp, NULL, NULL, - NULL, + model.layers[il].ffn_up_shexp, nullptr, nullptr, + model.layers[il].ffn_gate_shexp, nullptr, nullptr, + model.layers[il].ffn_down_shexp, nullptr, nullptr, + nullptr, LLM_FFN_SILU, LLM_FFN_PAR, il); cb(cur_mlp, "ffn_mlp", il); @@ -17772,7 +17772,7 @@ struct llm_build_hunyuan_moe : public llm_graph_context { cur = inpL; cur = build_norm(cur, - model.output_norm, NULL, + model.output_norm, nullptr, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); @@ -17813,7 +17813,7 @@ struct llm_build_hunyuan_dense : public llm_graph_context { // norm cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, + model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, 
"attn_norm", il); // self-attention @@ -17888,15 +17888,15 @@ struct llm_build_hunyuan_dense : public llm_graph_context { cb(ffn_inp, "ffn_inp", il); cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, + model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); // feed-forward network (non-MoE) ggml_tensor * cur_mlp = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, + model.layers[il].ffn_up, nullptr, nullptr, + model.layers[il].ffn_gate, nullptr, nullptr, + model.layers[il].ffn_down, nullptr, nullptr, + nullptr, LLM_FFN_SILU, LLM_FFN_PAR, il); cb(cur_mlp, "ffn_out", il); @@ -17911,7 +17911,7 @@ struct llm_build_hunyuan_dense : public llm_graph_context { cur = inpL; cur = build_norm(cur, - model.output_norm, NULL, + model.output_norm, nullptr, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); @@ -17953,7 +17953,7 @@ struct llm_build_smollm3 : public llm_graph_context { // norm cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, + model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); @@ -18020,15 +18020,15 @@ struct llm_build_smollm3 : public llm_graph_context { // feed-forward network { cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, + model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); cur = build_ffn(cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, - model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, - NULL, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, nullptr, + model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, nullptr, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, nullptr, + nullptr, LLM_FFN_SILU, LLM_FFN_PAR, il); cb(cur, "ffn_out", il); } @@ -18046,7 +18046,7 @@ struct llm_build_smollm3 : public llm_graph_context { cur = inpL; cur = build_norm(cur, - model.output_norm, NULL, + model.output_norm, nullptr, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); @@ -18176,7 +18176,7 @@ struct llm_build_openai_moe_iswa : public llm_graph_context { cur = inpL; cur = build_norm(cur, - model.output_norm, NULL, + model.output_norm, nullptr, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); @@ -18206,7 +18206,7 @@ struct llm_build_lfm2 : public llm_graph_context { for (int il = 0; il < n_layer; ++il) { auto * prev_cur = cur; - cur = build_norm(cur, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il); + cur = build_norm(cur, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "model.layers.{}.operator_norm", il); cur = hparams.is_recurrent(il) ? 
@@ -18222,7 +18222,7 @@ struct llm_build_lfm2 : public llm_graph_context { cur = ggml_add(ctx0, cur, build_feed_forward(cur, il)); } - cur = build_norm(cur, model.tok_norm, NULL, LLM_NORM_RMS, -1); + cur = build_norm(cur, model.tok_norm, nullptr, LLM_NORM_RMS, -1); cb(cur, "model.embedding_norm", -1); res->t_embd = cur; @@ -18236,17 +18236,17 @@ struct llm_build_lfm2 : public llm_graph_context { ggml_tensor * build_feed_forward(ggml_tensor * cur, int il) const { - cur = build_norm(cur, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il); + cur = build_norm(cur, model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "model.layers.{}.ffn_norm", il); GGML_ASSERT(!model.layers[il].ffn_up_b); GGML_ASSERT(!model.layers[il].ffn_gate_b); GGML_ASSERT(!model.layers[il].ffn_down_b); cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, + model.layers[il].ffn_up, nullptr, nullptr, + model.layers[il].ffn_gate, nullptr, nullptr, + model.layers[il].ffn_down, nullptr, nullptr, + nullptr, LLM_FFN_SILU, LLM_FFN_PAR, il); cb(cur, "model.layers.{}.feed_forward.w2", il); @@ -18273,9 +18273,9 @@ struct llm_build_lfm2 : public llm_graph_context { v = ggml_reshape_3d(ctx0, v, n_embd_head, n_head_kv, n_tokens); // qk norm - q = build_norm(q, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il); + q = build_norm(q, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, il); cb(q, "model.layers.{}.self_attn.q_layernorm", il); - k = build_norm(k, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il); + k = build_norm(k, model.layers[il].attn_k_norm, nullptr, LLM_NORM_RMS, il); cb(k, "model.layers.{}.self_attn.k_layernorm", il); // RoPE @@ -18290,7 +18290,7 @@ struct llm_build_lfm2 : public llm_graph_context { ext_factor, attn_factor, beta_fast, beta_slow ); - cur = build_attn(inp_attn, model.layers[il].wo, NULL, + cur = build_attn(inp_attn, model.layers[il].wo, nullptr, q, k, v, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); cb(cur, "model.layers.{}.self_attn.out_proj", il); @@ -18394,7 +18394,7 @@ struct llm_build_seed_oss : public llm_graph_context { // norm cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, + model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); @@ -18458,15 +18458,15 @@ struct llm_build_seed_oss : public llm_graph_context { // feed-forward network cur = build_norm(ffn_inp, - model.layers[il].attn_post_norm, NULL, + model.layers[il].attn_post_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_post_norm", il); cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, + model.layers[il].ffn_up, nullptr, nullptr, + model.layers[il].ffn_gate, nullptr, nullptr, + model.layers[il].ffn_down, nullptr, nullptr, + nullptr, LLM_FFN_SILU, LLM_FFN_PAR, il); cb(cur, "ffn_out", il); @@ -18483,7 +18483,7 @@ struct llm_build_seed_oss : public llm_graph_context { cur = inpL; cur = build_norm(cur, - model.output_norm, NULL, + model.output_norm, nullptr, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); @@ -18534,7 +18534,7 @@ struct llm_build_smallthinker : public llm_graph_context{ cb(probs, "ffn_moe_logits", il); // norm - cur = build_norm(inpL,model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il); + cur = build_norm(inpL,model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self_attention @@ -18579,7 +18579,7 @@ struct llm_build_smallthinker : 
public llm_graph_context{ cb(ffn_inp, "ffn_inp", il); // MoE branch - cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il); + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); ggml_tensor * ffn_out = @@ -18608,7 +18608,7 @@ struct llm_build_smallthinker : public llm_graph_context{ cur = inpL; - cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 1d0361cc16659..1fb858c373585 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -114,7 +114,7 @@ static void llama_tensor_dequantize_impl( const ggml_type_traits * qtype = ggml_get_type_traits(tensor->type); if (ggml_is_quantized(tensor->type)) { - if (qtype->to_float == NULL) { + if (qtype->to_float == nullptr) { throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor->type))); } } else if (tensor->type != GGML_TYPE_F16 && diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 2186f827bf543..39b0b277e9d19 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -2208,7 +2208,7 @@ static struct llama_sampler * llama_sampler_dry_clone(const struct llama_sampler llama_vocab dummy_vocab; // dummy vocab is passed because it is only needed for raw sequence breaker processing, which we have already done and will simply be copying - auto * result = llama_sampler_init_dry(&dummy_vocab, ctx->total_context_size, ctx->dry_multiplier, ctx->dry_base, ctx->dry_allowed_length, ctx->dry_penalty_last_n, NULL, 0); + auto * result = llama_sampler_init_dry(&dummy_vocab, ctx->total_context_size, ctx->dry_multiplier, ctx->dry_base, ctx->dry_allowed_length, ctx->dry_penalty_last_n, nullptr, 0); // Copy the state, including the processed breakers { @@ -2285,7 +2285,7 @@ struct llama_sampler * llama_sampler_init_dry(const struct llama_vocab * vocab, // wrapper for test-sampling.cpp struct llama_sampler * llama_sampler_init_dry_testing(int32_t context_size, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const std::vector<std::vector<llama_token>>& seq_breakers) { llama_vocab dummy_vocab; - auto * result = llama_sampler_init_dry(&dummy_vocab, context_size, dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n, NULL, 0); + auto * result = llama_sampler_init_dry(&dummy_vocab, context_size, dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n, nullptr, 0); auto * ctx = (llama_sampler_dry *) result->ctx; // Process the token-based sequence breakers diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index ca02b63a58407..f62f43cf27918 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -61,7 +61,7 @@ struct naive_trie { return &res->second; } - return NULL; + return nullptr; } std::map<char, struct naive_trie> children; bool has_value; @@ -799,10 +799,10 @@ struct llm_tokenizer_ugm : llm_tokenizer { // escaped space symbol - U+2581 (Lower One Eighth Block) const std::string escaped_space = "\xE2\x96\x81"; - const char * prefix_replacements = NULL; + const char * prefix_replacements = nullptr; size_t prefix_replacements_size = 0; - const uint32_t * xcda_array = NULL; + const uint32_t * xcda_array = nullptr; size_t xcda_array_size = 0; struct naive_trie user_defined_token_matcher; @@ -859,7 +859,7 @@ struct llm_tokenizer_ugm_session { const struct best_tokenization & current_best =
tokenization_results[input_offset]; const struct naive_trie * node = tokenizer.token_matcher.traverse(normalized[prefix_offset++]); - while (prefix_offset <= input_len && node != NULL) { + while (prefix_offset <= input_len && node != nullptr) { // check if we found valid token in prefix if (node->has_value) { // check if it corresponds to the whole UTF code point @@ -1176,7 +1176,7 @@ struct llm_tokenizer_rwkv_session { uint32_t position = 0; while (position < text.size()) { const struct naive_trie * node = tokenizer.token_matcher.traverse(text[position]); - if (node == NULL) { + if (node == nullptr) { // no matching token found, add unknown token output.push_back(vocab.token_unk()); position += 1; @@ -1186,7 +1186,7 @@ struct llm_tokenizer_rwkv_session { // traverse the trie to find the longest matching token uint32_t token_id = 0; uint32_t token_length = 0; - while (node != NULL) { + while (node != nullptr) { if (node->has_value) { token_id = node->value; token_length = position + 1; @@ -2550,7 +2550,7 @@ uint8_t llama_vocab::impl::token_to_byte(llama_token id) const { case LLAMA_VOCAB_TYPE_SPM: case LLAMA_VOCAB_TYPE_UGM: { auto buf = token_data.text.substr(3, 2); - return strtol(buf.c_str(), NULL, 16); + return strtol(buf.c_str(), nullptr, 16); } case LLAMA_VOCAB_TYPE_BPE: { GGML_ABORT("fatal error"); diff --git a/src/llama.cpp b/src/llama.cpp index f0d4f5f891cc7..0ce0ecbd92bcb 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -71,7 +71,7 @@ void llama_backend_init(void) { // needed to initialize f16 tables { - struct ggml_init_params params = { 0, NULL, false }; + struct ggml_init_params params = { 0, nullptr, false }; struct ggml_context * ctx = ggml_init(params); ggml_free(ctx); } @@ -158,7 +158,7 @@ static struct llama_model * llama_model_load_from_file_impl( } unsigned cur_percentage = 0; - if (params.progress_callback == NULL) { + if (params.progress_callback == nullptr) { params.progress_callback_user_data = &cur_percentage; params.progress_callback = [](float progress, void * ctx) { unsigned * cur_percentage_p = (unsigned *) ctx;
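As in the earlier files, these hunks replace NULL with nullptr without changing behavior. For context, the motivation is type safety during overload resolution: NULL is an integer constant (0 or 0L), while nullptr has type std::nullptr_t and converts only to pointer types. A minimal standalone example, not taken from the patch (report() is a hypothetical function used only for illustration):

#include <cstdio>

// NULL can silently participate in integer conversions; nullptr cannot.
static void report(int code)          { std::printf("int overload: %d\n", code); }
static void report(const char * name) { std::printf("pointer overload: %s\n", name ? name : "(none)"); }

int main() {
    // report(NULL);  // may resolve to report(int) or be rejected as ambiguous, depending on how NULL is defined
    report(nullptr);  // unambiguously selects report(const char *)
    return 0;
}

Passing nullptr to C-style interfaces, as in strtol(buf.c_str(), nullptr, 16) or the { 0, nullptr, false } initializer above, is equally valid, since std::nullptr_t converts implicitly to any raw pointer type.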