
Commit 0ad1d34

Authored by ikawrakow (Iwan Kawrakow) and a co-author
Enable and clean up compiler warnings in src (ikawrakow#824)
* WIP: enable and clean up warnings in src
* All warnings handled

Co-authored-by: Iwan Kawrakow <[email protected]>
1 parent 335a1f9 commit 0ad1d34

6 files changed: +76 −317 lines changed

src/CMakeLists.txt

Lines changed: 28 additions & 2 deletions
@@ -11,6 +11,32 @@ endif()

 # llama

+if (LLAMA_ALL_WARNINGS)
+    if (NOT MSVC)
+        list(APPEND WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-sign-compare)
+        if (APPLE)
+            # shut up c99 extensions warning I get on my system due to arm_neon.h
+            list(APPEND WARNING_FLAGS -Wno-c99-extensions)
+        endif()
+        list(APPEND C_FLAGS -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes
+                            -Werror=implicit-int -Werror=implicit-function-declaration)
+        list(APPEND CXX_FLAGS -Wmissing-declarations -Wmissing-noreturn)
+
+        list(APPEND C_FLAGS   ${WARNING_FLAGS})
+        list(APPEND CXX_FLAGS ${WARNING_FLAGS})
+
+        get_flags(${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION})
+
+        add_compile_options("$<$<COMPILE_LANGUAGE:C>:${C_FLAGS};${GF_C_FLAGS}>"
+                            "$<$<COMPILE_LANGUAGE:CXX>:${CXX_FLAGS};${GF_CXX_FLAGS}>")
+    else()
+        # todo : msvc
+        set(C_FLAGS   "")
+        set(CXX_FLAGS "")
+    endif()
+endif()
+
+
 add_library(llama
         ../include/llama.h
         llama.cpp
@@ -34,9 +60,9 @@ add_library(llama
         unicode-data.cpp
         )

-target_include_directories(llama PUBLIC . ../include ../common)
+target_include_directories(llama PUBLIC . ../include)
 target_include_directories(llama PRIVATE ../ggml/src)
-target_compile_features (llama PUBLIC cxx_std_11) # don't bump
+target_compile_features (llama PUBLIC cxx_std_17)

 target_link_libraries(llama PUBLIC ggml)

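For context on what the new flags catch: -Wmissing-declarations (C++) and -Wmissing-prototypes (C) warn when a function with external linkage is defined without a prior declaration, which usually means it should either be declared in a header or given internal linkage. Below is a minimal sketch, not from this commit (hypothetical file and function names), of code that trips the warning and two ways to silence it when compiled with the flags enabled above:

    // warn_demo.cpp (hypothetical example)
    // Compile with: g++ -std=c++17 -Wall -Wextra -Wmissing-declarations -Wno-unused-function -c warn_demo.cpp

    // External linkage, no previous declaration: -Wmissing-declarations fires here.
    int scale_twice(int x) { return x * 2; }

    // Fix 1: give the function internal linkage if it is only used in this file.
    static int scale_twice_internal(int x) { return x * 2; }

    // Fix 2: declare it first (normally in a header shared with callers).
    int scale_thrice(int x);
    int scale_thrice(int x) { return x * 3; }

The bump from cxx_std_11 to cxx_std_17 also matters for the cleanup below: the [[maybe_unused]] attribute added in src/llama-load-tensors.cpp is a C++17 feature.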

src/llama-build-context.cpp

Lines changed: 6 additions & 8 deletions
@@ -455,13 +455,11 @@ void llm_build_context::llm_build_kv_store(
                 int64_t il) {
     const int64_t n_ctx = cparams.n_ctx;

-    const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
+    //const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
     const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);

-    const int64_t n_head = hparams.n_head(il);
     const int64_t n_head_kv = hparams.n_head_kv(il);
     const int64_t n_embd_head_k = hparams.n_embd_head_k;
-    const int64_t n_embd_head_v = hparams.n_embd_head_v;

     GGML_ASSERT(kv.size == n_ctx);

@@ -957,7 +955,7 @@ static ggml_tensor * llm_build_kqv(
     const int64_t n_head = hparams.n_head(il);
     const int64_t n_head_kv = hparams.n_head_kv(il);
     const int64_t n_embd_head_k = hparams.n_embd_head_k;
-    const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
+    //const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
     const int64_t n_embd_head_v = hparams.n_embd_head_v;
     const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);

@@ -1082,7 +1080,7 @@ static ggml_tensor * llm_build_kqv(
         auto r2v = q->ne[2] / v->ne[2];
         n_step = q->ne[2];
         n_per_step = 1;
-        ggml_tensor * kqv;
+        ggml_tensor * kqv = nullptr;
         for (int i12 = 0; i12 < q->ne[2]; i12 += n_per_step) {
             int this_ne12 = i12 + n_per_step <= q->ne[2] ? n_per_step : q->ne[2] - i12;
             int i02 = i12/r2k;
@@ -5889,7 +5887,7 @@ ggml_cgraph * llm_build_context::build_deepseek2() {

         if (lctx.cparams.mla_attn) {

-            ggml_tensor * kv_cache_trans;
+            ggml_tensor * kv_cache_trans = nullptr;

             if (lctx.cparams.mla_attn == 1 && !lctx.cparams.flash_attn) {
                 ggml_tensor * kv_cache_trans_view = ggml_view_2d(ctx0, kv_self.v_l[il], n_tokens, kv_lora_rank,
@@ -6018,9 +6016,9 @@ ggml_cgraph * llm_build_context::build_deepseek2() {
             }
             else {

-                ggml_tensor * kqv_compressed;
+                ggml_tensor * kqv_compressed = nullptr;

-                auto wkv_b = model.layers[il].wkv_b;
+                //auto wkv_b = model.layers[il].wkv_b;
                 auto wk_b = model.layers[il].wk_b->ne[1] == kv_lora_rank ? model.layers[il].wk_b
                           : ggml_reshape_3d(ctx0, model.layers[il].wk_b, n_embd_head_qk_nope, kv_lora_rank, n_head);

src/llama-load-tensors.cpp

Lines changed: 17 additions & 17 deletions
@@ -216,23 +216,23 @@ ggml_context * create_tensors_helper::ctx_for_layer_split(int i) const {
 }

 #define LOADING_PRELUDE \
-    const auto & hparams = model.hparams; \
-    const int64_t n_layer = hparams.n_layer; \
-    const int64_t n_head = hparams.n_head(); \
-    const int64_t n_head_kv = hparams.n_head_kv(); \
-    const int64_t n_embd = hparams.n_embd; \
-    const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(); \
-    const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(); \
-    const int64_t n_embd_head_k = hparams.n_embd_head_k; \
-    const int64_t n_embd_head_v = hparams.n_embd_head_v; \
-    const int64_t n_ff = hparams.n_ff(); \
-    const int64_t n_embd_gqa = n_embd_v_gqa; \
-    const int64_t n_vocab = hparams.n_vocab; \
-    const int64_t n_vocab_type = hparams.n_vocab_type; \
-    const int64_t n_rot = hparams.n_rot; \
-    const int64_t n_expert = hparams.n_expert; \
-    const int64_t n_expert_used = hparams.n_expert_used; \
-    const int64_t n_ctx_train = hparams.n_ctx_train; \
+    [[maybe_unused]] const auto & hparams = model.hparams; \
+    [[maybe_unused]] const int64_t n_layer = hparams.n_layer; \
+    [[maybe_unused]] const int64_t n_head = hparams.n_head(); \
+    [[maybe_unused]] const int64_t n_head_kv = hparams.n_head_kv(); \
+    [[maybe_unused]] const int64_t n_embd = hparams.n_embd; \
+    [[maybe_unused]] const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(); \
+    [[maybe_unused]] const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(); \
+    [[maybe_unused]] const int64_t n_embd_head_k = hparams.n_embd_head_k; \
+    [[maybe_unused]] const int64_t n_embd_head_v = hparams.n_embd_head_v; \
+    [[maybe_unused]] const int64_t n_ff = hparams.n_ff(); \
+    [[maybe_unused]] const int64_t n_embd_gqa = n_embd_v_gqa; \
+    [[maybe_unused]] const int64_t n_vocab = hparams.n_vocab; \
+    [[maybe_unused]] const int64_t n_vocab_type = hparams.n_vocab_type; \
+    [[maybe_unused]] const int64_t n_rot = hparams.n_rot; \
+    [[maybe_unused]] const int64_t n_expert = hparams.n_expert; \
+    [[maybe_unused]] const int64_t n_expert_used = hparams.n_expert_used; \
+    [[maybe_unused]] const int64_t n_ctx_train = hparams.n_ctx_train; \
     if (n_expert > 0 && hparams.n_expert_used == 0) { \
         throw std::runtime_error("model has expert layers but no expert layers are used"); \
     } \
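The LOADING_PRELUDE macro expands the same block of hyperparameter locals into every per-architecture loader, but each architecture reads only a subset of them, so -Wall/-Wextra would report the rest as unused. Tagging each local [[maybe_unused]] (a C++17 attribute, hence the cxx_std_17 requirement above) documents that the omission is intentional. A small sketch of the idea, with a hypothetical function name:

    #include <cstdint>

    void build_layer(int64_t n_embd) {
        // Without the attribute, -Wunused-variable fires for a local that is
        // declared but never read; [[maybe_unused]] marks that as deliberate.
        [[maybe_unused]] const int64_t n_rot = n_embd / 2;
    }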

src/llama-sampling.cpp

Lines changed: 11 additions & 12 deletions
@@ -450,8 +450,6 @@ void llama_sample_xtc_impl(struct llama_sampling * smpl, llama_token_data_array

     llama_sample_softmax_impl(nullptr, candidates);

-    auto cur_size = candidates->size;
-
     int pos_last = 0;

     for (size_t i = 0; i < candidates->size; ++i) {
@@ -471,7 +469,7 @@ void llama_sample_xtc_impl(struct llama_sampling * smpl, llama_token_data_array
 }

 void llama_sample_top_n_sigma_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float top_n_sigma) {
-
+
     if (top_n_sigma <= 0.0f || candidates->size < 4) {
         // top_n_sigma <= 0: disabled
         // candidates->size < 4: no point in applying the transformation for fewer than 4 logits.
@@ -1132,14 +1130,15 @@ static void llama_sampler_grammar_free(struct llama_sampler* smpl) {
     delete ctx;
 }

-static struct llama_sampler_i llama_sampler_grammar_i = {
-    /* .name   = */ llama_sampler_grammar_name,
-    /* .accept = */ llama_sampler_grammar_accept_impl,
-    /* .apply  = */ llama_sampler_grammar_apply,
-    /* .reset  = */ llama_sampler_grammar_reset,
-    /* .clone  = */ NULL,
-    /* .free   = */ llama_sampler_grammar_free,
-};
+// ?
+//static struct llama_sampler_i llama_sampler_grammar_i = {
+//    /* .name   = */ llama_sampler_grammar_name,
+//    /* .accept = */ llama_sampler_grammar_accept_impl,
+//    /* .apply  = */ llama_sampler_grammar_apply,
+//    /* .reset  = */ llama_sampler_grammar_reset,
+//    /* .clone  = */ NULL,
+//    /* .free   = */ llama_sampler_grammar_free,
+//};

 struct llama_grammar* llama_sampler_init_grammar_impl(
         const struct llama_vocab* vocab,
@@ -1152,7 +1151,7 @@ struct llama_grammar* llama_sampler_init_grammar_impl(
         size_t num_trigger_tokens,
         const char** trigger_patterns,
         size_t num_trigger_patterns) {
-    auto* ctx = new llama_sampler_grammar;
+    // Huh? this is not used and leaks. auto* ctx = new llama_sampler_grammar;
     struct llama_grammar* grammar;
     if (grammar_str != nullptr && grammar_str[0] != '\0') {
         // TODO: remove trigger_words support.
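Three things are cleaned up here: the unused local cur_size in llama_sample_xtc_impl is dropped, an unreferenced static llama_sampler_i table is commented out (marked with a "// ?" for later review), and an allocation in llama_sampler_init_grammar_impl whose result was never used, i.e. a leak, is commented out with a note. A short sketch of the unused-local and leak patterns, using hypothetical names rather than the real sampler types:

    #include <cstddef>
    #include <vector>

    int bump_all(std::vector<int> & v);
    int bump_all(std::vector<int> & v) {
        // auto cur_size = v.size();   // never read again: -Wunused-variable, hence removed
        // new int[16];                // allocated, never used, never freed: a pure leak, hence removed
        int last = 0;
        for (std::size_t i = 0; i < v.size(); ++i) {
            v[i] += 1;
            last = v[i];
        }
        return last;
    }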

0 commit comments
