@@ -73,6 +73,8 @@ extern int num_probs_bottoms;
 
 extern float confidence_total;
 
+extern std::vector<llama_token> last_candidates_logits;
+
 #define SESSIONS_FOLDER "sessions/"
 
 static common_params paramsDefault;
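
Note on the new extern above (the defining side is not part of this diff): an extern declaration only announces the vector, so exactly one translation unit must still define it, presumably next to the sampling code that fills it. A minimal sketch of that assumed definition:

    // hypothetical defining .cpp (not shown in this commit)
    #include <vector>
    #include "llama.h"

    std::vector<llama_token> last_candidates_logits; // candidate tokens recorded at each sampling step
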
@@ -190,6 +192,7 @@ class chat
 private:
 
     llama_context * ctx = nullptr;
+    llama_memory_t mem = nullptr;
     llama_model * model = nullptr;
     common_sampler * smpl = nullptr;
     const llama_vocab * vocab = nullptr;
@@ -289,6 +292,8 @@ class chat
     std::string logit_bias_strings_ext_display = "";
     std::string logit_bias_strings_start_display = "";
 
+    std::string last_candidates_logits_display = "";
+
     struct llama_perf_context_data ctx_performance_data;
 
     // std::map<std::string,std::string> stats;
@@ -765,6 +770,14 @@ class chat
         }
     }
 
+    void get_last_candidates_logits_display() {
+        last_candidates_logits_display.clear();
+
+        for (auto logit : last_candidates_logits) {
+            last_candidates_logits_display += std::format("{}; ", common_token_to_piece(ctx, logit));
+        }
+    }
+
     void params_postfill() {
         if (params.kv_overrides_pair.size()) kv_override_prefill();
         common_process_override_tensors(params);
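
The new helper uses std::format, so it needs the C++20 <format> header. If the project targets an older standard, the same "piece; piece; " layout can be built with plain concatenation; a sketch of that fallback, reusing the same names:

    // pre-C++20 fallback sketch; mirrors the loop above
    last_candidates_logits_display.clear();
    for (auto tok : last_candidates_logits) {
        last_candidates_logits_display += common_token_to_piece(ctx, tok);
        last_candidates_logits_display += "; ";
    }
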
@@ -1250,6 +1263,9 @@ class chat
         ctx = llama_init.context.release();
         printf("..............CONTEXT INITIALIZED (%s)................\n", __func__);
 
+        mem = llama_get_memory(ctx);
+        printf("..............MEM INITIALIZED (%s)................\n", __func__);
+
         assignThreads();
         printf("..............THREADS ASSIGNED (%s)................\n", __func__);
 
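
For reference, llama_get_memory() is the upstream llama.cpp accessor that replaces the llama_kv_self_* family used in the hunks below. The handle refers to state owned by the context, so the ordering here matters; a sketch of the assumed lifetime:

    llama_context * ctx = /* created first */;
    llama_memory_t  mem = llama_get_memory(ctx); // handle into ctx's memory
    // assumed: no separate free call — the handle stays valid for ctx's
    // lifetime and is torn down together with the context
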
@@ -1402,7 +1418,7 @@ class chat
 
             // remove any "future" tokens that we might have inherited from the previous session
             // llama_kv_cache_tokens_rm(ctx, n_matching_session_tokens, -1);
-            llama_kv_self_seq_rm(ctx, -1, n_matching_session_tokens, -1);
+            llama_memory_seq_rm(mem, -1, n_matching_session_tokens, -1);
         }
 
         // if we will use the cache for the full prompt without reaching the end of the cache, force
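
Semantics carry over unchanged: llama_memory_seq_rm(mem, seq_id, p0, p1) removes cells with positions in [p0, p1), where seq_id == -1 matches every sequence and p1 == -1 means "to the end". With a hypothetical n_matching_session_tokens of 120, the call above reduces to:

    // keep the reused 120-token session prefix, drop everything after it,
    // across all sequences
    llama_memory_seq_rm(mem, -1, 120, -1);
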
@@ -1475,8 +1491,8 @@ class chat
             // always keep the first token - BOS
             // n_past = std::max(1, params.n_keep);
             // n_past_guidance = std::max(1, params.n_keep + guidance_offset);
-            llama_kv_self_seq_rm (ctx, 0, params.n_keep, params.n_keep + n_discard);
-            llama_kv_self_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);
+            llama_memory_seq_rm (mem, 0, params.n_keep, params.n_keep + n_discard);
+            llama_memory_seq_add(mem, 0, params.n_keep + n_discard, n_past, -n_discard);
 
             // insert n_left/2 tokens at the start of embd from last_n_tokens
             // embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size());
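
The rm/add pair is the usual llama.cpp context shift: drop the oldest half of the non-kept tokens, then slide the survivors left so positions stay contiguous. A worked trace with hypothetical values n_past = 1000 and params.n_keep = 4 (so n_left = 996, n_discard = 498):

    llama_memory_seq_rm (mem, 0, 4, 4 + 498);          // drop positions [4, 502)
    llama_memory_seq_add(mem, 0, 4 + 498, 1000, -498); // slide [502, 1000) down by 498
    // n_past -= n_discard  ->  decoding resumes at position 502
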
@@ -1510,8 +1526,8 @@ class chat
             const int n_left = n_past - params.n_keep;
             const int n_discard = n_left/2;
 
-            llama_kv_self_seq_rm (ctx, 0, params.n_keep, params.n_keep + n_discard);
-            llama_kv_self_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);
+            llama_memory_seq_rm (mem, 0, params.n_keep, params.n_keep + n_discard);
+            llama_memory_seq_add(mem, 0, params.n_keep + n_discard, n_past, -n_discard);
 
             n_past -= n_discard;
 
@@ -1524,9 +1540,9 @@ class chat
             const int bd = (ga_w/ga_n)*(ga_n - 1);
             const int dd = (ga_w/ga_n) - ib*bd - ga_w;
 
-            llama_kv_self_seq_add(ctx, 0, ga_i, n_past, ib*bd);
-            llama_kv_self_seq_div(ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n);
-            llama_kv_self_seq_add(ctx, 0, ga_i + ib*bd + ga_w, n_past + ib*bd, dd);
+            llama_memory_seq_add(mem, 0, ga_i, n_past, ib*bd);
+            llama_memory_seq_div(mem, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n);
+            llama_memory_seq_add(mem, 0, ga_i + ib*bd + ga_w, n_past + ib*bd, dd);
 
             n_past -= bd;
 
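
These three calls are the grouped self-attention ("Self-Extend") step from the upstream main example, ported to the memory handle: shift, compress one ga_w-wide window by a factor of ga_n, then shift the tail back. A trace with hypothetical values ga_n = 2, ga_w = 4, ga_i = 0, n_past = 8, which give ib = 0, bd = 2, dd = -2:

    llama_memory_seq_add(mem, 0, 0, 8, 0);  // ib*bd == 0: nothing moves yet
    llama_memory_seq_div(mem, 0, 0, 4, 2);  // positions 0,1,2,3 -> 0,0,1,1
    llama_memory_seq_add(mem, 0, 4, 8, -2); // positions 4..7 slide down to 2..5
    // n_past -= bd  ->  6
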
@@ -1650,6 +1666,8 @@ class chat
         // const llama_token id = common_sampler_sample(smpl, ctx, -1);
         llama_token id = common_sampler_sample(smpl, ctx, -1);
 
+        get_last_candidates_logits_display();
+
         // try to sample a different token to avoid empty messages
         int attempts = 1000; // safeguard
         while (emptyMessage == true && llama_token_is_eog(vocab, id) && attempts > 0) {
@@ -1738,7 +1756,7 @@ class chat
         capture_smpl();
         // rewind_state.capture_kv_cache(llama_kv_cache_seq_pos_max(ctx, 0));
         // rewind_state.capture_kv_cache(llama_kv_self_seq_pos_max(ctx, -1));
-        rewind_state.capture_kv_cache(llama_kv_self_seq_pos_max(ctx, 0));
+        rewind_state.capture_kv_cache(llama_memory_seq_pos_max(mem, 0));
         rewind_state.capture_embd_inp(embd_inp.size());
         rewind_state.capture_n_past(n_past);
         rewind_state.capture_n_consumed(n_consumed);
@@ -1748,7 +1766,7 @@ class chat
     int get_kv_cache_seq_pos_max() {
         // return llama_kv_cache_seq_pos_max(ctx, 0);
         // return llama_kv_self_seq_pos_max(ctx, -1);
-        return llama_kv_self_seq_pos_max(ctx, 0);
+        return llama_memory_seq_pos_max(mem, 0);
     }
 
     void clearStates2() {
@@ -1764,7 +1782,7 @@ class chat
         restore_smpl();
         // common_sampler_reset(smpl);
         // context
-        llama_kv_self_seq_rm(ctx, 0, rewind_state.kv_cache_pos, -1);
+        llama_memory_seq_rm(mem, 0, rewind_state.kv_cache_pos, -1);
         // llama_kv_self_seq_rm(ctx, -1, rewind_state.kv_cache_pos, -1);
         // llama_kv_cache_update(ctx);
         // chat parameters
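
Taken together, the last three hunks keep the rewind mechanism symmetric on the new API: the position recorded through llama_memory_seq_pos_max() is exactly the cut point later handed back to llama_memory_seq_rm():

    // capture: remember how far sequence 0 has been decoded
    rewind_state.capture_kv_cache(llama_memory_seq_pos_max(mem, 0));
    // ... generation ...
    // restore: drop everything decoded past the captured position
    llama_memory_seq_rm(mem, 0, rewind_state.kv_cache_pos, -1);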