Skip to content

Commit cc01fae

Browse files
committed
Latest commits, more detailed bias settings
* `logit_bias_strings_exact` — for tokens that match a word exactly (with or without a leading space)
* `logit_bias_strings_beginning` — for tokens that match the beginning of a word, or vice versa
* `logit_bias_strings_ending` — for tokens that match the ending of a word
1 parent f51519e commit cc01fae

32 files changed

+1824
-154
lines changed

Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -473,6 +473,7 @@ OBJS_GGUF_LLAMA = \
473473
$(TMP)$(PREFIX)_llama-batch.o \
474474
$(TMP)$(PREFIX)_llama-chat.o \
475475
$(TMP)$(PREFIX)_llama-context.o \
476+
$(TMP)$(PREFIX)_llama-cparams.o \
476477
$(TMP)$(PREFIX)_llama-grammar.o \
477478
$(TMP)$(PREFIX)_llama-graph.o \
478479
$(TMP)$(PREFIX)_llama-hparams.o \

base_sampling2/chat_layer.h

Lines changed: 50 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -596,6 +596,45 @@ class chat
596596
// std::getline(std::cin, pause);
597597
}
598598

599+
bool logit_bias_check_exact(std::string_view token_str) {
600+
for (auto word : params.sparams.logit_bias_strings_exact) {
601+
if (token_str == word) return true;
602+
}
603+
604+
return false;
605+
}
606+
607+
bool logit_bias_check_beginning(std::string_view token_str) {
608+
for (auto word : params.sparams.logit_bias_strings_beginning) {
609+
if ((token_str.find(word) == 0 && (token_str.length() - word.length()) < 4) ||
610+
(token_str.length() > 2 && word.find(token_str) == 0)
611+
) return true;
612+
}
613+
614+
return false;
615+
}
616+
617+
bool logit_bias_check_ending(std::string_view token_str) {
618+
for (auto word : params.sparams.logit_bias_strings_ending) {
619+
auto token_str_pos = word.find(token_str);
620+
if (token_str_pos == (token_str.length() - 1)) return true;
621+
}
622+
623+
return false;
624+
}
625+
626+
bool logit_bias_checks(std::string token_str) {
627+
if (token_str.front() == ' ') {
628+
token_str = token_str.substr(1);
629+
}
630+
631+
if (token_str.back() == ' ') {
632+
token_str.pop_back();
633+
}
634+
635+
return logit_bias_check_exact(token_str) || logit_bias_check_beginning(token_str) || logit_bias_check_ending(token_str);
636+
}
637+
599638
void logit_bias_postfill(llama_token & id, std::string token_str) {
600639
// cutting spaces since there are "duplicated" tokens with them
601640
if (token_str.front() == ' ') {
@@ -687,14 +726,21 @@ class chat
687726
}
688727

689728
void processByVocab(std::string safeguard_string) {
729+
bool has_logit_biases_detailed = (params.sparams.logit_bias_strings_exact.size() || params.sparams.logit_bias_strings_beginning.size() || params.sparams.logit_bias_strings_ending.size());
730+
690731
bool has_logit_biases = (params.sparams.logit_bias_strings.size() || params.sparams.logit_bias_strings_ext.size());
691732
bool has_logit_biases_start = params.sparams.logit_bias_strings_start.size();
692733

693734
for (llama_token id = 0; id < llama_vocab_n_tokens(vocab); id++) {
694735
std::string token_str = common_token_to_piece(ctx, id);
695736

696-
if (has_logit_biases) logit_bias_postfill(id, token_str);
697-
if (has_logit_biases_start) start_bias_tokens_postfill(id, token_str);
737+
if (has_logit_biases_detailed == true && logit_bias_checks(token_str) == true) {
738+
params.sparams.logit_bias.push_back({id, -INFINITY});
739+
} else if (has_logit_biases == true) {
740+
logit_bias_postfill(id, token_str);
741+
}
742+
743+
if (has_logit_biases_start == true) start_bias_tokens_postfill(id, token_str);
698744
if (safeguard_token < 0) get_safeguard_token(id, token_str, safeguard_string);
699745
}
700746

@@ -1636,7 +1682,7 @@ class chat
16361682
if (id == l) {
16371683
checks = 0;
16381684
std::string c_restricted_tkn_string = common_token_to_piece(ctx, id);
1639-
writeTextFile("logit_biasing.txt", std::format("Found: '{}';", c_restricted_tkn_string));
1685+
writeTextFile("logit_biasing.txt", std::format("{}: Found '{}';", params.sparams.seed, c_restricted_tkn_string));
16401686

16411687
id = common_sampler_shift(smpl, ctx, -1, id);
16421688

@@ -1659,7 +1705,7 @@ class chat
16591705
// --attempts;
16601706
// }
16611707

1662-
if (biased_logit.token == id) {
1708+
if (biased_logit.bias < -9 && biased_logit.token == id) {
16631709
++c_restricted_tkns;
16641710
// std::string c_restricted_tkn_string = common_token_to_piece(ctx, id);
16651711
// writeTextFile("logit_biasing.txt", std::format("+{}\n", c_restricted_tkn_string));

base_sampling2/common.h

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -195,7 +195,12 @@ struct common_params_sampling {
195195

196196
std::vector<llama_logit_bias> logit_bias; // logit biases to apply
197197

198-
std::vector<std::string> logit_bias_strings; // words for logit biases
198+
std::vector<std::string> logit_bias_strings; // words for logit biases, all matches
199+
std::vector<std::string> logit_bias_strings_exact; // words for logit biases, exact matches
200+
std::vector<std::string> logit_bias_strings_beginning; // words for logit biases, beginning of the word matches
201+
std::vector<std::string> logit_bias_strings_ending; // words for logit biases, ending of the word matches
202+
203+
199204
std::map<std::string, float> logit_bias_strings_ext; // words for logit biases, but with extra configuration
200205
std::vector<std::string> logit_bias_strings_start; // restricted beginnings of messages
201206

base_sampling2/include/jsonParams.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -533,6 +533,11 @@ static void getSamplingParamsFromJson(nlohmann::json& config, common_params& par
533533

534534
// logit_bias_strings
535535
if (checkJArr(config, "logit_bias_strings")) params.sparams.logit_bias_strings = config["logit_bias_strings"];
536+
if (checkJArr(config, "logit_bias_strings_exact")) params.sparams.logit_bias_strings_exact = config["logit_bias_strings_exact"];
537+
if (checkJArr(config, "logit_bias_strings_beginning")) params.sparams.logit_bias_strings_beginning = config["logit_bias_strings_beginning"];
538+
if (checkJArr(config, "logit_bias_strings_ending")) params.sparams.logit_bias_strings_ending = config["logit_bias_strings_ending"];
539+
540+
536541
if (checkJObj(config, "logit_bias_strings_ext")) params.sparams.logit_bias_strings_ext = config["logit_bias_strings_ext"];
537542
if (checkJArr(config, "logit_bias_strings_start")) params.sparams.logit_bias_strings_start = config["logit_bias_strings_start"];
538543

base_sampling2/llama-addon.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -674,7 +674,7 @@ static void llama_sampler_min_p_addon_apply(struct llama_sampler * smpl, llama_t
674674
}
675675

676676
// if we have enough values the operation was a success
677-
if (filtered_tokens.size() >= ctx->min_keep) {
677+
if (!filtered_tokens.empty() && filtered_tokens.size() >= ctx->min_keep) {
678678
memcpy(cur_p->data, filtered_tokens.data(), filtered_tokens.size()*sizeof(llama_token_data));
679679
cur_p->size = filtered_tokens.size();
680680
min_p_applied = true;

base_sampling2/master/ggml/include/ggml.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -935,6 +935,15 @@ extern "C" {
935935
struct ggml_tensor * a,
936936
struct ggml_tensor * b);
937937

938+
// repeat a to the specified shape
939+
GGML_API struct ggml_tensor * ggml_repeat_4d(
940+
struct ggml_context * ctx,
941+
struct ggml_tensor * a,
942+
int64_t ne0,
943+
int64_t ne1,
944+
int64_t ne2,
945+
int64_t ne3);
946+
938947
// sums repetitions in a into shape of b
939948
GGML_API struct ggml_tensor * ggml_repeat_back(
940949
struct ggml_context * ctx,

base_sampling2/master/ggml/src/ggml-backend.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1598,6 +1598,9 @@ void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) {
15981598
for (int i = 0; i < sched->n_backends; i++) {
15991599
ggml_backend_synchronize(sched->backends[i]);
16001600
}
1601+
// reset the current copy to 0 so that the graphs will be similar during generation
1602+
// necessary for CUDA graphs
1603+
sched->cur_copy = 0;
16011604
}
16021605

16031606
void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {

base_sampling2/master/ggml/src/ggml-cpu/CMakeLists.txt

Lines changed: 23 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -299,6 +299,25 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
299299
endif()
300300
endif()
301301
endif()
302+
303+
if (GGML_BACKEND_DL)
304+
if (GGML_NATIVE)
305+
# the feature check relies on ARCH_DEFINITIONS, but it is not set with GGML_NATIVE
306+
message(FATAL_ERROR "GGML_NATIVE is not compatible with GGML_BACKEND_DL, consider using GGML_CPU_ALL_VARIANTS")
307+
endif()
308+
309+
# The feature detection code is compiled as a separate target so that
310+
# it can be built without the architecture flags
311+
# Since multiple variants of the CPU backend may be included in the same
312+
# build, using set_source_files_properties() to set the arch flags is not possible
313+
set(GGML_CPU_FEATS_NAME ${GGML_CPU_NAME}-feats)
314+
add_library(${GGML_CPU_FEATS_NAME} OBJECT ggml-cpu/cpu-feats-x86.cpp)
315+
target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . .. ../include)
316+
target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE ${ARCH_DEFINITIONS})
317+
target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED)
318+
set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
319+
target_link_libraries(${GGML_CPU_NAME} PRIVATE ${GGML_CPU_FEATS_NAME})
320+
endif()
302321
elseif ("${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "ppc64le " OR "${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "powerpc ")
303322
message(STATUS "PowerPC detected")
304323
if (GGML_NATIVE)
@@ -338,8 +357,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
338357
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "riscv64")
339358
message(STATUS "RISC-V detected")
340359
if (GGML_RVV)
341-
if (GGML_RV_ZFH)
342-
list(APPEND ARCH_FLAGS -march=rv64gcv_zfhmin -DGGML_RV_ZFH -mabi=lp64d)
360+
if (GGML_XTHEADVECTOR)
361+
list(APPEND ARCH_FLAGS -march=rv64gc_xtheadvector -mabi=lp64d)
362+
elseif (GGML_RV_ZFH)
363+
list(APPEND ARCH_FLAGS -march=rv64gcv_zfhmin -mabi=lp64d)
343364
else()
344365
list(APPEND ARCH_FLAGS -march=rv64gcv -mabi=lp64d)
345366
endif()
@@ -477,25 +498,6 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
477498
target_compile_options(${GGML_CPU_NAME} PRIVATE ${ARCH_FLAGS})
478499
target_compile_definitions(${GGML_CPU_NAME} PRIVATE ${ARCH_DEFINITIONS})
479500

480-
if (GGML_BACKEND_DL)
481-
if (GGML_NATIVE)
482-
# the feature check relies on ARCH_DEFINITIONS, but it is not set with GGML_NATIVE
483-
message(FATAL_ERROR "GGML_NATIVE is not compatible with GGML_BACKEND_DL, consider using GGML_CPU_ALL_VARIANTS")
484-
endif()
485-
486-
# The feature detection code is compiled as a separate target so that
487-
# it can be built without the architecture flags
488-
# Since multiple variants of the CPU backend may be included in the same
489-
# build, using set_source_files_properties() to set the arch flags is not possible
490-
set(GGML_CPU_FEATS_NAME ${GGML_CPU_NAME}-feats)
491-
add_library(${GGML_CPU_FEATS_NAME} OBJECT ggml-cpu/cpu-feats-x86.cpp)
492-
target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . .. ../include)
493-
target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE ${ARCH_DEFINITIONS})
494-
target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED)
495-
set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
496-
target_link_libraries(${GGML_CPU_NAME} PRIVATE ${GGML_CPU_FEATS_NAME})
497-
endif()
498-
499501
if (EMSCRIPTEN)
500502
set_target_properties(${GGML_CPU_NAME} PROPERTIES COMPILE_FLAGS "-msimd128")
501503
endif()

base_sampling2/master/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1191,7 +1191,7 @@ static void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, c
11911191
}
11921192
}
11931193
return;
1194-
#elif defined(__riscv_v_intrinsic)
1194+
#elif defined __riscv_v
11951195
if (__riscv_vlenb() >= QK4_0) {
11961196
const size_t vl = QK4_0;
11971197

@@ -3783,7 +3783,7 @@ static void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, c
37833783
}
37843784
return;
37853785
}
3786-
#elif defined(__riscv_v_intrinsic)
3786+
#elif defined __riscv_v
37873787
if (__riscv_vlenb() >= QK4_0) {
37883788
const size_t vl = QK4_0;
37893789

base_sampling2/master/ggml/src/ggml-cpu/ggml-cpu-impl.h

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -320,21 +320,17 @@ inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b)
320320

321321
#ifdef __wasm_simd128__
322322
#include <wasm_simd128.h>
323-
#else
323+
#endif
324+
324325
#ifdef __POWER9_VECTOR__
325326
#include <altivec.h>
326-
#else
327+
#endif
328+
327329
#if defined(_MSC_VER) || defined(__MINGW32__)
328330
#include <intrin.h>
329-
#else
330-
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) || defined(__SSE__)
331-
#if !defined(__riscv)
331+
#elif defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) || defined(__SSE__)
332332
#include <immintrin.h>
333333
#endif
334-
#endif
335-
#endif
336-
#endif
337-
#endif
338334

339335
#ifdef __riscv_v_intrinsic
340336
#include <riscv_vector.h>

0 commit comments

Comments
 (0)