Commit b4dc29f

kobo cheats death again (+1 squashed commits)
Squashed commits: [708e2429] kobo cheats death again
1 parent f9f1585 commit b4dc29f

10 files changed, +225 −46 lines changed


Makefile

Lines changed: 1 addition & 1 deletion
@@ -626,7 +626,7 @@ whispercpp_cublas.o: otherarch/whispercpp/whisper_adapter.cpp
 	$(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) $(HIPFLAGS) -c $< -o $@
 
 # idiotic "for easier compilation"
-GPTTYPE_ADAPTER = gpttype_adapter.cpp otherarch/llama_v2.cpp otherarch/llama_v3.cpp src/llama.cpp src/llama-grammar.cpp src/llama-sampling.cpp src/llama-vocab.cpp otherarch/utils.cpp otherarch/gptj_v1.cpp otherarch/gptj_v2.cpp otherarch/gptj_v3.cpp otherarch/gpt2_v1.cpp otherarch/gpt2_v2.cpp otherarch/gpt2_v3.cpp otherarch/rwkv_v2.cpp otherarch/rwkv_v3.cpp otherarch/neox_v2.cpp otherarch/neox_v3.cpp otherarch/mpt_v3.cpp ggml/include/ggml.h ggml/include/ggml-cpu.h ggml/include/ggml-cuda.h include/llama.h otherarch/llama-util.h
+GPTTYPE_ADAPTER = gpttype_adapter.cpp otherarch/llama_v2.cpp otherarch/llama_v3.cpp src/llama.cpp src/llama-impl.cpp src/llama-chat.cpp src/llama-mmap.cpp src/llama-context.cpp src/llama-adapter.cpp src/llama-arch.cpp src/llama-batch.cpp src/llama-vocab.cpp src/llama-grammar.cpp src/llama-sampling.cpp src/llama-kv-cache.cpp src/llama-model-loader.cpp src/llama-model.cpp src/llama-quant.cpp src/llama-hparams.cpp otherarch/utils.cpp otherarch/gptj_v1.cpp otherarch/gptj_v2.cpp otherarch/gptj_v3.cpp otherarch/gpt2_v1.cpp otherarch/gpt2_v2.cpp otherarch/gpt2_v3.cpp otherarch/rwkv_v2.cpp otherarch/rwkv_v3.cpp otherarch/neox_v2.cpp otherarch/neox_v3.cpp otherarch/mpt_v3.cpp ggml/include/ggml.h ggml/include/ggml-cpu.h ggml/include/ggml-cuda.h include/llama.h otherarch/llama-util.h
 gpttype_adapter_failsafe.o: $(GPTTYPE_ADAPTER)
 	$(CXX) $(CXXFLAGS) $(FAILSAFE_FLAGS) -c $< -o $@
 gpttype_adapter.o: $(GPTTYPE_ADAPTER)

include/llama.h

Lines changed: 0 additions & 2 deletions
@@ -631,8 +631,6 @@ extern "C" {
             llama_pos   p0,
             llama_pos   p1);
 
-    LLAMA_API void printcache(struct llama_context * ctx);
-
     // Copy all tokens that belong to the specified sequence to another sequence
     // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
     // p0 < 0 : [0, p1]

otherarch/sdcpp/model.cpp

Lines changed: 13 additions & 0 deletions
@@ -28,6 +28,19 @@
 
 #define ST_HEADER_SIZE_LEN 8
 
+static std::string format(const char* fmt, ...) {
+    va_list ap;
+    va_list ap2;
+    va_start(ap, fmt);
+    va_copy(ap2, ap);
+    int size = vsnprintf(NULL, 0, fmt, ap);
+    std::vector<char> buf(size + 1);
+    int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
+    va_end(ap2);
+    va_end(ap);
+    return std::string(buf.data(), size);
+}
+
 uint64_t read_u64(uint8_t* buffer) {
     // little endian
     uint64_t value = 0;
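Note: this is the same printf-style format() helper that is removed from util.cpp below, now kept as a file-local static. A minimal usage sketch (the tensor name and dimension count are made-up values, and it assumes <cstdarg>, <vector> and <string> are already available in model.cpp, as the helper itself requires):

    std::string msg = format("tensor '%s' has %d dims", "example.weight", 4);
    // msg == "tensor 'example.weight' has 4 dims"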

otherarch/sdcpp/util.cpp

Lines changed: 0 additions & 13 deletions
@@ -62,19 +62,6 @@ void replace_all_chars(std::string& str, char target, char replacement) {
     }
 }
 
-std::string format(const char* fmt, ...) {
-    va_list ap;
-    va_list ap2;
-    va_start(ap, fmt);
-    va_copy(ap2, ap);
-    int size = vsnprintf(NULL, 0, fmt, ap);
-    std::vector<char> buf(size + 1);
-    int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
-    va_end(ap2);
-    va_end(ap);
-    return std::string(buf.data(), size);
-}
-
 #ifdef _WIN32 // code for windows
 #include <windows.h>

otherarch/sdcpp/util.h

Lines changed: 0 additions & 2 deletions
@@ -11,8 +11,6 @@ bool ends_with(const std::string& str, const std::string& ending);
 bool starts_with(const std::string& str, const std::string& start);
 bool contains(const std::string& str, const std::string& substr);
 
-std::string format(const char* fmt, ...);
-
 void replace_all_chars(std::string& str, char target, char replacement);
 
 bool file_exists(const std::string& filename);

src/llama-mmap.cpp

Lines changed: 4 additions & 0 deletions
@@ -374,6 +374,7 @@ struct llama_mmap::impl {
             throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
         }
 
+#ifndef USE_FAILSAFE
         if (prefetch > 0) {
 #if _WIN32_WINNT >= 0x602
             BOOL (WINAPI *pPrefetchVirtualMemory) (HANDLE, ULONG_PTR, PWIN32_MEMORY_RANGE_ENTRY, ULONG);
@@ -394,6 +395,9 @@ struct llama_mmap::impl {
             throw std::runtime_error("PrefetchVirtualMemory unavailable");
 #endif
         }
+#else
+        printf("\nPrefetchVirtualMemory skipped in compatibility mode.\n");
+#endif
     }
 
     void unmap_fragment(size_t first, size_t last) {

src/llama-model-loader.cpp

Lines changed: 33 additions & 0 deletions
@@ -7,6 +7,10 @@
 #include <cstring>
 #include <future>
 
+#if defined(GGML_USE_CLBLAST)
+# include "ggml_v3b-opencl.h"
+#endif
+
 const char * llama_file_version_name(llama_fver version) {
     switch (version) {
         case GGUF_FILE_VERSION_V1: return "GGUF V1 (support until nov 2023)";
@@ -479,6 +483,7 @@ llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap,
 
     // determine file type based on the number of tensors for each quantization and print meta data
     // TODO: make optional
+    if(false) //disable this log for now
     {
         std::map<enum ggml_type, uint32_t> n_type;
 
@@ -776,6 +781,24 @@ void llama_model_loader::load_data_for(struct ggml_tensor * cur) const {
     }
 }
 
+static int clblast_offload_fallback_layers = 0;
+static int layer_name_to_number(std::string inputString)
+{
+    size_t firstDotPosition = inputString.find('.');
+    int converted = -1;
+
+    if (firstDotPosition != std::string::npos) {
+        size_t secondDotPosition = inputString.find('.', firstDotPosition + 1);
+        if (secondDotPosition != std::string::npos) {
+            std::string numbersPortion = inputString.substr(firstDotPosition + 1, secondDotPosition - firstDotPosition - 1);
+            try{converted = std::stoi(numbersPortion);}
+            catch (const std::invalid_argument& e) {}
+            catch (const std::out_of_range& e) {}
+        }
+    }
+    return converted;
+}
+
 bool llama_model_loader::load_all_data(
     struct ggml_context * ctx,
     llama_buf_map & bufs,
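For orientation, the helper above just reads the text between the first two dots of a tensor name, so layer-prefixed names map to their layer index. A small sketch with hypothetical tensor names (llama.cpp-style block weights are typically named "blk.<N>.<suffix>"):

    layer_name_to_number("blk.12.attn_q.weight"); // -> 12
    layer_name_to_number("output_norm.weight");   // -> -1 (no second dot)
    layer_name_to_number("blk.x.weight");         // -> -1 (std::stoi throws; the exception is swallowed)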
@@ -960,6 +983,16 @@
             }
         }
 
+#if defined(GGML_USE_CLBLAST)
+        int layernum = layer_name_to_number(cur->name);
+        bool shouldoffload = (layernum>=0 && clblast_offload_fallback_layers>layernum);
+        if(shouldoffload)
+        {
+            cur->backend = GGML_BACKEND_TYPE_GPU;
+            ggml_cl_transform_tensor(cur->data, cur);
+        }
+#endif
+
         size_done += n_size;
     }

src/llama-model.cpp

Lines changed: 24 additions & 4 deletions
@@ -11,6 +11,10 @@
 #include <sstream>
 #include <stdexcept>
 
+#if defined(GGML_USE_CLBLAST)
+# include "ggml_v3b-opencl.h"
+#endif
+
 static const size_t kiB = 1024;
 static const size_t MiB = 1024*kiB;
 static const size_t GiB = 1024*MiB;
@@ -150,6 +154,9 @@ static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t d
         throw std::runtime_error(format("failed to create ggml context"));
     }
 
+#if defined(GGML_USE_CLBLAST)
+    ggml_cl_init();
+#endif
     ggml_backend_buffer_ptr buf { ggml_backend_buft_alloc_buffer(buft, 0) };
     ggml_tensor * op_tensor = fn(ctx.get());
     for (int i = 0; i < GGML_MAX_SRC; i++) {
@@ -1153,6 +1160,16 @@ void llm_load_vocab(llama_model_loader & ml, llama_model & model) {
     const int n_merges = gguf_get_arr_n(ctx, merges_keyidx);
     for (int i = 0; i < n_merges; i++) {
         const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i);
+        if (!OldBPETokenizerMode)
+        {
+            auto validcodepoints = unicode_cpts_from_utf8(word).size() > 0;
+            GGML_ASSERT_CONTINUE(validcodepoints);
+            if(!validcodepoints)
+            {
+                OldBPETokenizerMode = true;
+                printf("\nFalling Back to older tokenizer...");
+            }
+        }
         GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);
 
         std::string first;
@@ -1398,10 +1415,13 @@ void llm_load_vocab(llama_model_loader & ml, llama_model & model) {
 
     for (uint32_t i = 0; i < n_vocab; i++) {
         std::string word = gguf_get_arr_str(ctx, token_idx, i);
-        if (word.empty()) {
+        if (!OldBPETokenizerMode)
+        {
+        if (word.empty()) {
             LLAMA_LOG_WARN("%s: empty token at index %u\n", __func__, i);
             word = "[EMPTY_" + std::to_string(i) + "]";
         }
+        }
 
         vocab.token_to_id[word] = i;
         vocab.max_token_len = std::max(vocab.max_token_len, (int) word.size());
@@ -1424,7 +1444,7 @@ void llm_load_vocab(llama_model_loader & ml, llama_model & model) {
             }
         }
     }
-    GGML_ASSERT(vocab.id_to_token.size() == vocab.token_to_id.size());
+    GGML_ASSERT_CONTINUE(vocab.id_to_token.size() == vocab.token_to_id.size());
 
     vocab.init_tokenizer();
 
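GGML_ASSERT_CONTINUE is the fork's non-fatal counterpart of GGML_ASSERT: it reports the failed condition but lets vocab loading continue, which is what allows the OldBPETokenizerMode fallback above to run at all. Its definition is not part of this diff; a hypothetical sketch of such a macro, for orientation only:

    // hypothetical sketch; the real macro lives in this fork's ggml headers
    #define GGML_ASSERT_CONTINUE(x) \
        do { \
            if (!(x)) { \
                fprintf(stderr, "GGML_ASSERT_CONTINUE failed: %s:%d: %s\n", __FILE__, __LINE__, #x); \
            } \
        } while (0)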

@@ -1681,8 +1701,8 @@ void llm_load_vocab(llama_model_loader & ml, llama_model & model) {
         } else {
             // token is control, but not marked as EOG -> print a debug log
             if (vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL && vocab.special_eog_ids.count(t.second) == 0) {
-                LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n",
-                        __func__, t.second, t.first.c_str());
+                // LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n",
+                //         __func__, t.second, t.first.c_str());
             }
         }
     }

src/llama-quant.cpp

Lines changed: 1 addition & 1 deletion
@@ -394,7 +394,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         case GGML_TYPE_IQ1_M:
         case GGML_TYPE_Q2_K:
         case GGML_TYPE_Q3_K:
-        case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break;
+        case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_Q4_0; break;
         case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
         case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
         case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
