Commit b4dc29f

kobo cheats death again (+1 squashed commits)
Squashed commits: [708e2429] kobo cheats death again
1 parent f9f1585 commit b4dc29f

10 files changed, +225 −46 lines changed


Makefile

Lines changed: 1 addition & 1 deletion
@@ -626,7 +626,7 @@ whispercpp_cublas.o: otherarch/whispercpp/whisper_adapter.cpp
 	$(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) $(HIPFLAGS) -c $< -o $@
 
 # idiotic "for easier compilation"
-GPTTYPE_ADAPTER = gpttype_adapter.cpp otherarch/llama_v2.cpp otherarch/llama_v3.cpp src/llama.cpp src/llama-grammar.cpp src/llama-sampling.cpp src/llama-vocab.cpp otherarch/utils.cpp otherarch/gptj_v1.cpp otherarch/gptj_v2.cpp otherarch/gptj_v3.cpp otherarch/gpt2_v1.cpp otherarch/gpt2_v2.cpp otherarch/gpt2_v3.cpp otherarch/rwkv_v2.cpp otherarch/rwkv_v3.cpp otherarch/neox_v2.cpp otherarch/neox_v3.cpp otherarch/mpt_v3.cpp ggml/include/ggml.h ggml/include/ggml-cpu.h ggml/include/ggml-cuda.h include/llama.h otherarch/llama-util.h
+GPTTYPE_ADAPTER = gpttype_adapter.cpp otherarch/llama_v2.cpp otherarch/llama_v3.cpp src/llama.cpp src/llama-impl.cpp src/llama-chat.cpp src/llama-mmap.cpp src/llama-context.cpp src/llama-adapter.cpp src/llama-arch.cpp src/llama-batch.cpp src/llama-vocab.cpp src/llama-grammar.cpp src/llama-sampling.cpp src/llama-kv-cache.cpp src/llama-model-loader.cpp src/llama-model.cpp src/llama-quant.cpp src/llama-hparams.cpp otherarch/utils.cpp otherarch/gptj_v1.cpp otherarch/gptj_v2.cpp otherarch/gptj_v3.cpp otherarch/gpt2_v1.cpp otherarch/gpt2_v2.cpp otherarch/gpt2_v3.cpp otherarch/rwkv_v2.cpp otherarch/rwkv_v3.cpp otherarch/neox_v2.cpp otherarch/neox_v3.cpp otherarch/mpt_v3.cpp ggml/include/ggml.h ggml/include/ggml-cpu.h ggml/include/ggml-cuda.h include/llama.h otherarch/llama-util.h
 gpttype_adapter_failsafe.o: $(GPTTYPE_ADAPTER)
 	$(CXX) $(CXXFLAGS) $(FAILSAFE_FLAGS) -c $< -o $@
 gpttype_adapter.o: $(GPTTYPE_ADAPTER)

include/llama.h

Lines changed: 0 additions & 2 deletions
@@ -631,8 +631,6 @@ extern "C" {
             llama_pos   p0,
             llama_pos   p1);
 
-    LLAMA_API void printcache(struct llama_context * ctx);
-
     // Copy all tokens that belong to the specified sequence to another sequence
     // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
     // p0 < 0 : [0, p1]

otherarch/sdcpp/model.cpp

Lines changed: 13 additions & 0 deletions
@@ -28,6 +28,19 @@
 
 #define ST_HEADER_SIZE_LEN 8
 
+static std::string format(const char* fmt, ...) {
+    va_list ap;
+    va_list ap2;
+    va_start(ap, fmt);
+    va_copy(ap2, ap);
+    int size = vsnprintf(NULL, 0, fmt, ap);
+    std::vector<char> buf(size + 1);
+    int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
+    va_end(ap2);
+    va_end(ap);
+    return std::string(buf.data(), size);
+}
+
 uint64_t read_u64(uint8_t* buffer) {
     // little endian
     uint64_t value = 0;
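Note: this is the same printf-style format() helper that is removed from util.cpp below, now kept as a file-local static. A minimal usage sketch (the tensor name and dimension count are made-up values, and it assumes <cstdarg>, <vector> and <string> are already available in model.cpp, as the helper itself requires):

    std::string msg = format("tensor '%s' has %d dims", "example.weight", 4);
    // msg == "tensor 'example.weight' has 4 dims"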

otherarch/sdcpp/util.cpp

Lines changed: 0 additions & 13 deletions
@@ -62,19 +62,6 @@ void replace_all_chars(std::string& str, char target, char replacement) {
     }
 }
 
-std::string format(const char* fmt, ...) {
-    va_list ap;
-    va_list ap2;
-    va_start(ap, fmt);
-    va_copy(ap2, ap);
-    int size = vsnprintf(NULL, 0, fmt, ap);
-    std::vector<char> buf(size + 1);
-    int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
-    va_end(ap2);
-    va_end(ap);
-    return std::string(buf.data(), size);
-}
-
 #ifdef _WIN32 // code for windows
 #include <windows.h>

otherarch/sdcpp/util.h

Lines changed: 0 additions & 2 deletions
@@ -11,8 +11,6 @@ bool ends_with(const std::string& str, const std::string& ending);
 bool starts_with(const std::string& str, const std::string& start);
 bool contains(const std::string& str, const std::string& substr);
 
-std::string format(const char* fmt, ...);
-
 void replace_all_chars(std::string& str, char target, char replacement);
 
 bool file_exists(const std::string& filename);

src/llama-mmap.cpp

Lines changed: 4 additions & 0 deletions
@@ -374,6 +374,7 @@ struct llama_mmap::impl {
             throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
         }
 
+#ifndef USE_FAILSAFE
         if (prefetch > 0) {
 #if _WIN32_WINNT >= 0x602
             BOOL (WINAPI *pPrefetchVirtualMemory) (HANDLE, ULONG_PTR, PWIN32_MEMORY_RANGE_ENTRY, ULONG);
@@ -394,6 +395,9 @@ struct llama_mmap::impl {
             throw std::runtime_error("PrefetchVirtualMemory unavailable");
 #endif
         }
+#else
+        printf("\nPrefetchVirtualMemory skipped in compatibility mode.\n");
+#endif
     }
 
     void unmap_fragment(size_t first, size_t last) {

src/llama-model-loader.cpp

Lines changed: 33 additions & 0 deletions
@@ -7,6 +7,10 @@
 #include <cstring>
 #include <future>
 
+#if defined(GGML_USE_CLBLAST)
+# include "ggml_v3b-opencl.h"
+#endif
+
 const char * llama_file_version_name(llama_fver version) {
     switch (version) {
         case GGUF_FILE_VERSION_V1: return "GGUF V1 (support until nov 2023)";
@@ -479,6 +483,7 @@ llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap,
 
     // determine file type based on the number of tensors for each quantization and print meta data
     // TODO: make optional
+    if(false) //disable this log for now
     {
         std::map<enum ggml_type, uint32_t> n_type;
 
@@ -776,6 +781,24 @@ void llama_model_loader::load_data_for(struct ggml_tensor * cur) const {
     }
 }
 
+static int clblast_offload_fallback_layers = 0;
+static int layer_name_to_number(std::string inputString)
+{
+    size_t firstDotPosition = inputString.find('.');
+    int converted = -1;
+
+    if (firstDotPosition != std::string::npos) {
+        size_t secondDotPosition = inputString.find('.', firstDotPosition + 1);
+        if (secondDotPosition != std::string::npos) {
+            std::string numbersPortion = inputString.substr(firstDotPosition + 1, secondDotPosition - firstDotPosition - 1);
+            try{converted = std::stoi(numbersPortion);}
+            catch (const std::invalid_argument& e) {}
+            catch (const std::out_of_range& e) {}
+        }
+    }
+    return converted;
+}
+
 bool llama_model_loader::load_all_data(
     struct ggml_context * ctx,
     llama_buf_map & bufs,
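For orientation, the helper above just reads the text between the first two dots of a tensor name, so layer-prefixed names map to their layer index. A small sketch with hypothetical tensor names (llama.cpp-style block weights are typically named "blk.<N>.<suffix>"):

    layer_name_to_number("blk.12.attn_q.weight"); // -> 12
    layer_name_to_number("output_norm.weight");   // -> -1 (no second dot)
    layer_name_to_number("blk.x.weight");         // -> -1 (std::stoi throws; the exception is swallowed)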
@@ -960,6 +983,16 @@
             }
         }
 
+#if defined(GGML_USE_CLBLAST)
+        int layernum = layer_name_to_number(cur->name);
+        bool shouldoffload = (layernum>=0 && clblast_offload_fallback_layers>layernum);
+        if(shouldoffload)
+        {
+            cur->backend = GGML_BACKEND_TYPE_GPU;
+            ggml_cl_transform_tensor(cur->data, cur);
+        }
+#endif
+
         size_done += n_size;
     }

src/llama-model.cpp

Lines changed: 24 additions & 4 deletions
@@ -11,6 +11,10 @@
 #include <sstream>
 #include <stdexcept>
 
+#if defined(GGML_USE_CLBLAST)
+# include "ggml_v3b-opencl.h"
+#endif
+
 static const size_t kiB = 1024;
 static const size_t MiB = 1024*kiB;
 static const size_t GiB = 1024*MiB;
@@ -150,6 +154,9 @@ static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t d
         throw std::runtime_error(format("failed to create ggml context"));
     }
 
+#if defined(GGML_USE_CLBLAST)
+    ggml_cl_init();
+#endif
     ggml_backend_buffer_ptr buf { ggml_backend_buft_alloc_buffer(buft, 0) };
     ggml_tensor * op_tensor = fn(ctx.get());
     for (int i = 0; i < GGML_MAX_SRC; i++) {
@@ -1153,6 +1160,16 @@ void llm_load_vocab(llama_model_loader & ml, llama_model & model) {
     const int n_merges = gguf_get_arr_n(ctx, merges_keyidx);
     for (int i = 0; i < n_merges; i++) {
         const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i);
+        if (!OldBPETokenizerMode)
+        {
+            auto validcodepoints = unicode_cpts_from_utf8(word).size() > 0;
+            GGML_ASSERT_CONTINUE(validcodepoints);
+            if(!validcodepoints)
+            {
+                OldBPETokenizerMode = true;
+                printf("\nFalling Back to older tokenizer...");
+            }
+        }
         GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);
 
         std::string first;
@@ -1398,10 +1415,13 @@ void llm_load_vocab(llama_model_loader & ml, llama_model & model) {
 
     for (uint32_t i = 0; i < n_vocab; i++) {
         std::string word = gguf_get_arr_str(ctx, token_idx, i);
-        if (word.empty()) {
+        if (!OldBPETokenizerMode)
+        {
+        if (word.empty()) {
             LLAMA_LOG_WARN("%s: empty token at index %u\n", __func__, i);
             word = "[EMPTY_" + std::to_string(i) + "]";
         }
+        }
 
         vocab.token_to_id[word] = i;
         vocab.max_token_len = std::max(vocab.max_token_len, (int) word.size());
@@ -1424,7 +1444,7 @@ void llm_load_vocab(llama_model_loader & ml, llama_model & model) {
             }
         }
     }
-    GGML_ASSERT(vocab.id_to_token.size() == vocab.token_to_id.size());
+    GGML_ASSERT_CONTINUE(vocab.id_to_token.size() == vocab.token_to_id.size());
 
     vocab.init_tokenizer();
 
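GGML_ASSERT_CONTINUE is the fork's non-fatal counterpart of GGML_ASSERT: it reports the failed condition but lets vocab loading continue, which is what allows the OldBPETokenizerMode fallback above to run at all. Its definition is not part of this diff; a hypothetical sketch of such a macro, for orientation only:

    // hypothetical sketch; the real macro lives in this fork's ggml headers
    #define GGML_ASSERT_CONTINUE(x) \
        do { \
            if (!(x)) { \
                fprintf(stderr, "GGML_ASSERT_CONTINUE failed: %s:%d: %s\n", __FILE__, __LINE__, #x); \
            } \
        } while (0)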

@@ -1681,8 +1701,8 @@ void llm_load_vocab(llama_model_loader & ml, llama_model & model) {
         } else {
             // token is control, but not marked as EOG -> print a debug log
             if (vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL && vocab.special_eog_ids.count(t.second) == 0) {
-                LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n",
-                        __func__, t.second, t.first.c_str());
+                // LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n",
+                //         __func__, t.second, t.first.c_str());
             }
         }
     }

src/llama-quant.cpp

Lines changed: 1 addition & 1 deletion
@@ -394,7 +394,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         case GGML_TYPE_IQ1_M:
         case GGML_TYPE_Q2_K:
         case GGML_TYPE_Q3_K:
-        case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break;
+        case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_Q4_0; break;
         case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
         case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
         case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
