diff --git a/common/common.cpp b/common/common.cpp
index 6dea8e3d25238..f3921ca2699d1 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1418,8 +1418,9 @@ struct llama_model * common_load_model_from_url(
     int n_split = 0;
     {
         struct gguf_init_params gguf_params = {
-            /*.no_alloc = */ true,
-            /*.ctx      = */ NULL,
+            /*.no_alloc           = */ true,
+            /*.ctx                = */ NULL,
+            /*.allow_byteswapping = */ true,
         };
         auto * ctx_gguf = gguf_init_from_file(local_path.c_str(), gguf_params);
         if (!ctx_gguf) {
@@ -2063,8 +2064,9 @@ static common_control_vector_data common_control_vector_load_one(const common_co
     ggml_context * ctx = nullptr;
     struct gguf_init_params meta_gguf_params = {
-        /* .no_alloc = */ false,
-        /* .ctx      = */ &ctx,
+        /* .no_alloc           = */ false,
+        /* .ctx                = */ &ctx,
+        /* .allow_byteswapping = */ true,
     };
     struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params);
     if (!ctx_gguf) {
diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
index bdf0eed2a9cd3..feea7f7cae5ab 100644
--- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@@ -533,8 +533,9 @@ static void load_vocab(const char * filename, const Config * config, struct my_l
     struct ggml_context * ctx_data = NULL;
     struct gguf_init_params params = {
-        /*.no_alloc = */ false,
-        /*.ctx      = */ &ctx_data,
+        /*.no_alloc           = */ false,
+        /*.ctx                = */ &ctx_data,
+        /*.allow_byteswapping = */ true,
     };
     struct gguf_context * ctx = gguf_init_from_file(filename, params);
diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp
index 91238e4beb26c..b0b3dd12d9bf3 100644
--- a/examples/export-lora/export-lora.cpp
+++ b/examples/export-lora/export-lora.cpp
@@ -48,8 +48,9 @@ static std::string ggml_ne_string(const ggml_tensor * t) {
 static struct gguf_context * load_gguf(std::string & fname, struct ggml_context ** ctx_ggml) {
     struct gguf_init_params params = {
-        /*.no_alloc = */ true,
-        /*.ctx      = */ ctx_ggml,
+        /*.no_alloc           = */ true,
+        /*.ctx                = */ ctx_ggml,
+        /*.allow_byteswapping = */ true,
     };
     struct gguf_context * ctx_gguf = gguf_init_from_file(fname.c_str(), params);
     if (!ctx_gguf) {
diff --git a/examples/gguf-hash/gguf-hash.cpp b/examples/gguf-hash/gguf-hash.cpp
index 9523ec122f573..84298c4ca02b8 100644
--- a/examples/gguf-hash/gguf-hash.cpp
+++ b/examples/gguf-hash/gguf-hash.cpp
@@ -288,8 +288,9 @@ static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
     struct ggml_context * ctx_data = NULL;
     struct gguf_init_params params = {
-        /*.no_alloc = */ false,
-        /*.ctx      = */ &ctx_data,
+        /*.no_alloc           = */ false,
+        /*.ctx                = */ &ctx_data,
+        /*.allow_byteswapping = */ true,
     };
 
     // xxh64 init
diff --git a/examples/gguf-split/gguf-split.cpp b/examples/gguf-split/gguf-split.cpp
index ef3ceb686f697..689efb1b5cc73 100644
--- a/examples/gguf-split/gguf-split.cpp
+++ b/examples/gguf-split/gguf-split.cpp
@@ -328,14 +328,20 @@ struct split_strategy {
             const char * t_name = gguf_get_tensor_name(ctx_out, i);
             struct ggml_tensor * t = ggml_get_tensor(ctx_meta, t_name);
             auto n_bytes = ggml_nbytes(t);
+            auto n_elements = ggml_nelements(t) / ggml_blck_size(t->type);
             read_buf.resize(n_bytes);
 
             // calculate offset
             auto i_tensor_in = gguf_find_tensor(ctx_gguf, t_name); // idx of tensor in the input file
             auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor_in);
 
+            ggml_byteswap_t byteswap_func = nullptr;
+            if (gguf_needs_byteswap(ctx_gguf)) {
+                byteswap_func = ggml_get_type_traits(t->type)->byteswap;
+            }
+
             // copy tensor from input to output file
-            copy_file_to_file(f_input, fout, offset, n_bytes);
+            copy_file_to_file(f_input, fout, offset, n_bytes, n_elements, byteswap_func);
             zeros(fout, GGML_PAD(n_bytes, GGUF_DEFAULT_ALIGNMENT) - n_bytes);
         }
@@ -346,13 +352,18 @@ struct split_strategy {
         }
     }
 
-    void copy_file_to_file(std::ifstream & f_in, std::ofstream & f_out, const size_t in_offset, const size_t len) {
+    void copy_file_to_file(std::ifstream & f_in, std::ofstream & f_out, const size_t in_offset, const size_t len, const size_t elements, ggml_byteswap_t byteswap_func) {
         // TODO: detect OS and use copy_file_range() here for better performance
         if (read_buf.size() < len) {
             read_buf.resize(len);
         }
         f_in.seekg(in_offset);
         f_in.read((char *)read_buf.data(), len);
+
+        if (byteswap_func != nullptr) {
+            byteswap_func(read_buf.data(), elements);
+        }
+
         f_out.write((const char *)read_buf.data(), len);
     }
 };
@@ -361,8 +372,9 @@ static void gguf_split(const split_params & split_params) {
     struct ggml_context * ctx_meta = NULL;
 
     struct gguf_init_params params = {
-        /*.no_alloc = */ true,
-        /*.ctx      = */ &ctx_meta,
+        /*.no_alloc           = */ true,
+        /*.ctx                = */ &ctx_meta,
+        /*.allow_byteswapping = */ true,
     };
 
     std::ifstream f_input(split_params.input.c_str(), std::ios::binary);
@@ -426,8 +438,9 @@ static void gguf_merge(const split_params & split_params) {
     struct ggml_context * ctx_meta = NULL;
 
     struct gguf_init_params params = {
-        /*.no_alloc = */ true,
-        /*.ctx      = */ &ctx_meta,
+        /*.no_alloc           = */ true,
+        /*.ctx                = */ &ctx_meta,
+        /*.allow_byteswapping = */ true,
     };
 
     if (i_split > 0) {
@@ -541,6 +554,13 @@ static void gguf_merge(const split_params & split_params) {
             f_input.seekg(offset);
             f_input.read((char *)read_data.data(), n_bytes);
 
+            if (gguf_needs_byteswap(ctx_gguf)) {
+                auto byteswap = ggml_get_type_traits(t->type)->byteswap;
+                if (byteswap != nullptr) {
+                    byteswap(read_data.data(), ggml_nelements(t) / ggml_blck_size(t->type));
+                }
+            }
+
             // write tensor data + padding
             fout.write((const char *)read_data.data(), n_bytes);
             zeros(fout, GGML_PAD(n_bytes, GGUF_DEFAULT_ALIGNMENT) - n_bytes);
diff --git a/examples/gguf/gguf.cpp b/examples/gguf/gguf.cpp
index f31989c8c55c6..425734e54a54e 100644
--- a/examples/gguf/gguf.cpp
+++ b/examples/gguf/gguf.cpp
@@ -85,8 +85,9 @@ static bool gguf_ex_write(const std::string & fname) {
 // just read tensor info
 static bool gguf_ex_read_0(const std::string & fname) {
     struct gguf_init_params params = {
-        /*.no_alloc = */ false,
-        /*.ctx      = */ NULL,
+        /*.no_alloc           = */ false,
+        /*.ctx                = */ NULL,
+        /*.allow_byteswapping = */ true,
     };
 
     struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
@@ -151,8 +152,9 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) {
     struct ggml_context * ctx_data = NULL;
 
     struct gguf_init_params params = {
-        /*.no_alloc = */ false,
-        /*.ctx      = */ &ctx_data,
+        /*.no_alloc           = */ false,
+        /*.ctx                = */ &ctx_data,
+        /*.allow_byteswapping = */ true,
     };
 
     struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
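// ---------------------------------------------------------------------------
// [editor's illustrative sketch, not part of the diff] The pattern repeated in
// every hunk above: a caller opts in via the new gguf_init_params field, then
// queries gguf_needs_byteswap() to learn whether the file was stored in the
// opposite endianness. A minimal caller, assuming only the public headers
// touched by this diff:

#include <cstdio>
#include "ggml.h"
#include "gguf.h"

static struct gguf_context * open_any_endian(const char * fname, struct ggml_context ** ctx_data) {
    struct gguf_init_params params = {
        /*.no_alloc           = */ false,
        /*.ctx                = */ ctx_data,
        /*.allow_byteswapping = */ true,
    };
    struct gguf_context * ctx = gguf_init_from_file(fname, params);
    if (ctx != nullptr && gguf_needs_byteswap(ctx)) {
        // metadata is byteswapped by the reader itself; tensor payloads are
        // handled separately by each loader (see the gguf-split/llama changes)
        fprintf(stderr, "%s: non-native endianness detected\n", fname);
    }
    return ctx;
}
// ---------------------------------------------------------------------------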
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 24073c5a9b15f..5b11e17134946 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -1122,8 +1122,9 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1)
     struct ggml_context * meta = NULL;
 
     struct gguf_init_params params = {
-        /*.no_alloc = */ true,
-        /*.ctx      = */ &meta,
+        /*.no_alloc           = */ true,
+        /*.ctx                = */ &meta,
+        /*.allow_byteswapping = */ true,
    };
 
     struct gguf_context * ctx = gguf_init_from_file(fname, params);
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 1198dc1fd9378..a1a8c962261e6 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -2144,6 +2144,7 @@ extern "C" {
 #endif
     typedef void (*ggml_to_float_t)  (const void  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
     typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void  * GGML_RESTRICT y, int64_t k);
+    typedef void (*ggml_byteswap_t)  (      void  * GGML_RESTRICT buffer, size_t elements);
 
     struct ggml_type_traits {
         const char * type_name;
@@ -2153,6 +2154,7 @@ extern "C" {
         bool              is_quantized;
         ggml_to_float_t   to_float;
         ggml_from_float_t from_float_ref;
+        ggml_byteswap_t   byteswap;
     };
 
     GGML_API const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type);
diff --git a/ggml/include/gguf.h b/ggml/include/gguf.h
index 79ee202062b01..ca3b2bb8fa17a 100644
--- a/ggml/include/gguf.h
+++ b/ggml/include/gguf.h
@@ -74,6 +74,8 @@ extern "C" {
 
         // if not NULL, create a ggml_context and allocate the tensor data in it
         struct ggml_context ** ctx;
+
+        bool allow_byteswapping;
     };
 
     GGML_API struct gguf_context * gguf_init_empty(void);
@@ -197,6 +199,9 @@ extern "C" {
     // writes the meta data to pointer "data"
     GGML_API void gguf_get_meta_data(const struct gguf_context * ctx, void * data);
 
+    // returns true if the gguf file needs byteswapping when reading; byteswapping for writing is not implemented
+    GGML_API bool gguf_needs_byteswap(const struct gguf_context * ctx);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h
index eab017889c919..51ca915311897 100644
--- a/ggml/src/ggml-impl.h
+++ b/ggml/src/ggml-impl.h
@@ -28,6 +28,14 @@
 #include
 #endif
 
+#if defined(__gnu_linux__)
+#include <byteswap.h>
+#else // defined(__gnu_linux__)
+#define bswap_16(x) (x)
+#define bswap_32(x) (x)
+#define bswap_64(x) (x)
+#endif // defined(__gnu_linux__)
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -553,12 +561,47 @@ static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
 #define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
 #define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)
 
+// endianness conversion
+static inline void ggml_bswap16(void * value) {
+    *((uint16_t*)value) = bswap_16(*((uint16_t*)value));
+}
+
+static inline void ggml_bswap32(void * value) {
+    *((uint32_t*)value) = bswap_32(*((uint32_t*)value));
+}
+
+static inline void ggml_bswap64(void * value) {
+    *((uint64_t*)value) = bswap_64(*((uint64_t*)value));
+}
+
 #ifdef __cplusplus
 }
 #endif
 
 #ifdef __cplusplus
 #include <vector>
+#include <type_traits>
+
+template <typename T, std::enable_if_t<sizeof(T) == 1, int> = 0>
+static inline void ggml_bswap(T * value) {
+    GGML_UNUSED(value);
+}
+
+template <typename T, std::enable_if_t<sizeof(T) == 2, int> = 0>
+static inline void ggml_bswap(T * value) {
+    ggml_bswap16(value);
+}
+
+template <typename T, std::enable_if_t<sizeof(T) == 4, int> = 0>
+static inline void ggml_bswap(T * value) {
+    ggml_bswap32(value);
+}
+
+template <typename T, std::enable_if_t<sizeof(T) == 8, int> = 0>
+static inline void ggml_bswap(T * value) {
+    ggml_bswap64(value);
+}
 
 // expose GGUF internals for test code
 GGML_API size_t gguf_type_size(enum gguf_type type);
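// ---------------------------------------------------------------------------
// [editor's illustrative sketch, not part of the diff] The ggml_bswap()
// overload set above dispatches on sizeof(T): 1-byte types are a no-op and
// wider types forward to ggml_bswap16/32/64. A quick self-test, assuming a
// glibc system (elsewhere the bswap_* fallbacks are identity macros) and the
// internal header ggml-impl.h on the include path:

#include <cassert>
#include <cstdint>
#include "ggml-impl.h"

int main() {
    uint16_t a = 0x1122u;               ggml_bswap(&a); assert(a == 0x2211u);
    uint32_t b = 0x11223344u;           ggml_bswap(&b); assert(b == 0x44332211u);
    uint64_t c = 0x1122334455667788ull; ggml_bswap(&c); assert(c == 0x8877665544332211ull);
    int8_t   d = 0x11;                  ggml_bswap(&d); assert(d == 0x11); // 1-byte: unchanged
    return 0;
}
// ---------------------------------------------------------------------------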
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index b1d0d4913f8e1..677663c5eb123 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -561,6 +561,35 @@ static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float
 static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc);
 static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t * restrict x, size_t bx, ggml_bf16_t * restrict y, size_t by, int nrc);
 
+static void ggml_byteswap_i16     (void * restrict buffer, size_t elements);
+static void ggml_byteswap_i32     (void * restrict buffer, size_t elements);
+static void ggml_byteswap_i64     (void * restrict buffer, size_t elements);
+static void ggml_byteswap_q4_0    (void * restrict buffer, size_t elements);
+static void ggml_byteswap_q4_1    (void * restrict buffer, size_t elements);
+static void ggml_byteswap_q5_0    (void * restrict buffer, size_t elements);
+static void ggml_byteswap_q5_1    (void * restrict buffer, size_t elements);
+static void ggml_byteswap_q8_0    (void * restrict buffer, size_t elements);
+static void ggml_byteswap_q8_1    (void * restrict buffer, size_t elements);
+static void ggml_byteswap_q2_k    (void * restrict buffer, size_t elements);
+static void ggml_byteswap_q3_k    (void * restrict buffer, size_t elements);
+static void ggml_byteswap_q4_k    (void * restrict buffer, size_t elements);
+static void ggml_byteswap_q5_k    (void * restrict buffer, size_t elements);
+static void ggml_byteswap_q6_k    (void * restrict buffer, size_t elements);
+static void ggml_byteswap_iq2_xxs (void * restrict buffer, size_t elements);
+static void ggml_byteswap_iq2_xs  (void * restrict buffer, size_t elements);
+static void ggml_byteswap_iq3_xxs (void * restrict buffer, size_t elements);
+static void ggml_byteswap_iq3_s   (void * restrict buffer, size_t elements);
+static void ggml_byteswap_iq2_s   (void * restrict buffer, size_t elements);
+static void ggml_byteswap_iq1_s   (void * restrict buffer, size_t elements);
+static void ggml_byteswap_iq4_nl  (void * restrict buffer, size_t elements);
+static void ggml_byteswap_iq4_xs  (void * restrict buffer, size_t elements);
+static void ggml_byteswap_q8_k    (void * restrict buffer, size_t elements);
+static void ggml_byteswap_q4_0_4x4(void * restrict buffer, size_t elements);
+static void ggml_byteswap_q4_0_4x8(void * restrict buffer, size_t elements);
+static void ggml_byteswap_q4_0_8x8(void * restrict buffer, size_t elements);
+static void ggml_byteswap_tq1_0   (void * restrict buffer, size_t elements);
+static void ggml_byteswap_tq2_0   (void * restrict buffer, size_t elements);
+
 static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
     [GGML_TYPE_I8] = {
         .type_name = "i8",
@@ -573,30 +602,35 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .blck_size = 1,
         .type_size = sizeof(int16_t),
         .is_quantized = false,
+        .byteswap = ggml_byteswap_i16,
     },
     [GGML_TYPE_I32] = {
         .type_name = "i32",
         .blck_size = 1,
         .type_size = sizeof(int32_t),
         .is_quantized = false,
+        .byteswap = ggml_byteswap_i32,
     },
     [GGML_TYPE_I64] = {
         .type_name = "i64",
         .blck_size = 1,
         .type_size = sizeof(int64_t),
         .is_quantized = false,
+        .byteswap = ggml_byteswap_i64,
     },
     [GGML_TYPE_F64] = {
         .type_name = "f64",
         .blck_size = 1,
         .type_size = sizeof(double),
         .is_quantized = false,
+        .byteswap = ggml_byteswap_i64,
     },
     [GGML_TYPE_F32] = {
         .type_name = "f32",
         .blck_size = 1,
         .type_size = sizeof(float),
         .is_quantized = false,
+        .byteswap = ggml_byteswap_i32,
     },
     [GGML_TYPE_F16] = {
         .type_name = "f16",
@@ -605,6 +639,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .is_quantized = false,
         .to_float = (ggml_to_float_t) ggml_fp16_to_fp32_row,
         .from_float_ref = (ggml_from_float_t) ggml_fp32_to_fp16_row,
+        .byteswap = ggml_byteswap_i16,
     },
     [GGML_TYPE_Q4_0] = {
         .type_name = "q4_0",
@@ -613,6 +648,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .is_quantized = true,
         .to_float = (ggml_to_float_t) dequantize_row_q4_0,
         .from_float_ref = (ggml_from_float_t) quantize_row_q4_0_ref,
+        .byteswap = ggml_byteswap_q4_0,
     },
     [GGML_TYPE_Q4_1] = {
         .type_name = "q4_1",
@@ -621,6 +657,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .is_quantized = true,
         .to_float = (ggml_to_float_t) dequantize_row_q4_1,
         .from_float_ref = (ggml_from_float_t) quantize_row_q4_1_ref,
+        .byteswap = ggml_byteswap_q4_1,
     },
     [4] = { // GGML_TYPE_Q4_2
         .type_name = "DEPRECATED",
@@ -641,6 +678,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .is_quantized = true,
         .to_float = (ggml_to_float_t) dequantize_row_q5_0,
         .from_float_ref = (ggml_from_float_t) quantize_row_q5_0_ref,
+        .byteswap = ggml_byteswap_q5_0,
     },
     [GGML_TYPE_Q5_1] = {
         .type_name = "q5_1",
@@ -649,6 +687,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .is_quantized = true,
         .to_float = (ggml_to_float_t) dequantize_row_q5_1,
         .from_float_ref = (ggml_from_float_t) quantize_row_q5_1_ref,
+        .byteswap = ggml_byteswap_q5_1,
     },
     [GGML_TYPE_Q8_0] = {
         .type_name = "q8_0",
@@ -657,6 +696,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .is_quantized = true,
         .to_float = (ggml_to_float_t) dequantize_row_q8_0,
         .from_float_ref = (ggml_from_float_t) quantize_row_q8_0_ref,
+        .byteswap = ggml_byteswap_q8_0,
     },
     [GGML_TYPE_Q8_1] = {
         .type_name = "q8_1",
@@ -664,6 +704,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .type_size = sizeof(block_q8_1),
         .is_quantized = true,
         .from_float_ref = (ggml_from_float_t) quantize_row_q8_1_ref,
+        .byteswap = ggml_byteswap_q8_1,
     },
     [GGML_TYPE_Q2_K] = {
         .type_name = "q2_K",
@@ -672,6 +713,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .is_quantized = true,
         .to_float = (ggml_to_float_t) dequantize_row_q2_K,
         .from_float_ref = (ggml_from_float_t) quantize_row_q2_K_ref,
+        .byteswap = ggml_byteswap_q2_k,
     },
     [GGML_TYPE_Q3_K] = {
         .type_name = "q3_K",
@@ -680,6 +722,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .is_quantized = true,
         .to_float = (ggml_to_float_t) dequantize_row_q3_K,
         .from_float_ref = (ggml_from_float_t) quantize_row_q3_K_ref,
+        .byteswap = ggml_byteswap_q3_k,
     },
     [GGML_TYPE_Q4_K] = {
         .type_name = "q4_K",
@@ -688,6 +731,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .is_quantized = true,
         .to_float = (ggml_to_float_t) dequantize_row_q4_K,
         .from_float_ref = (ggml_from_float_t) quantize_row_q4_K_ref,
+        .byteswap = ggml_byteswap_q4_k,
     },
     [GGML_TYPE_Q5_K] = {
         .type_name = "q5_K",
@@ -696,6 +740,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .is_quantized = true,
         .to_float = (ggml_to_float_t) dequantize_row_q5_K,
         .from_float_ref = (ggml_from_float_t) quantize_row_q5_K_ref,
+        .byteswap = ggml_byteswap_q5_k,
     },
     [GGML_TYPE_Q6_K] = {
         .type_name = "q6_K",
@@ -704,6 +749,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .is_quantized = true,
         .to_float = (ggml_to_float_t) dequantize_row_q6_K,
         .from_float_ref = (ggml_from_float_t) quantize_row_q6_K_ref,
+        .byteswap = ggml_byteswap_q6_k,
     },
     [GGML_TYPE_IQ2_XXS] = {
         .type_name = "iq2_xxs",
@@ -712,6 +758,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .is_quantized = true,
         .to_float = (ggml_to_float_t) dequantize_row_iq2_xxs,
         .from_float_ref = NULL,
+        .byteswap = ggml_byteswap_iq2_xxs,
     },
     [GGML_TYPE_IQ2_XS] = {
         .type_name = "iq2_xs",
@@ -720,6 +767,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .is_quantized = true,
         .to_float = (ggml_to_float_t) dequantize_row_iq2_xs,
         .from_float_ref = NULL,
+        .byteswap = ggml_byteswap_iq2_xs,
     },
     [GGML_TYPE_IQ3_XXS] = {
         .type_name = "iq3_xxs",
@@ -728,6 +776,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .is_quantized = true,
         .to_float = (ggml_to_float_t) dequantize_row_iq3_xxs,
         .from_float_ref = (ggml_from_float_t)quantize_row_iq3_xxs_ref,
+        .byteswap = ggml_byteswap_iq3_xxs,
     },
     [GGML_TYPE_IQ3_S] = {
         .type_name = "iq3_s",
@@ -736,6 +785,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .is_quantized = true,
         .to_float = (ggml_to_float_t) dequantize_row_iq3_s,
         .from_float_ref = (ggml_from_float_t)quantize_row_iq3_s_ref,
+        .byteswap = ggml_byteswap_iq3_s,
     },
     [GGML_TYPE_IQ2_S] = {
         .type_name = "iq2_s",
@@ -744,6 +794,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .is_quantized = true,
         .to_float = (ggml_to_float_t) dequantize_row_iq2_s,
         .from_float_ref = (ggml_from_float_t)quantize_row_iq2_s_ref,
+        .byteswap = ggml_byteswap_iq2_s,
     },
     [GGML_TYPE_IQ1_S] = {
         .type_name = "iq1_s",
@@ -752,6 +803,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .is_quantized = true,
         .to_float = (ggml_to_float_t) dequantize_row_iq1_s,
         .from_float_ref = NULL,
+        .byteswap = ggml_byteswap_iq1_s,
     },
     [GGML_TYPE_IQ1_M] = {
         .type_name = "iq1_m",
@@ -768,6 +820,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .is_quantized = true,
         .to_float = (ggml_to_float_t) dequantize_row_iq4_nl,
         .from_float_ref = (ggml_from_float_t)quantize_row_iq4_nl_ref,
+        .byteswap = ggml_byteswap_iq4_nl,
     },
     [GGML_TYPE_IQ4_XS] = {
         .type_name = "iq4_xs",
@@ -776,12 +829,14 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .is_quantized = true,
         .to_float = (ggml_to_float_t) dequantize_row_iq4_xs,
         .from_float_ref = (ggml_from_float_t)quantize_row_iq4_xs_ref,
+        .byteswap = ggml_byteswap_iq4_xs,
     },
     [GGML_TYPE_Q8_K] = {
         .type_name = "q8_K",
         .blck_size = QK_K,
         .type_size = sizeof(block_q8_K),
         .is_quantized = true,
+        .byteswap = ggml_byteswap_q8_k,
     },
     [GGML_TYPE_BF16] = {
         .type_name = "bf16",
@@ -790,24 +845,28 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .is_quantized = false,
         .to_float = (ggml_to_float_t) ggml_bf16_to_fp32_row,
         .from_float_ref = (ggml_from_float_t) ggml_fp32_to_bf16_row_ref,
+        .byteswap = ggml_byteswap_i16,
     },
     [31] = { // GGML_TYPE_Q4_0_4_4
         .type_name = "TYPE_Q4_0_4_4 REMOVED, use Q4_0 with runtime repacking",
         .blck_size = 0,
         .type_size = 0,
         .is_quantized = false,
+        .byteswap = ggml_byteswap_q4_0_4x4,
     },
     [32] = { // GGML_TYPE_Q4_0_4_8
         .type_name = "TYPE_Q4_0_4_8 REMOVED, use Q4_0 with runtime repacking",
         .blck_size = 0,
         .type_size = 0,
         .is_quantized = false,
+        .byteswap = ggml_byteswap_q4_0_4x8,
     },
     [33] = { // GGML_TYPE_Q4_0_8_8
         .type_name = "TYPE_Q4_0_8_8 REMOVED, use Q4_0 with runtime repacking",
         .blck_size = 0,
         .type_size = 0,
         .is_quantized = false,
+        .byteswap = ggml_byteswap_q4_0_8x8,
     },
     [GGML_TYPE_TQ1_0] = {
         .type_name = "tq1_0",
@@ -816,6 +875,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .is_quantized = true,
         .to_float = (ggml_to_float_t) dequantize_row_tq1_0,
         .from_float_ref = (ggml_from_float_t) quantize_row_tq1_0_ref,
+        .byteswap = ggml_byteswap_tq1_0,
     },
     [GGML_TYPE_TQ2_0] = {
         .type_name = "tq2_0",
@@ -824,6 +884,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .is_quantized = true,
         .to_float = (ggml_to_float_t) dequantize_row_tq2_0,
         .from_float_ref = (ggml_from_float_t) quantize_row_tq2_0_ref,
+        .byteswap = ggml_byteswap_tq2_0,
     },
     [36] = { // GGML_TYPE_IQ4_NL_4_4
         .type_name = "TYPE_IQ4_NL_4_4 REMOVED, use IQ4_NL with runtime repacking",
@@ -6499,3 +6560,215 @@ bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, cons
     if (p0->strict_cpu != p1->strict_cpu ) return false;
     return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0;
 }
+
+static void ggml_byteswap_i16(void * restrict buffer, size_t elements) {
+    uint16_t *data_ptr = (uint16_t*) buffer;
+    for (size_t i = 0; i < elements; ++i) {
+        ggml_bswap16(data_ptr + i);
+    }
+}
+
+static void ggml_byteswap_i32(void * restrict buffer, size_t elements) {
+    uint32_t *data_ptr = (uint32_t*) buffer;
+    for (size_t i = 0; i < elements; ++i) {
+        ggml_bswap32(data_ptr + i);
+    }
+}
+
+static void ggml_byteswap_i64(void * restrict buffer, size_t elements) {
+    uint64_t *data_ptr = (uint64_t*) buffer;
+    for (size_t i = 0; i < elements; ++i) {
+        ggml_bswap64(data_ptr + i);
+    }
+}
+
+static void ggml_byteswap_q4_0(void * restrict buffer, size_t elements) {
+    block_q4_0 *data_ptr = (block_q4_0*) buffer;
+    for (size_t i = 0; i < elements; ++i) {
+        ggml_bswap16(&(data_ptr[i].d));
+    }
+}
+
+static void ggml_byteswap_q4_1(void * restrict buffer, size_t elements) {
+    block_q4_1 *data_ptr = (block_q4_1*) buffer;
+    for (size_t i = 0; i < elements; ++i) {
+        ggml_bswap16(&(data_ptr[i].d));
+        ggml_bswap16(&(data_ptr[i].m));
+    }
+}
+
+static void ggml_byteswap_q5_0(void * restrict buffer, size_t elements) {
+    block_q5_0 *data_ptr = (block_q5_0*) buffer;
+    for (size_t i = 0; i < elements; ++i) {
+        ggml_bswap16(&(data_ptr[i].d));
+    }
+}
+
+static void ggml_byteswap_q5_1(void * restrict buffer, size_t elements) {
+    block_q5_1 *data_ptr = (block_q5_1*) buffer;
+    for (size_t i = 0; i < elements; ++i) {
+        ggml_bswap16(&(data_ptr[i].d));
+        ggml_bswap16(&(data_ptr[i].m));
+    }
+}
+
+static void ggml_byteswap_q8_0(void * restrict buffer, size_t elements) {
+    block_q8_0 *data_ptr = (block_q8_0*) buffer;
+    for (size_t i = 0; i < elements; ++i) {
+        ggml_bswap16(&(data_ptr[i].d));
+    }
+}
+
+static void ggml_byteswap_q8_1(void * restrict buffer, size_t elements) {
+    block_q8_1 *data_ptr = (block_q8_1*) buffer;
+    for (size_t i = 0; i < elements; ++i) {
+        ggml_bswap16(&(data_ptr[i].d));
+        ggml_bswap16(&(data_ptr[i].s));
+    }
+}
+
+static void ggml_byteswap_q2_k(void * restrict buffer, size_t elements) {
+    block_q2_K *data_ptr = (block_q2_K*) buffer;
+    for (size_t i = 0; i < elements; ++i) {
+        ggml_bswap16(&(data_ptr[i].d));
+        ggml_bswap16(&(data_ptr[i].dmin));
+    }
+}
+
+static void ggml_byteswap_q3_k(void * restrict buffer, size_t elements) {
+    block_q3_K *data_ptr = (block_q3_K*) buffer;
+    for (size_t i = 0; i < elements; ++i) {
+        ggml_bswap16(&(data_ptr[i].d));
+    }
+}
+
+static void ggml_byteswap_q4_k(void * restrict buffer, size_t elements) {
+    block_q4_K *data_ptr = (block_q4_K*) buffer;
+    for (size_t i = 0; i < elements; ++i) {
+        ggml_bswap16(&(data_ptr[i].d));
+        ggml_bswap16(&(data_ptr[i].dmin));
+    }
+}
+
+static void ggml_byteswap_q5_k(void * restrict buffer, size_t elements) {
+    block_q5_K *data_ptr = (block_q5_K*) buffer;
+    for (size_t i = 0; i < elements; ++i) {
+        ggml_bswap16(&(data_ptr[i].d));
+        ggml_bswap16(&(data_ptr[i].dmin));
+    }
+}
+
+static void ggml_byteswap_q6_k(void * restrict buffer, size_t elements) {
+    block_q6_K *data_ptr = (block_q6_K*) buffer;
+    for (size_t i = 0; i < elements; ++i) {
+        ggml_bswap16(&(data_ptr[i].d));
+    }
+}
+
+static void ggml_byteswap_iq2_xxs(void * restrict buffer, size_t elements) {
+    block_iq2_xxs *data_ptr = (block_iq2_xxs*) buffer;
+    for (size_t i = 0; i < elements; ++i) {
+        ggml_bswap16(&(data_ptr[i].d));
+        for (size_t j = 0; j < QK_K/8; ++j) {
+            ggml_bswap16(&(data_ptr[i].qs[j]));
+        }
+    }
+}
+
+static void ggml_byteswap_iq2_xs(void * restrict buffer, size_t elements) {
+    block_iq2_xs *data_ptr = (block_iq2_xs*) buffer;
+    for (size_t i = 0; i < elements; ++i) {
+        ggml_bswap16(&(data_ptr[i].d));
+        for (size_t j = 0; j < QK_K/8; ++j) {
+            ggml_bswap16(&(data_ptr[i].qs[j]));
+        }
+    }
+}
+
+static void ggml_byteswap_iq3_xxs(void * restrict buffer, size_t elements) {
+    block_iq3_xxs *data_ptr = (block_iq3_xxs*) buffer;
+    for (size_t i = 0; i < elements; ++i) {
+        ggml_bswap16(&(data_ptr[i].d));
+    }
+}
+
+static void ggml_byteswap_iq3_s(void * restrict buffer, size_t elements) {
+    block_iq3_s *data_ptr = (block_iq3_s*) buffer;
+    for (size_t i = 0; i < elements; ++i) {
+        ggml_bswap16(&(data_ptr[i].d));
+    }
+}
+
+static void ggml_byteswap_iq2_s(void * restrict buffer, size_t elements) {
+    block_iq2_s *data_ptr = (block_iq2_s*) buffer;
+    for (size_t i = 0; i < elements; ++i) {
+        ggml_bswap16(&(data_ptr[i].d));
+    }
+}
+
+static void ggml_byteswap_iq1_s(void * restrict buffer, size_t elements) {
+    block_iq1_s *data_ptr = (block_iq1_s*) buffer;
+    for (size_t i = 0; i < elements; ++i) {
+        ggml_bswap16(&(data_ptr[i].d));
+        for (size_t j = 0; j < QK_K/32; ++j) {
+            ggml_bswap16(&(data_ptr[i].qh[j]));
+        }
+    }
+}
+
+static void ggml_byteswap_iq4_nl(void * restrict buffer, size_t elements) {
+    block_iq4_nl *data_ptr = (block_iq4_nl*) buffer;
+    for (size_t i = 0; i < elements; ++i) {
+        ggml_bswap16(&(data_ptr[i].d));
+    }
+}
+
+static void ggml_byteswap_iq4_xs(void * restrict buffer, size_t elements) {
+    block_iq4_xs *data_ptr = (block_iq4_xs*) buffer;
+    for (size_t i = 0; i < elements; ++i) {
+        ggml_bswap16(&(data_ptr[i].d));
+        ggml_bswap16(&(data_ptr[i].scales_h));
+    }
+}
+
+static void ggml_byteswap_q8_k(void * restrict buffer, size_t elements) {
+    block_q8_K *data_ptr = (block_q8_K*) buffer;
+    for (size_t i = 0; i < elements; ++i) {
+        ggml_bswap32(&(data_ptr[i].d));
+        for (size_t j = 0; j < QK_K/16; ++j) {
+            ggml_bswap16(&(data_ptr[i].bsums[j]));
+        }
+    }
+}
+
+static void ggml_byteswap_q4_0_4x4(void * restrict buffer, size_t elements) {
+    GGML_ASSERT(false && "function ggml_byteswap_q4_0_4x4 is not implemented yet");
+    UNUSED(buffer);
+    UNUSED(elements);
+}
+
+static void ggml_byteswap_q4_0_4x8(void * restrict buffer, size_t elements) {
+    GGML_ASSERT(false && "function ggml_byteswap_q4_0_4x8 is not implemented yet");
+    UNUSED(buffer);
+    UNUSED(elements);
+}
+
+static void ggml_byteswap_q4_0_8x8(void * restrict buffer, size_t elements) {
+    GGML_ASSERT(false && "function ggml_byteswap_q4_0_8x8 is not implemented yet");
+    UNUSED(buffer);
+    UNUSED(elements);
+}
+
+static void ggml_byteswap_tq1_0(void * restrict buffer, size_t elements) {
+    block_tq1_0 *data_ptr = (block_tq1_0*) buffer;
+    for (size_t i = 0; i < elements; ++i) {
+        ggml_bswap16(&(data_ptr[i].d));
+    }
+}
+
+static void ggml_byteswap_tq2_0(void * restrict buffer, size_t elements) {
+    block_tq2_0 *data_ptr = (block_tq2_0*) buffer;
+    for (size_t i = 0; i < elements; ++i) {
+        ggml_bswap16(&(data_ptr[i].d));
+    }
+}
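// ---------------------------------------------------------------------------
// [editor's illustrative sketch, not part of the diff] Note the unit of the
// new trait: the `elements` argument counts *blocks* of the type, which is
// why every caller in this diff passes ggml_nelements(t) / ggml_blck_size(t->type)
// (for scalar types blck_size is 1, so blocks == elements). A helper that
// swaps a whole tensor in place, using only API introduced or used above:

#include "ggml.h"

static void byteswap_tensor_data(struct ggml_tensor * t) {
    ggml_byteswap_t fn = ggml_get_type_traits(t->type)->byteswap;
    if (fn != nullptr) {
        fn(t->data, ggml_nelements(t) / ggml_blck_size(t->type));
    }
}
// ---------------------------------------------------------------------------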
diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp
index ab13669c567fe..d044ec726ec21 100644
--- a/ggml/src/gguf.cpp
+++ b/ggml/src/gguf.cpp
@@ -214,16 +214,22 @@ struct gguf_context {
     size_t size = 0; // size of `data` in bytes
 
     void * data = nullptr;
+
+    bool needs_byteswap = false; // only for reading, writing in non-native endianness is not supported
 };
 
 struct gguf_reader {
     FILE * file;
+    bool do_byteswap = false;
 
     gguf_reader(FILE * file) : file(file) {}
 
     template <typename T>
     bool read(T & dst) const {
-        return fread(&dst, 1, sizeof(dst), file) == sizeof(dst);
+        auto res = fread(&dst, 1, sizeof(dst), file);
+        if (do_byteswap) {
+            ggml_bswap(&dst);
+        }
+        return res == sizeof(dst);
     }
 
     template <typename T>
@@ -317,7 +323,7 @@ bool gguf_read_emplace_helper(const struct gguf_reader & gr, std::vector<gguf_kv
     if (ok && gr.read(ctx->version)) {
+        if ((params.allow_byteswapping) && ((ctx->version & 0x0000FFFF) == 0) && ((ctx->version & 0xFFFF0000) != 0)) {
+            // most likely different endianness, do byteswapping
+            gr.do_byteswap = true;
+            ctx->needs_byteswap = true;
+            ggml_bswap(&(ctx->version));
+        }
+
         if (ctx->version == 1) {
             fprintf(stderr, "%s: GGUFv1 is no longer supported, please use a more up-to-date version\n", __func__);
             ok = false;
@@ -1327,3 +1340,7 @@ void gguf_get_meta_data(const struct gguf_context * ctx, void * data) {
     gguf_write_to_buf(ctx, buf, /*only_meta =*/ true);
     memcpy(data, buf.data(), buf.size());
 }
+
+bool gguf_needs_byteswap(const struct gguf_context * ctx) {
+    return ctx->needs_byteswap;
+}
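// ---------------------------------------------------------------------------
// [editor's illustrative sketch, not part of the diff] Why the detection
// heuristic above works: GGUF versions are small integers, so a same-endian
// read leaves the high 16 bits zero, while a byteswapped read of e.g.
// version 3 yields 0x03000000 (low half zero, high half non-zero):

#include <cassert>
#include <cstdint>

int main() {
    const uint32_t version = 3; // as stored by the writer
    const uint32_t swapped = ((version & 0x000000FFu) << 24) | ((version & 0x0000FF00u) <<  8) |
                             ((version & 0x00FF0000u) >>  8) | ((version & 0xFF000000u) >> 24);
    assert((version & 0xFFFF0000u) == 0);                                 // native read: no swap
    assert((swapped & 0x0000FFFFu) == 0 && (swapped & 0xFFFF0000u) != 0); // foreign read: swap
    return 0;
}
// ---------------------------------------------------------------------------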
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index 080d2b9dce5cb..7c94f45dfd4d8 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -4,6 +4,7 @@
 import os
 import shutil
 import struct
+import sys
 import tempfile
 from dataclasses import dataclass
 from enum import Enum, auto
@@ -451,6 +452,12 @@ def write_tensors_to_file(self, *, progress: bool = False) -> None:
             for ti in tensors.values():
                 assert ti.tensor is not None  # can only iterate once over the tensors
                 assert ti.tensor.nbytes == ti.nbytes
+
+                if (self.endianess == GGUFEndian.BIG and sys.byteorder == 'little') or \
+                   (self.endianess == GGUFEndian.LITTLE and sys.byteorder == 'big'):
+                    # ti.tensor.byteswap(inplace=True) just didn't work here
+                    ti.tensor = ti.tensor.byteswap()
+
                 ti.tensor.tofile(fout)
                 if shard_bar is not None:
                     shard_bar.update(ti.nbytes)
diff --git a/src/llama-adapter.cpp b/src/llama-adapter.cpp
index 8a0800463137e..60eb8867fba7b 100644
--- a/src/llama-adapter.cpp
+++ b/src/llama-adapter.cpp
@@ -151,8 +151,9 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
     ggml_context * ctx_init;
     struct gguf_init_params meta_gguf_params = {
-        /* .no_alloc = */ true,
-        /* .ctx      = */ &ctx_init,
+        /* .no_alloc           = */ true,
+        /* .ctx                = */ &ctx_init,
+        /* .allow_byteswapping = */ true,
     };
     gguf_context_ptr ctx_gguf { gguf_init_from_file(path_lora, meta_gguf_params) };
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index 75073bf610ac3..dc88d3687f897 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -460,8 +460,9 @@ llama_model_loader::llama_model_loader(
     // Load the main GGUF
     struct ggml_context * ctx = NULL;
     struct gguf_init_params params = {
-        /*.no_alloc = */ true,
-        /*.ctx      = */ &ctx,
+        /*.no_alloc           = */ true,
+        /*.ctx                = */ &ctx,
+        /*.allow_byteswapping = */ true,
    };
 
     meta.reset(gguf_init_from_file(fname.c_str(), params));
@@ -520,8 +521,9 @@ llama_model_loader::llama_model_loader(
             const char * fname_split = splits[idx].c_str();
 
             struct gguf_init_params split_params = {
-                /*.no_alloc = */ true,
-                /*.ctx      = */ &ctx,
+                /*.no_alloc           = */ true,
+                /*.ctx                = */ &ctx,
+                /*.allow_byteswapping = */ true,
             };
             gguf_context_ptr ctx_gguf { gguf_init_from_file(fname_split, split_params) };
             if (!ctx_gguf) {
@@ -679,6 +681,9 @@ llama_model_loader::llama_model_loader(
     if (!llama_mmap::SUPPORTED) {
         LLAMA_LOG_WARN("%s: mmap is not supported on this platform\n", __func__);
         use_mmap = false;
+    } else if (gguf_needs_byteswap(meta.get())) {
+        LLAMA_LOG_WARN("%s: gguf file needs byteswapping, mmap is disabled. This may impact performance.\n", __func__);
+        use_mmap = false;
     }
 
     this->use_mmap = use_mmap;
@@ -869,6 +874,13 @@ void llama_model_loader::load_data_for(struct ggml_tensor * cur) const {
         const auto & file = files.at(w.idx);
         file->seek(w.offs, SEEK_SET);
         file->read_raw(cur->data, ggml_nbytes(cur));
+
+        if (gguf_needs_byteswap(meta.get())) {
+            auto byteswap = ggml_get_type_traits(cur->type)->byteswap;
+            if (byteswap != nullptr) {
+                byteswap(cur->data, ggml_nelements(cur) / ggml_blck_size(cur->type));
+            }
+        }
     }
 
     if (check_tensors && !ggml_validate_row_data(cur->type, cur->data, ggml_nbytes(cur))) {
@@ -1024,6 +1036,14 @@ bool llama_model_loader::load_all_data(
                 if (ggml_backend_buffer_is_host(cur->buffer)) {
                     file->seek(weight->offs, SEEK_SET);
                     file->read_raw(cur->data, n_size);
+
+                    if (gguf_needs_byteswap(meta.get())) {
+                        auto byteswap = ggml_get_type_traits(cur->type)->byteswap;
+                        if (byteswap != nullptr) {
+                            byteswap(cur->data, ggml_nelements(cur) / ggml_blck_size(cur->type));
+                        }
+                    }
+
                     if (check_tensors) {
                         validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] {
                             return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size));
@@ -1036,11 +1056,22 @@ bool llama_model_loader::load_all_data(
                     size_t bytes_read = 0;
 
+                    // for byteswapping purposes ensure that each read covers a whole number of blocks
+                    // (ggml_type_size() is the byte size of one block of this type)
+                    const size_t buf_size_aligned = gguf_needs_byteswap(meta.get()) ? buffer_size - (buffer_size % ggml_type_size(cur->type)) : buffer_size;
+
                     while (bytes_read < n_size) {
-                        size_t read_iteration = std::min<size_t>(buffer_size, n_size - bytes_read);
+                        size_t read_iteration = std::min<size_t>(buf_size_aligned, n_size - bytes_read);
 
                         ggml_backend_event_synchronize(events[buffer_idx]);
                         file->read_raw(host_ptrs[buffer_idx], read_iteration);
+
+                        if (gguf_needs_byteswap(meta.get())) {
+                            auto byteswap = ggml_get_type_traits(cur->type)->byteswap;
+                            if (byteswap != nullptr) {
+                                byteswap(host_ptrs[buffer_idx], read_iteration / ggml_type_size(cur->type));
+                            }
+                        }
+
                         ggml_backend_tensor_set_async(upload_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration);
 
                         ggml_backend_event_record(events[buffer_idx], upload_backend);
@@ -1052,6 +1083,14 @@ bool llama_model_loader::load_all_data(
                     read_buf.resize(n_size);
                     file->seek(weight->offs, SEEK_SET);
                     file->read_raw(read_buf.data(), n_size);
+
+                    if (gguf_needs_byteswap(meta.get())) {
+                        auto byteswap = ggml_get_type_traits(cur->type)->byteswap;
+                        if (byteswap != nullptr) {
+                            byteswap(read_buf.data(), read_buf.size() / ggml_type_size(cur->type));
+                        }
+                    }
+
                     ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
                     if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
                         throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
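// ---------------------------------------------------------------------------
// [editor's illustrative sketch, not part of the diff] The streaming path
// above can only swap whole blocks, so chunk sizes must be rounded down to a
// multiple of ggml_type_size() (the byte size of one block) and byte counts
// converted to block counts before calling the trait. The byte-oriented
// helper below makes that invariant explicit:

#include <cassert>
#include <cstddef>
#include "ggml.h"

static void byteswap_bytes(enum ggml_type type, void * buf, size_t n_bytes) {
    const size_t block_bytes = ggml_type_size(type); // bytes per block
    assert(n_bytes % block_bytes == 0);              // only whole blocks can be swapped
    ggml_byteswap_t fn = ggml_get_type_traits(type)->byteswap;
    if (fn != nullptr) {
        fn(buf, n_bytes / block_bytes);
    }
}
// ---------------------------------------------------------------------------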
diff --git a/src/unicode.h b/src/unicode.h
index c27098df7d4be..87b2ef7caa3cf 100644
--- a/src/unicode.h
+++ b/src/unicode.h
@@ -15,6 +15,10 @@ struct unicode_cpt_flags {
         SYMBOL      = 0x0040, // regex: \p{S}
         CONTROL     = 0x0080, // regex: \p{C}
         MASK_CATEGORIES = 0x00FF,
+        WHITESPACE  = 0x0100,
+        LOWERCASE   = 0x0200,
+        UPPERCASE   = 0x0400,
+        NFD         = 0x0800,
     };
 
     // codepoint type
@@ -34,11 +38,49 @@ struct unicode_cpt_flags {
 
     // decode from uint16
     inline unicode_cpt_flags(const uint16_t flags = 0) {
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
         *reinterpret_cast<uint16_t *>(this) = flags;
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+        is_undefined   = (flags & UNDEFINED)   ? 1 : 0;
+        is_number      = (flags & NUMBER)      ? 1 : 0;
+        is_letter      = (flags & LETTER)      ? 1 : 0;
+        is_separator   = (flags & SEPARATOR)   ? 1 : 0;
+        is_accent_mark = (flags & ACCENT_MARK) ? 1 : 0;
+        is_punctuation = (flags & PUNCTUATION) ? 1 : 0;
+        is_symbol      = (flags & SYMBOL)      ? 1 : 0;
+        is_control     = (flags & CONTROL)     ? 1 : 0;
+        is_whitespace  = (flags & WHITESPACE)  ? 1 : 0;
+        is_lowercase   = (flags & LOWERCASE)   ? 1 : 0;
+        is_uppercase   = (flags & UPPERCASE)   ? 1 : 0;
+        is_nfd         = (flags & NFD)         ? 1 : 0;
+#else // __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+#error Unexpected or undefined __BYTE_ORDER__
+#endif // __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
     }
 
     inline uint16_t as_uint() const {
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
         return *reinterpret_cast<const uint16_t *>(this);
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+        uint16_t result =
+            is_undefined   * UNDEFINED   +
+            is_number      * NUMBER      +
+            is_letter      * LETTER      +
+            is_separator   * SEPARATOR   +
+            is_accent_mark * ACCENT_MARK +
+            is_punctuation * PUNCTUATION +
+            is_symbol      * SYMBOL      +
+            is_control     * CONTROL     +
+            is_whitespace  * WHITESPACE  +
+            is_lowercase   * LOWERCASE   +
+            is_uppercase   * UPPERCASE   +
+            is_nfd         * NFD
+            ;
+
+        return result;
+#else // __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+#error Unexpected or undefined __BYTE_ORDER__
+#endif // __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
     }
 
     inline uint16_t category_flag() const {
diff --git a/tests/test-gguf.cpp b/tests/test-gguf.cpp
index 6ed696328d71a..3ac04a6e9c2ea 100644
--- a/tests/test-gguf.cpp
+++ b/tests/test-gguf.cpp
@@ -707,8 +707,9 @@ static std::pair<int, int> test_handcrafted_file(const unsigned int seed) {
         struct ggml_context * ctx = nullptr;
         struct gguf_init_params gguf_params = {
-            /*no_alloc =*/ false,
-            /*ctx      =*/ hft >= offset_has_data ? &ctx : nullptr,
+            /*no_alloc           =*/ false,
+            /*ctx                =*/ hft >= offset_has_data ? &ctx : nullptr,
+            /*allow_byteswapping =*/ true,
         };
         struct gguf_context * gguf_ctx = gguf_init_from_file_impl(file, gguf_params);
 
@@ -1103,8 +1104,9 @@ static std::pair<int, int> test_roundtrip(ggml_backend_dev_t dev, const unsigned
     struct ggml_context * ctx_1 = nullptr;
     struct gguf_init_params gguf_params = {
-        /*no_alloc =*/ false,
-        /*ctx      =*/ only_meta ? nullptr : &ctx_1,
+        /*no_alloc           =*/ false,
+        /*ctx                =*/ only_meta ? nullptr : &ctx_1,
+        /*allow_byteswapping =*/ true,
    };
     struct gguf_context * gguf_ctx_1 = gguf_init_from_file_impl(file, gguf_params);
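// ---------------------------------------------------------------------------
// [editor's illustrative sketch, not part of the diff] The point of the
// unicode.h change: as_uint() must remain the exact inverse of the uint16_t
// constructor on either byte order, since the flag tables store packed
// uint16_t values. A round-trip check, assuming src/unicode.h is includable:

#include <cassert>
#include <cstdint>
#include "unicode.h"

int main() {
    const uint16_t packed = unicode_cpt_flags::LETTER | unicode_cpt_flags::UPPERCASE;
    unicode_cpt_flags flags(packed);
    assert(flags.is_letter && flags.is_uppercase);
    assert(flags.as_uint() == packed); // must hold on both little and big endian
    return 0;
}
// ---------------------------------------------------------------------------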