Commit a9db9b0

Implement --no-byteswap argument to disable byteswapping on big-endian platforms
1 parent f4217a8 commit a9db9b0

19 files changed: +88 additions, -46 deletions


common/arg.cpp

Lines changed: 7 additions & 0 deletions
@@ -1438,6 +1438,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.use_mmap = false;
         }
     ).set_env("LLAMA_ARG_NO_MMAP"));
+    add_opt(common_arg(
+        {"--no-byteswap"},
+        "don't byteswap model data on big endian systems (use if model is byteswapped to big endian in advance)",
+        [](common_params & params) {
+            params.no_byteswap = true;
+        }
+    ).set_env("LLAMA_NO_BYTESWAP"));
     add_opt(common_arg(
         {"--numa"}, "TYPE",
         "attempt optimizations that help on some NUMA systems\n"

common/common.cpp

Lines changed: 8 additions & 5 deletions
@@ -987,7 +987,7 @@ struct common_init_result common_init_from_params(common_params & params) {
     // load and optionally apply lora adapters
     for (auto & la : params.lora_adapters) {
         llama_adapter_lora_ptr lora;
-        lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
+        lora.reset(llama_adapter_lora_init(model, la.path.c_str(), mparams.no_byteswap));
         if (lora == nullptr) {
             LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
             llama_free(lctx);
@@ -1092,6 +1092,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
     mparams.use_mmap = params.use_mmap;
     mparams.use_mlock = params.use_mlock;
     mparams.check_tensors = params.check_tensors;
+    mparams.no_byteswap = params.no_byteswap;
     if (params.kv_overrides.empty()) {
         mparams.kv_overrides = NULL;
     } else {
@@ -1418,8 +1419,9 @@ struct llama_model * common_load_model_from_url(
     int n_split = 0;
     {
         struct gguf_init_params gguf_params = {
-            /*.no_alloc = */ true,
-            /*.ctx = */ NULL,
+            /*.no_alloc    = */ true,
+            /*.ctx         = */ NULL,
+            /*.no_byteswap = */ false,
         };
         auto * ctx_gguf = gguf_init_from_file(local_path.c_str(), gguf_params);
         if (!ctx_gguf) {
@@ -2063,8 +2065,9 @@ static common_control_vector_data common_control_vector_load_one(const common_co
 
     ggml_context * ctx = nullptr;
     struct gguf_init_params meta_gguf_params = {
-        /* .no_alloc = */ false,
-        /* .ctx = */ &ctx,
+        /* .no_alloc    = */ false,
+        /* .ctx         = */ &ctx,
+        /* .no_byteswap = */ false,
     };
     struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params);
     if (!ctx_gguf) {
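Taken together, these hunks thread the new flag from common_params down to the loaders. A minimal sketch of that flow, assuming the headers normally used by common.cpp and only the signatures visible in this diff (the helper name and its standalone form are illustrative, not part of the commit):

#include <string>
#include "common.h"     // common_params, common_model_params_to_llama
#include "llama-cpp.h"  // llama_adapter_lora_ptr, llama_adapter_lora_init

// Hypothetical helper: apply a LoRA adapter while honoring --no-byteswap.
static llama_adapter_lora_ptr load_adapter_no_byteswap(llama_model * model,
                                                       common_params & params,
                                                       const std::string & lora_path) {
    // what --no-byteswap / LLAMA_NO_BYTESWAP set in common/arg.cpp
    params.no_byteswap = true;

    // copied into llama_model_params (hunk @@ -1092 above)
    llama_model_params mparams = common_model_params_to_llama(params);

    // forwarded as the new third argument (hunk @@ -987 above)
    llama_adapter_lora_ptr lora;
    lora.reset(llama_adapter_lora_init(model, lora_path.c_str(), mparams.no_byteswap));
    return lora;
}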

common/common.h

Lines changed: 1 addition & 0 deletions
@@ -307,6 +307,7 @@ struct common_params {
     bool no_kv_offload = false; // disable KV offloading
     bool warmup = true; // warmup run
     bool check_tensors = false; // validate tensor data
+    bool no_byteswap = false; // skip byteswapping on big endian systems
 
     ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
     ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V

examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp

Lines changed: 3 additions & 2 deletions
@@ -533,8 +533,9 @@ static void load_vocab(const char * filename, const Config * config, struct my_l
     struct ggml_context * ctx_data = NULL;
 
     struct gguf_init_params params = {
-        /*.no_alloc = */ false,
-        /*.ctx = */ &ctx_data,
+        /*.no_alloc    = */ false,
+        /*.ctx         = */ &ctx_data,
+        /*.no_byteswap = */ false,
     };
 
     struct gguf_context * ctx = gguf_init_from_file(filename, params);

examples/export-lora/export-lora.cpp

Lines changed: 3 additions & 2 deletions
@@ -48,8 +48,9 @@ static std::string ggml_ne_string(const ggml_tensor * t) {
 
 static struct gguf_context * load_gguf(std::string & fname, struct ggml_context ** ctx_ggml) {
     struct gguf_init_params params = {
-        /*.no_alloc = */ true,
-        /*.ctx = */ ctx_ggml,
+        /*.no_alloc    = */ true,
+        /*.ctx         = */ ctx_ggml,
+        /*.no_byteswap = */ false,
     };
     struct gguf_context * ctx_gguf = gguf_init_from_file(fname.c_str(), params);
     if (!ctx_gguf) {

examples/gguf-hash/gguf-hash.cpp

Lines changed: 3 additions & 2 deletions
@@ -288,8 +288,9 @@ static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
     struct ggml_context * ctx_data = NULL;
 
     struct gguf_init_params params = {
-        /*.no_alloc = */ false,
-        /*.ctx = */ &ctx_data,
+        /*.no_alloc    = */ false,
+        /*.ctx         = */ &ctx_data,
+        /*.no_byteswap = */ false,
    };
 
     // xxh64 init

examples/gguf-split/gguf-split.cpp

Lines changed: 6 additions & 4 deletions
@@ -361,8 +361,9 @@ static void gguf_split(const split_params & split_params) {
     struct ggml_context * ctx_meta = NULL;
 
     struct gguf_init_params params = {
-        /*.no_alloc = */ true,
-        /*.ctx = */ &ctx_meta,
+        /*.no_alloc    = */ true,
+        /*.ctx         = */ &ctx_meta,
+        /*.no_byteswap = */ false,
     };
 
     std::ifstream f_input(split_params.input.c_str(), std::ios::binary);
@@ -426,8 +427,9 @@ static void gguf_merge(const split_params & split_params) {
     struct ggml_context * ctx_meta = NULL;
 
     struct gguf_init_params params = {
-        /*.no_alloc = */ true,
-        /*.ctx = */ &ctx_meta,
+        /*.no_alloc    = */ true,
+        /*.ctx         = */ &ctx_meta,
+        /*.no_byteswap = */ false,
     };
 
     if (i_split > 0) {

examples/gguf/gguf.cpp

Lines changed: 6 additions & 4 deletions
@@ -85,8 +85,9 @@ static bool gguf_ex_write(const std::string & fname) {
 // just read tensor info
 static bool gguf_ex_read_0(const std::string & fname) {
     struct gguf_init_params params = {
-        /*.no_alloc = */ false,
-        /*.ctx = */ NULL,
+        /*.no_alloc    = */ false,
+        /*.ctx         = */ NULL,
+        /*.no_byteswap = */ false,
     };
 
     struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
@@ -151,8 +152,9 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) {
     struct ggml_context * ctx_data = NULL;
 
     struct gguf_init_params params = {
-        /*.no_alloc = */ false,
-        /*.ctx = */ &ctx_data,
+        /*.no_alloc    = */ false,
+        /*.ctx         = */ &ctx_data,
+        /*.no_byteswap = */ false,
     };
 
     struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);

examples/llava/clip.cpp

Lines changed: 3 additions & 2 deletions
@@ -1122,8 +1122,9 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
     struct ggml_context * meta = NULL;
 
     struct gguf_init_params params = {
-        /*.no_alloc = */ true,
-        /*.ctx = */ &meta,
+        /*.no_alloc    = */ true,
+        /*.ctx         = */ &meta,
+        /*.no_byteswap = */ false,
     };
 
     struct gguf_context * ctx = gguf_init_from_file(fname, params);

ggml/include/gguf.h

Lines changed: 2 additions & 0 deletions
@@ -74,6 +74,8 @@ extern "C" {
 
         // if not NULL, create a ggml_context and allocate the tensor data in it
        struct ggml_context ** ctx;
+
+        bool no_byteswap;
     };
 
     GGML_API struct gguf_context * gguf_init_empty(void);
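Because gguf_init_params is a plain C aggregate, every brace-initialized instance now supplies the extra member, which is why the hunks above all add /*.no_byteswap = */ false. A caller that wants to skip byteswapping for a file already stored in big-endian byte order would set it to true instead; a minimal sketch, assuming the gguf.h API as shown above (file name is a placeholder):

#include "gguf.h"

// Sketch: open a GGUF file without byteswapping its tensor data, e.g. a model
// that was byteswapped to big endian in advance.
static struct gguf_context * open_big_endian_gguf(struct ggml_context ** ctx_data) {
    struct gguf_init_params params = {
        /*.no_alloc    = */ false,
        /*.ctx         = */ ctx_data,
        /*.no_byteswap = */ true,   // new field introduced by this commit
    };
    return gguf_init_from_file("model-be.gguf", params);
}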
